1
0
mirror of https://git.FreeBSD.org/ports.git synced 2024-12-24 04:33:24 +00:00

- Implement an efficient BitSet::nextSetBit() to reduce the diff against upstream.

http://clucene.git.sourceforge.net/git/gitweb.cgi?p=clucene/clucene;a=commitdiff;h=17e53d7

- Fix a buffer overflow in CJKAnalyzer.  Somehow upstream missed this fix
in the 2.3.3.4 branch.

http://clucene.svn.sourceforge.net/viewvc/clucene?view=revision&revision=2630

- Fix potential memory leaks in libstemmer.  Merged from Snowball changes.

http://svn.tartarus.org/snowball/trunk/snowball/libstemmer/libstemmer_c.in?r1=409&r2=520&view=patch

- Implement SnowballAnalyzer::reusableTokenStream(). [1]  Also, this patch
fixes memory leaks found by the submitter.

Submitted by:	Kishore Ramareddy (kishore at niksun dot com)
		(initial version) [1]
Feature safe:	yes
This commit is contained in:
Jung-uk Kim 2013-04-16 18:37:03 +00:00
parent f611dec043
commit fa023c7e1a
Notes: svn2git 2021-03-31 03:12:20 +00:00
svn path=/head/; revision=315883
7 changed files with 206 additions and 1 deletions

View File

@ -3,7 +3,7 @@
PORTNAME= clucene PORTNAME= clucene
PORTVERSION= 2.3.3.4 PORTVERSION= 2.3.3.4
PORTREVISION= 1 PORTREVISION= 2
CATEGORIES= textproc CATEGORIES= textproc
MASTER_SITES= SF/${PORTNAME}/${PORTNAME}-core-unstable/2.3 MASTER_SITES= SF/${PORTNAME}/${PORTNAME}-core-unstable/2.3
DISTNAME= ${PORTNAME}-core-${PORTVERSION} DISTNAME= ${PORTNAME}-core-${PORTVERSION}

View File

@ -0,0 +1,11 @@
--- src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h.orig 2011-03-16 20:21:07.000000000 -0400
+++ src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h 2013-03-29 18:46:22.000000000 -0400
@@ -39,7 +39,7 @@
* character buffer, store the characters which are used to compose <br>
* the returned Token
*/
- TCHAR buffer[LUCENE_MAX_WORD_LEN];
+ TCHAR buffer[LUCENE_MAX_WORD_LEN+1];
/**
* I/O buffer, used to store the content of the input(one of the <br>

View File

@ -0,0 +1,74 @@
--- src/contribs-lib/CLucene/snowball/Snowball.cpp.orig 2011-03-16 20:21:07.000000000 -0400
+++ src/contribs-lib/CLucene/snowball/Snowball.cpp 2013-04-01 19:14:15.000000000 -0400
@@ -19,16 +19,31 @@
CL_NS_DEF2(analysis,snowball)
+ class SnowballAnalyzer::SavedStreams : public TokenStream {
+ public:
+ StandardTokenizer* tokenStream;
+ TokenStream* filteredTokenStream;
+
+ SavedStreams():tokenStream(NULL), filteredTokenStream(NULL) {}
+ void close(){}
+ Token* next(Token* token) {return NULL;}
+ };
+
/** Builds the named analyzer with no stop words. */
SnowballAnalyzer::SnowballAnalyzer(const TCHAR* language) {
this->language = STRDUP_TtoT(language);
stopSet = NULL;
}
- SnowballAnalyzer::~SnowballAnalyzer(){
- _CLDELETE_CARRAY(language);
- if ( stopSet != NULL )
- _CLDELETE(stopSet);
+ SnowballAnalyzer::~SnowballAnalyzer() {
+ SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
+ if (streams != NULL) {
+ _CLDELETE(streams->filteredTokenStream);
+ _CLDELETE(streams);
+ }
+ _CLDELETE_CARRAY(language);
+ if (stopSet != NULL)
+ _CLDELETE(stopSet);
}
/** Builds the named analyzer with the given stop words.
@@ -62,12 +77,29 @@
result = _CLNEW SnowballFilter(result, language, true);
return result;
}
-
-
-
-
-
-
+
+ TokenStream* SnowballAnalyzer::reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
+ SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
+
+ if (streams == NULL) {
+ streams = _CLNEW SavedStreams();
+ BufferedReader* bufferedReader = reader->__asBufferedReader();
+
+ if (bufferedReader == NULL)
+ streams->tokenStream = _CLNEW StandardTokenizer(_CLNEW FilteredBufferedReader(reader, false), true);
+ else
+ streams->tokenStream = _CLNEW StandardTokenizer(bufferedReader);
+
+ streams->filteredTokenStream = _CLNEW StandardFilter(streams->tokenStream, true);
+ streams->filteredTokenStream = _CLNEW LowerCaseFilter(streams->filteredTokenStream, true);
+ if (stopSet != NULL)
+ streams->filteredTokenStream = _CLNEW StopFilter(streams->filteredTokenStream, true, stopSet);
+ streams->filteredTokenStream = _CLNEW SnowballFilter(streams->filteredTokenStream, language, true);
+ setPreviousTokenStream(streams);
+ } else
+ streams->tokenStream->reset(reader);
+ return streams->filteredTokenStream;
+ }
/** Construct the named stemming filter.
*

View File

@ -0,0 +1,19 @@
--- src/contribs-lib/CLucene/snowball/SnowballAnalyzer.h.orig 2011-03-16 20:21:07.000000000 -0400
+++ src/contribs-lib/CLucene/snowball/SnowballAnalyzer.h 2013-04-01 18:25:10.000000000 -0400
@@ -22,6 +22,7 @@
class CLUCENE_CONTRIBS_EXPORT SnowballAnalyzer: public Analyzer {
TCHAR* language;
CLTCSetList* stopSet;
+ class SavedStreams;
public:
/** Builds the named analyzer with no stop words. */
@@ -37,6 +38,8 @@
StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader, bool deleteReader);
+
+ TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
};
CL_NS_END2

View File

@ -0,0 +1,24 @@
--- src/contribs-lib/CLucene/snowball/libstemmer/libstemmer.c.orig 2011-03-16 20:21:07.000000000 -0400
+++ src/contribs-lib/CLucene/snowball/libstemmer/libstemmer.c 2013-03-29 18:54:39.000000000 -0400
@@ -35,9 +35,8 @@
{
stemmer_encoding enc;
struct stemmer_modules * module;
- struct sb_stemmer * stemmer =
- (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
- if (stemmer == NULL) return NULL;
+ struct sb_stemmer * stemmer;
+
enc = sb_getenc(charenc);
if (enc == ENC_UNKNOWN) return NULL;
@@ -46,6 +45,9 @@
}
if (module->name == NULL) return NULL;
+ stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
+ if (stemmer == NULL) return NULL;
+
stemmer->create = module->create;
stemmer->close = module->close;
stemmer->stem = module->stem;

View File

@ -0,0 +1,67 @@
--- src/core/CLucene/util/BitSet.cpp.orig 2011-03-16 20:21:07.000000000 -0400
+++ src/core/CLucene/util/BitSet.cpp 2013-03-29 17:57:05.000000000 -0400
@@ -32,6 +32,25 @@
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
+const uint8_t BitSet::BYTE_OFFSETS[256] = {
+ 8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
+
+
BitSet::BitSet( const BitSet& copy ) :
_size( copy._size ),
_count(-1)
@@ -180,19 +199,32 @@
return factor * (4 + (8+40)*count()) < size();
}
- int32_t BitSet::nextSetBit(int32_t fromIndex) const {
+ int32_t BitSet::nextSetBit(int32_t fromIndex) const
+ {
if (fromIndex < 0)
_CLTHROWT(CL_ERR_IndexOutOfBounds, _T("fromIndex < 0"));
if (fromIndex >= _size)
return -1;
- while (true) {
- if ((bits[fromIndex >> 3] & (1 << (fromIndex & 7))) != 0)
- return fromIndex;
- if (++fromIndex == _size)
- return -1;
+ int _max = ( _size+7 ) >> 3;
+
+ unsigned int i = (int)( fromIndex>>3 );
+ unsigned int subIndex = fromIndex & 0x7; // index within the byte
+ uint8_t byte = bits[i] >> subIndex; // skip all the bits to the right of index
+
+ if ( byte != 0 )
+ {
+ return ( ( i<<3 ) + subIndex + BYTE_OFFSETS[ byte ] );
+ }
+
+ while( ++i < _max )
+ {
+ byte = bits[i];
+ if ( byte != 0 )
+ return ( ( i<<3 ) + BYTE_OFFSETS[ byte ] );
}
+ return -1;
}
CL_NS_END

View File

@ -0,0 +1,10 @@
--- src/core/CLucene/util/BitSet.h.orig 2011-03-16 20:21:07.000000000 -0400
+++ src/core/CLucene/util/BitSet.h 2013-03-29 17:57:05.000000000 -0400
@@ -39,6 +39,7 @@
/** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
bool isSparse();
static const uint8_t BYTE_COUNTS[256];
+ static const uint8_t BYTE_OFFSETS[256];
protected:
BitSet( const BitSet& copy );