Mirror of https://git.FreeBSD.org/ports.git, synced 2024-12-19 03:52:17 +00:00
- Implement efficient BitSet::nextSetBit() to reduce diff against upstream.
  http://clucene.git.sourceforge.net/git/gitweb.cgi?p=clucene/clucene;a=commitdiff;h=17e53d7
- Fix a buffer overflow in CJKAnalyzer. Somehow upstream missed this in the
  2.3.3.4 branch.
  http://clucene.svn.sourceforge.net/viewvc/clucene?view=revision&revision=2630
- Fix potential memory leaks in libstemmer. Merged from Snowball changes.
  http://svn.tartarus.org/snowball/trunk/snowball/libstemmer/libstemmer_c.in?r1=409&r2=520&view=patch
- Implement SnowballAnalyzer::reusableTokenStream(). [1] Also, this patch fixes
  memory leaks found by the submitter.

Submitted by: Kishore Ramareddy (kishore at niksun dot com) (initial version) [1]
Feature safe: yes
This commit is contained in:
parent f611dec043
commit fa023c7e1a

Notes (svn2git, 2021-03-31 03:12:20 +00:00):
svn path=/head/; revision=315883
@ -3,7 +3,7 @@
 PORTNAME= clucene
 PORTVERSION= 2.3.3.4
-PORTREVISION= 1
+PORTREVISION= 2
 CATEGORIES= textproc
 MASTER_SITES= SF/${PORTNAME}/${PORTNAME}-core-unstable/2.3
 DISTNAME= ${PORTNAME}-core-${PORTVERSION}

@ -0,0 +1,11 @@
--- src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h.orig 2011-03-16 20:21:07.000000000 -0400
+++ src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h 2013-03-29 18:46:22.000000000 -0400
@@ -39,7 +39,7 @@
* character buffer, store the characters which are used to compose <br>
* the returned Token
*/
- TCHAR buffer[LUCENE_MAX_WORD_LEN];
+ TCHAR buffer[LUCENE_MAX_WORD_LEN+1];

/**
* I/O buffer, used to store the content of the input(one of the <br>

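The extra element above matters whenever code fills the buffer to its limit and then writes one more byte, for example a terminating NUL. A minimal standalone illustration of that off-by-one (illustrative only, not CLucene code; MAX_WORD_LEN and the copy loop are made up for the example):

    #include <iostream>

    const int MAX_WORD_LEN = 4;                 // stand-in for LUCENE_MAX_WORD_LEN

    int main() {
        // The buffer can hold MAX_WORD_LEN token characters, but writing a
        // terminating '\0' after a full-length token needs one more slot;
        // that is what the +1 in the patch provides.
        char buffer[MAX_WORD_LEN + 1];
        const char* input = "test";             // exactly MAX_WORD_LEN characters long
        int len = 0;
        while (input[len] != '\0' && len < MAX_WORD_LEN) {
            buffer[len] = input[len];
            ++len;
        }
        buffer[len] = '\0';                     // index MAX_WORD_LEN: out of bounds without +1
        std::cout << buffer << '\n';
        return 0;
    }
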
@ -0,0 +1,74 @@
--- src/contribs-lib/CLucene/snowball/Snowball.cpp.orig 2011-03-16 20:21:07.000000000 -0400
+++ src/contribs-lib/CLucene/snowball/Snowball.cpp 2013-04-01 19:14:15.000000000 -0400
@@ -19,16 +19,31 @@

CL_NS_DEF2(analysis,snowball)

+ class SnowballAnalyzer::SavedStreams : public TokenStream {
+ public:
+ StandardTokenizer* tokenStream;
+ TokenStream* filteredTokenStream;
+
+ SavedStreams():tokenStream(NULL), filteredTokenStream(NULL) {}
+ void close(){}
+ Token* next(Token* token) {return NULL;}
+ };
+
/** Builds the named analyzer with no stop words. */
SnowballAnalyzer::SnowballAnalyzer(const TCHAR* language) {
this->language = STRDUP_TtoT(language);
stopSet = NULL;
}

- SnowballAnalyzer::~SnowballAnalyzer(){
- _CLDELETE_CARRAY(language);
- if ( stopSet != NULL )
- _CLDELETE(stopSet);
+ SnowballAnalyzer::~SnowballAnalyzer() {
+ SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
+ if (streams != NULL) {
+ _CLDELETE(streams->filteredTokenStream);
+ _CLDELETE(streams);
+ }
+ _CLDELETE_CARRAY(language);
+ if (stopSet != NULL)
+ _CLDELETE(stopSet);
}

/** Builds the named analyzer with the given stop words.
@@ -62,12 +77,29 @@
result = _CLNEW SnowballFilter(result, language, true);
return result;
}
-
-
-
-
-
-
+
+ TokenStream* SnowballAnalyzer::reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
+ SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
+
+ if (streams == NULL) {
+ streams = _CLNEW SavedStreams();
+ BufferedReader* bufferedReader = reader->__asBufferedReader();
+
+ if (bufferedReader == NULL)
+ streams->tokenStream = _CLNEW StandardTokenizer(_CLNEW FilteredBufferedReader(reader, false), true);
+ else
+ streams->tokenStream = _CLNEW StandardTokenizer(bufferedReader);
+
+ streams->filteredTokenStream = _CLNEW StandardFilter(streams->tokenStream, true);
+ streams->filteredTokenStream = _CLNEW LowerCaseFilter(streams->filteredTokenStream, true);
+ if (stopSet != NULL)
+ streams->filteredTokenStream = _CLNEW StopFilter(streams->filteredTokenStream, true, stopSet);
+ streams->filteredTokenStream = _CLNEW SnowballFilter(streams->filteredTokenStream, language, true);
+ setPreviousTokenStream(streams);
+ } else
+ streams->tokenStream->reset(reader);
+ return streams->filteredTokenStream;
+ }

/** Construct the named stemming filter.
*

@ -0,0 +1,19 @@
--- src/contribs-lib/CLucene/snowball/SnowballAnalyzer.h.orig 2011-03-16 20:21:07.000000000 -0400
+++ src/contribs-lib/CLucene/snowball/SnowballAnalyzer.h 2013-04-01 18:25:10.000000000 -0400
@@ -22,6 +22,7 @@
class CLUCENE_CONTRIBS_EXPORT SnowballAnalyzer: public Analyzer {
TCHAR* language;
CLTCSetList* stopSet;
+ class SavedStreams;

public:
/** Builds the named analyzer with no stop words. */
@@ -37,6 +38,8 @@
StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader, bool deleteReader);
+
+ TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
};

CL_NS_END2

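The reusableTokenStream() added above follows the usual Lucene "saved streams" pattern: build the tokenizer/filter chain once, stash it on the analyzer via set/getPreviousTokenStream(), and on later calls only re-point the cached tokenizer at the new reader. A rough standalone sketch of that pattern, using hypothetical Tokenizer/LowerCaseFilter/Analyzer stand-ins rather than CLucene's classes, and unique_ptr in place of the _CLNEW/_CLDELETE ownership rules:

    #include <cctype>
    #include <iostream>
    #include <memory>
    #include <sstream>
    #include <string>

    struct Tokenizer {
        std::istringstream in;
        void reset(const std::string& text) { in.clear(); in.str(text); }
        bool next(std::string& tok) { return static_cast<bool>(in >> tok); }
    };

    struct LowerCaseFilter {
        Tokenizer* src;                         // owned by the analyzer's SavedStreams
        explicit LowerCaseFilter(Tokenizer* s) : src(s) {}
        bool next(std::string& tok) {
            if (!src->next(tok)) return false;
            for (char& c : tok)
                c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
            return true;
        }
    };

    class Analyzer {
        struct SavedStreams {                   // analogue of SnowballAnalyzer::SavedStreams
            Tokenizer tokenizer;
            std::unique_ptr<LowerCaseFilter> filtered;
        };
        std::unique_ptr<SavedStreams> saved;
    public:
        // Analogue of reusableTokenStream(): build the chain on first use, then
        // only re-point the cached tokenizer at each new input.
        LowerCaseFilter* reusableTokenStream(const std::string& text) {
            if (!saved) {
                saved = std::make_unique<SavedStreams>();
                saved->filtered = std::make_unique<LowerCaseFilter>(&saved->tokenizer);
            }
            saved->tokenizer.reset(text);
            return saved->filtered.get();
        }
    };

    int main() {
        Analyzer analyzer;
        const char* docs[] = {"Reuse The SAME Pipeline", "Across Documents"};
        for (const char* doc : docs) {
            LowerCaseFilter* ts = analyzer.reusableTokenStream(doc);
            std::string tok;
            while (ts->next(tok)) std::cout << tok << ' ';
            std::cout << '\n';
        }
        return 0;
    }
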
@ -0,0 +1,24 @@
--- src/contribs-lib/CLucene/snowball/libstemmer/libstemmer.c.orig 2011-03-16 20:21:07.000000000 -0400
+++ src/contribs-lib/CLucene/snowball/libstemmer/libstemmer.c 2013-03-29 18:54:39.000000000 -0400
@@ -35,9 +35,8 @@
{
stemmer_encoding enc;
struct stemmer_modules * module;
- struct sb_stemmer * stemmer =
- (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
- if (stemmer == NULL) return NULL;
+ struct sb_stemmer * stemmer;
+
enc = sb_getenc(charenc);
if (enc == ENC_UNKNOWN) return NULL;

@@ -46,6 +45,9 @@
}
if (module->name == NULL) return NULL;

+ stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
+ if (stemmer == NULL) return NULL;
+
stemmer->create = module->create;
stemmer->close = module->close;
stemmer->stem = module->stem;

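The libstemmer change above fixes the leak by deferring the malloc() until every early "return NULL" check has passed, so no allocation exists yet when the function bails out. A tiny sketch of that shape (generic names, not the actual libstemmer lookup code):

    #include <cstdlib>
    #include <cstring>

    struct stem_module { const char* name; };
    static const stem_module modules[] = { {"english"}, {"german"}, {nullptr} };

    struct sb_stemmer_sketch { const stem_module* impl; };

    // Validate first, allocate last: every early return happens before any
    // memory is owned, so nothing can leak (the shape the patch above restores).
    sb_stemmer_sketch* stemmer_new(const char* name) {
        const stem_module* m = modules;
        while (m->name != nullptr && std::strcmp(m->name, name) != 0)
            ++m;
        if (m->name == nullptr) return nullptr;          // unknown language, nothing allocated yet

        sb_stemmer_sketch* s =
            static_cast<sb_stemmer_sketch*>(std::malloc(sizeof(sb_stemmer_sketch)));
        if (s == nullptr) return nullptr;
        s->impl = m;
        return s;
    }

    int main() {
        sb_stemmer_sketch* s = stemmer_new("klingon");   // early return: no allocation leaked
        if (s == nullptr) s = stemmer_new("english");
        std::free(s);
        return 0;
    }
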
@ -0,0 +1,67 @@
--- src/core/CLucene/util/BitSet.cpp.orig 2011-03-16 20:21:07.000000000 -0400
+++ src/core/CLucene/util/BitSet.cpp 2013-03-29 17:57:05.000000000 -0400
@@ -32,6 +32,25 @@
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};

+const uint8_t BitSet::BYTE_OFFSETS[256] = {
+ 8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
+
+
BitSet::BitSet( const BitSet& copy ) :
_size( copy._size ),
_count(-1)
@@ -180,19 +199,32 @@
return factor * (4 + (8+40)*count()) < size();
}

- int32_t BitSet::nextSetBit(int32_t fromIndex) const {
+ int32_t BitSet::nextSetBit(int32_t fromIndex) const
+ {
if (fromIndex < 0)
_CLTHROWT(CL_ERR_IndexOutOfBounds, _T("fromIndex < 0"));

if (fromIndex >= _size)
return -1;

- while (true) {
- if ((bits[fromIndex >> 3] & (1 << (fromIndex & 7))) != 0)
- return fromIndex;
- if (++fromIndex == _size)
- return -1;
+ int _max = ( _size+7 ) >> 3;
+
+ unsigned int i = (int)( fromIndex>>3 );
+ unsigned int subIndex = fromIndex & 0x7; // index within the byte
+ uint8_t byte = bits[i] >> subIndex; // skip all the bits to the right of index
+
+ if ( byte != 0 )
+ {
+ return ( ( i<<3 ) + subIndex + BYTE_OFFSETS[ byte ] );
+ }
+
+ while( ++i < _max )
+ {
+ byte = bits[i];
+ if ( byte != 0 )
+ return ( ( i<<3 ) + BYTE_OFFSETS[ byte ] );
}
+ return -1;
}

CL_NS_END

@ -0,0 +1,10 @@
--- src/core/CLucene/util/BitSet.h.orig 2011-03-16 20:21:07.000000000 -0400
+++ src/core/CLucene/util/BitSet.h 2013-03-29 17:57:05.000000000 -0400
@@ -39,6 +39,7 @@
/** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
bool isSparse();
static const uint8_t BYTE_COUNTS[256];
+ static const uint8_t BYTE_OFFSETS[256];
protected:
BitSet( const BitSet& copy );

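Taken together, the two BitSet hunks replace the bit-at-a-time loop with a byte-at-a-time scan: BYTE_OFFSETS[b] is the index of the lowest set bit in byte b (8 when b is zero), so whole zero bytes are skipped in a single step. A self-contained sketch of the same technique, cross-checked against a naive scan (plain std::vector storage and a generated table, not CLucene's BitSet class):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Same idea as BYTE_OFFSETS in the patch: lowest set bit per byte value, 8 for 0.
    static uint8_t byteOffsets[256];

    static void initByteOffsets() {
        byteOffsets[0] = 8;
        for (int b = 1; b < 256; ++b) {
            uint8_t off = 0;
            while (((b >> off) & 1) == 0) ++off;
            byteOffsets[b] = off;
        }
    }

    // Byte-at-a-time scan mirroring the patched BitSet::nextSetBit().
    static int nextSetBit(const std::vector<uint8_t>& bits, int size, int fromIndex) {
        if (fromIndex < 0 || fromIndex >= size) return -1;
        size_t i = fromIndex >> 3;
        int subIndex = fromIndex & 7;               // bit position within the first byte
        uint8_t byte = bits[i] >> subIndex;         // drop the bits below fromIndex
        if (byte != 0) return (int)(i << 3) + subIndex + byteOffsets[byte];
        size_t max = ((size_t)size + 7) >> 3;
        while (++i < max) {
            byte = bits[i];
            if (byte != 0) return (int)(i << 3) + byteOffsets[byte];
        }
        return -1;
    }

    // Naive bit-at-a-time scan, used only to cross-check the fast version.
    static int nextSetBitNaive(const std::vector<uint8_t>& bits, int size, int fromIndex) {
        for (int j = fromIndex < 0 ? 0 : fromIndex; j < size; ++j)
            if (bits[j >> 3] & (1 << (j & 7))) return j;
        return -1;
    }

    int main() {
        initByteOffsets();
        const int size = 40;
        std::vector<uint8_t> bits((size + 7) / 8, 0);
        const int setBits[] = {3, 9, 25, 38};
        for (int j : setBits)
            bits[j >> 3] |= (uint8_t)(1 << (j & 7));
        for (int from = 0; from <= size; ++from)
            if (nextSetBit(bits, size, from) != nextSetBitNaive(bits, size, from))
                std::cout << "mismatch at " << from << '\n';
        std::cout << "next set bit from 10: " << nextSetBit(bits, size, 10) << '\n';  // 25
        return 0;
    }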