mirror of
https://git.FreeBSD.org/ports.git
synced 2024-12-24 04:33:24 +00:00
- Implement efficient BitSet::nextSetBit() to reduce diff against upstream.
  http://clucene.git.sourceforge.net/git/gitweb.cgi?p=clucene/clucene;a=commitdiff;h=17e53d7
- Fix a buffer overflow in CJKAnalyzer. Somehow upstream missed this in the 2.3.3.4 branch.
  http://clucene.svn.sourceforge.net/viewvc/clucene?view=revision&revision=2630
- Fix potential memory leaks in libstemmer. Merged from Snowball changes.
  http://svn.tartarus.org/snowball/trunk/snowball/libstemmer/libstemmer_c.in?r1=409&r2=520&view=patch
- Implement SnowballAnalyzer::reusableTokenStream(). [1] Also, this patch fixes memory leaks found by the submitter.

Submitted by: Kishore Ramareddy (kishore at niksun dot com) (initial version) [1]
Feature safe: yes
This commit is contained in:
parent
f611dec043
commit
fa023c7e1a
Notes:
svn2git
2021-03-31 03:12:20 +00:00
svn path=/head/; revision=315883
@ -3,7 +3,7 @@
|
|||||||
|
|
||||||
PORTNAME= clucene
|
PORTNAME= clucene
|
||||||
PORTVERSION= 2.3.3.4
|
PORTVERSION= 2.3.3.4
|
||||||
PORTREVISION= 1
|
PORTREVISION= 2
|
||||||
CATEGORIES= textproc
|
CATEGORIES= textproc
|
||||||
MASTER_SITES= SF/${PORTNAME}/${PORTNAME}-core-unstable/2.3
|
MASTER_SITES= SF/${PORTNAME}/${PORTNAME}-core-unstable/2.3
|
||||||
DISTNAME= ${PORTNAME}-core-${PORTVERSION}
|
DISTNAME= ${PORTNAME}-core-${PORTVERSION}
|
||||||
|
@ -0,0 +1,11 @@
|
|||||||
|
--- src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h.orig 2011-03-16 20:21:07.000000000 -0400
|
||||||
|
+++ src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h 2013-03-29 18:46:22.000000000 -0400
|
||||||
|
@@ -39,7 +39,7 @@
|
||||||
|
* character buffer, store the characters which are used to compose <br>
|
||||||
|
* the returned Token
|
||||||
|
*/
|
||||||
|
- TCHAR buffer[LUCENE_MAX_WORD_LEN];
|
||||||
|
+ TCHAR buffer[LUCENE_MAX_WORD_LEN+1];
|
||||||
|
|
||||||
|
/**
|
||||||
|
* I/O buffer, used to store the content of the input(one of the <br>
|
@ -0,0 +1,74 @@
|
|||||||
|
--- src/contribs-lib/CLucene/snowball/Snowball.cpp.orig 2011-03-16 20:21:07.000000000 -0400
|
||||||
|
+++ src/contribs-lib/CLucene/snowball/Snowball.cpp 2013-04-01 19:14:15.000000000 -0400
|
||||||
|
@@ -19,16 +19,31 @@
|
||||||
|
|
||||||
|
CL_NS_DEF2(analysis,snowball)
|
||||||
|
|
||||||
|
+ class SnowballAnalyzer::SavedStreams : public TokenStream {
|
||||||
|
+ public:
|
||||||
|
+ StandardTokenizer* tokenStream;
|
||||||
|
+ TokenStream* filteredTokenStream;
|
||||||
|
+
|
||||||
|
+ SavedStreams():tokenStream(NULL), filteredTokenStream(NULL) {}
|
||||||
|
+ void close(){}
|
||||||
|
+ Token* next(Token* token) {return NULL;}
|
||||||
|
+ };
|
||||||
|
+
|
||||||
|
/** Builds the named analyzer with no stop words. */
|
||||||
|
SnowballAnalyzer::SnowballAnalyzer(const TCHAR* language) {
|
||||||
|
this->language = STRDUP_TtoT(language);
|
||||||
|
stopSet = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
- SnowballAnalyzer::~SnowballAnalyzer(){
|
||||||
|
- _CLDELETE_CARRAY(language);
|
||||||
|
- if ( stopSet != NULL )
|
||||||
|
- _CLDELETE(stopSet);
|
||||||
|
+ SnowballAnalyzer::~SnowballAnalyzer() {
|
||||||
|
+ SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
|
||||||
|
+ if (streams != NULL) {
|
||||||
|
+ _CLDELETE(streams->filteredTokenStream);
|
||||||
|
+ _CLDELETE(streams);
|
||||||
|
+ }
|
||||||
|
+ _CLDELETE_CARRAY(language);
|
||||||
|
+ if (stopSet != NULL)
|
||||||
|
+ _CLDELETE(stopSet);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Builds the named analyzer with the given stop words.
|
||||||
|
@@ -62,12 +77,29 @@
|
||||||
|
result = _CLNEW SnowballFilter(result, language, true);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
+
|
||||||
|
+ TokenStream* SnowballAnalyzer::reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
|
||||||
|
+ SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
|
||||||
|
+
|
||||||
|
+ if (streams == NULL) {
|
||||||
|
+ streams = _CLNEW SavedStreams();
|
||||||
|
+ BufferedReader* bufferedReader = reader->__asBufferedReader();
|
||||||
|
+
|
||||||
|
+ if (bufferedReader == NULL)
|
||||||
|
+ streams->tokenStream = _CLNEW StandardTokenizer(_CLNEW FilteredBufferedReader(reader, false), true);
|
||||||
|
+ else
|
||||||
|
+ streams->tokenStream = _CLNEW StandardTokenizer(bufferedReader);
|
||||||
|
+
|
||||||
|
+ streams->filteredTokenStream = _CLNEW StandardFilter(streams->tokenStream, true);
|
||||||
|
+ streams->filteredTokenStream = _CLNEW LowerCaseFilter(streams->filteredTokenStream, true);
|
||||||
|
+ if (stopSet != NULL)
|
||||||
|
+ streams->filteredTokenStream = _CLNEW StopFilter(streams->filteredTokenStream, true, stopSet);
|
||||||
|
+ streams->filteredTokenStream = _CLNEW SnowballFilter(streams->filteredTokenStream, language, true);
|
||||||
|
+ setPreviousTokenStream(streams);
|
||||||
|
+ } else
|
||||||
|
+ streams->tokenStream->reset(reader);
|
||||||
|
+ return streams->filteredTokenStream;
|
||||||
|
+ }
|
||||||
|
|
||||||
|
/** Construct the named stemming filter.
|
||||||
|
*
|
@ -0,0 +1,19 @@
|
|||||||
|
--- src/contribs-lib/CLucene/snowball/SnowballAnalyzer.h.orig 2011-03-16 20:21:07.000000000 -0400
|
||||||
|
+++ src/contribs-lib/CLucene/snowball/SnowballAnalyzer.h 2013-04-01 18:25:10.000000000 -0400
|
||||||
|
@@ -22,6 +22,7 @@
|
||||||
|
class CLUCENE_CONTRIBS_EXPORT SnowballAnalyzer: public Analyzer {
|
||||||
|
TCHAR* language;
|
||||||
|
CLTCSetList* stopSet;
|
||||||
|
+ class SavedStreams;
|
||||||
|
|
||||||
|
public:
|
||||||
|
/** Builds the named analyzer with no stop words. */
|
||||||
|
@@ -37,6 +38,8 @@
|
||||||
|
StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
|
||||||
|
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
|
||||||
|
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader, bool deleteReader);
|
||||||
|
+
|
||||||
|
+ TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
|
||||||
|
};
|
||||||
|
|
||||||
|
CL_NS_END2
|
@ -0,0 +1,24 @@
|
|||||||
|
--- src/contribs-lib/CLucene/snowball/libstemmer/libstemmer.c.orig 2011-03-16 20:21:07.000000000 -0400
|
||||||
|
+++ src/contribs-lib/CLucene/snowball/libstemmer/libstemmer.c 2013-03-29 18:54:39.000000000 -0400
|
||||||
|
@@ -35,9 +35,8 @@
|
||||||
|
{
|
||||||
|
stemmer_encoding enc;
|
||||||
|
struct stemmer_modules * module;
|
||||||
|
- struct sb_stemmer * stemmer =
|
||||||
|
- (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
|
||||||
|
- if (stemmer == NULL) return NULL;
|
||||||
|
+ struct sb_stemmer * stemmer;
|
||||||
|
+
|
||||||
|
enc = sb_getenc(charenc);
|
||||||
|
if (enc == ENC_UNKNOWN) return NULL;
|
||||||
|
|
||||||
|
@@ -46,6 +45,9 @@
|
||||||
|
}
|
||||||
|
if (module->name == NULL) return NULL;
|
||||||
|
|
||||||
|
+ stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
|
||||||
|
+ if (stemmer == NULL) return NULL;
|
||||||
|
+
|
||||||
|
stemmer->create = module->create;
|
||||||
|
stemmer->close = module->close;
|
||||||
|
stemmer->stem = module->stem;
|
@ -0,0 +1,67 @@
|
|||||||
|
--- src/core/CLucene/util/BitSet.cpp.orig 2011-03-16 20:21:07.000000000 -0400
|
||||||
|
+++ src/core/CLucene/util/BitSet.cpp 2013-03-29 17:57:05.000000000 -0400
|
||||||
|
@@ -32,6 +32,25 @@
|
||||||
|
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
||||||
|
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
|
||||||
|
|
||||||
|
+const uint8_t BitSet::BYTE_OFFSETS[256] = {
|
||||||
|
+ 8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||||
|
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||||
|
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||||
|
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||||
|
+ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||||
|
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||||
|
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||||
|
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||||
|
+ 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||||
|
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||||
|
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||||
|
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||||
|
+ 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||||
|
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||||
|
+ 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||||
|
+ 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
|
||||||
|
+
|
||||||
|
+
|
||||||
|
BitSet::BitSet( const BitSet& copy ) :
|
||||||
|
_size( copy._size ),
|
||||||
|
_count(-1)
|
||||||
|
@@ -180,19 +199,32 @@
|
||||||
|
return factor * (4 + (8+40)*count()) < size();
|
||||||
|
}
|
||||||
|
|
||||||
|
- int32_t BitSet::nextSetBit(int32_t fromIndex) const {
|
||||||
|
+ int32_t BitSet::nextSetBit(int32_t fromIndex) const
|
||||||
|
+ {
|
||||||
|
if (fromIndex < 0)
|
||||||
|
_CLTHROWT(CL_ERR_IndexOutOfBounds, _T("fromIndex < 0"));
|
||||||
|
|
||||||
|
if (fromIndex >= _size)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
- while (true) {
|
||||||
|
- if ((bits[fromIndex >> 3] & (1 << (fromIndex & 7))) != 0)
|
||||||
|
- return fromIndex;
|
||||||
|
- if (++fromIndex == _size)
|
||||||
|
- return -1;
|
||||||
|
+ int _max = ( _size+7 ) >> 3;
|
||||||
|
+
|
||||||
|
+ unsigned int i = (int)( fromIndex>>3 );
|
||||||
|
+ unsigned int subIndex = fromIndex & 0x7; // index within the byte
|
||||||
|
+ uint8_t byte = bits[i] >> subIndex; // skip all the bits to the right of index
|
||||||
|
+
|
||||||
|
+ if ( byte != 0 )
|
||||||
|
+ {
|
||||||
|
+ return ( ( i<<3 ) + subIndex + BYTE_OFFSETS[ byte ] );
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ while( ++i < _max )
|
||||||
|
+ {
|
||||||
|
+ byte = bits[i];
|
||||||
|
+ if ( byte != 0 )
|
||||||
|
+ return ( ( i<<3 ) + BYTE_OFFSETS[ byte ] );
|
||||||
|
}
|
||||||
|
+ return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
CL_NS_END
|
@ -0,0 +1,10 @@
|
|||||||
|
--- src/core/CLucene/util/BitSet.h.orig 2011-03-16 20:21:07.000000000 -0400
|
||||||
|
+++ src/core/CLucene/util/BitSet.h 2013-03-29 17:57:05.000000000 -0400
|
||||||
|
@@ -39,6 +39,7 @@
|
||||||
|
/** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
|
||||||
|
bool isSparse();
|
||||||
|
static const uint8_t BYTE_COUNTS[256];
|
||||||
|
+ static const uint8_t BYTE_OFFSETS[256];
|
||||||
|
protected:
|
||||||
|
BitSet( const BitSet& copy );
|
||||||
|
|
Loading…
Reference in New Issue
Block a user