- Implement efficient BitSet::nextSetBit() to reduce diff against upstream.
  http://clucene.git.sourceforge.net/git/gitweb.cgi?p=clucene/clucene;a=commitdiff;h=17e53d7
- Fix a buffer overflow in CJKAnalyzer.  Somehow the upstream missed this in
  the 2.3.3.4 branch.
  http://clucene.svn.sourceforge.net/viewvc/clucene?view=revision&revision=2630
- Fix potential memory leaks in libstemmer.  Merged from Snowball changes.
  http://svn.tartarus.org/snowball/trunk/snowball/libstemmer/libstemmer_c.in?r1=409&r2=520&view=patch
- Implement SnowballAnalyzer::reusableTokenStream(). [1]

Also, this patch fixes memory leaks found by the submitter.

Submitted by: Kishore Ramareddy (kishore at niksun dot com) (initial version) [1]
Feature safe: yes
svn path=/head/; revision=315883
2024-12-19 03:52:17 +00:00 · 2013-04-16 18:37:03 +00:00 · 2013-04-16 18:37:03 +00:00 · fa023c7e1a · 2021-03-31 03:12:20 +00:00
commit fa023c7e1a
parent f611dec043
7 changed files with 206 additions and 1 deletions
--- a/textproc/clucene/Makefile
+++ b/textproc/clucene/Makefile
@@ -3,7 +3,7 @@

 PORTNAME=	clucene
 PORTVERSION=	2.3.3.4
-PORTREVISION=	1
+PORTREVISION=	2
 CATEGORIES=	textproc
 MASTER_SITES=	SF/${PORTNAME}/${PORTNAME}-core-unstable/2.3
 DISTNAME=	${PORTNAME}-core-${PORTVERSION}
--- a/textproc/clucene/files/patch-srccontribs-libCLuceneanalysiscjk__CJKAnalyzer.h
+++ b/textproc/clucene/files/patch-srccontribs-libCLuceneanalysiscjk__CJKAnalyzer.h
@@ -0,0 +1,11 @@
+--- src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h.orig	2011-03-16 20:21:07.000000000 -0400
+++ src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h	2013-03-29 18:46:22.000000000 -0400
+@@ -39,7 +39,7 @@
+      * character buffer, store the characters which are used to compose <br>
+      * the returned Token
+      */
+-    TCHAR buffer[LUCENE_MAX_WORD_LEN];
++    TCHAR buffer[LUCENE_MAX_WORD_LEN+1];
+ 
+     /**
+      * I/O buffer, used to store the content of the input(one of the <br>
--- a/textproc/clucene/files/patch-srccontribs-libCLucenesnowballSnowball.cpp
+++ b/textproc/clucene/files/patch-srccontribs-libCLucenesnowballSnowball.cpp
@@ -0,0 +1,74 @@
+--- src/contribs-lib/CLucene/snowball/Snowball.cpp.orig	2011-03-16 20:21:07.000000000 -0400
+++ src/contribs-lib/CLucene/snowball/Snowball.cpp	2013-04-01 19:14:15.000000000 -0400
+@@ -19,16 +19,31 @@
+ 
+ CL_NS_DEF2(analysis,snowball)
+ 
++  class SnowballAnalyzer::SavedStreams : public TokenStream {
++  public:
++    StandardTokenizer* tokenStream;
++    TokenStream* filteredTokenStream;
++
++    SavedStreams():tokenStream(NULL), filteredTokenStream(NULL) {}
++    void close(){}
++    Token* next(Token* token) {return NULL;}
++  };
++  
+   /** Builds the named analyzer with no stop words. */
+   SnowballAnalyzer::SnowballAnalyzer(const TCHAR* language) {
+     this->language = STRDUP_TtoT(language);
+ 	stopSet = NULL;
+   }
+ 
+-  SnowballAnalyzer::~SnowballAnalyzer(){
+-	  _CLDELETE_CARRAY(language);
+-	  if ( stopSet != NULL )
+-		  _CLDELETE(stopSet);
++  SnowballAnalyzer::~SnowballAnalyzer() {
++    SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
++    if (streams != NULL) {
++      _CLDELETE(streams->filteredTokenStream);
++      _CLDELETE(streams);
++    }
++    _CLDELETE_CARRAY(language);
++    if (stopSet != NULL)
++      _CLDELETE(stopSet);
+   }
+ 
+   /** Builds the named analyzer with the given stop words.
+@@ -62,12 +77,29 @@
+     result = _CLNEW SnowballFilter(result, language, true);
+     return result;
+   }
+-  
+-  
+-  
+-  
+-  
+-  
++
++  TokenStream* SnowballAnalyzer::reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
++    SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
++
++    if (streams == NULL) {
++      streams = _CLNEW SavedStreams();
++      BufferedReader* bufferedReader = reader->__asBufferedReader();
++
++      if (bufferedReader == NULL)
++        streams->tokenStream = _CLNEW StandardTokenizer(_CLNEW FilteredBufferedReader(reader, false), true);
++      else
++        streams->tokenStream = _CLNEW StandardTokenizer(bufferedReader);
++
++      streams->filteredTokenStream = _CLNEW StandardFilter(streams->tokenStream, true);
++      streams->filteredTokenStream = _CLNEW LowerCaseFilter(streams->filteredTokenStream, true);
++      if (stopSet != NULL)
++        streams->filteredTokenStream = _CLNEW StopFilter(streams->filteredTokenStream, true, stopSet);
++      streams->filteredTokenStream = _CLNEW SnowballFilter(streams->filteredTokenStream, language, true);
++      setPreviousTokenStream(streams);
++    } else
++      streams->tokenStream->reset(reader);
++    return streams->filteredTokenStream;
++  }
+   
+     /** Construct the named stemming filter.
+    *
--- a/textproc/clucene/files/patch-srccontribs-libCLucenesnowballSnowballAnalyzer.h
+++ b/textproc/clucene/files/patch-srccontribs-libCLucenesnowballSnowballAnalyzer.h
@@ -0,0 +1,19 @@
+--- src/contribs-lib/CLucene/snowball/SnowballAnalyzer.h.orig	2011-03-16 20:21:07.000000000 -0400
+++ src/contribs-lib/CLucene/snowball/SnowballAnalyzer.h	2013-04-01 18:25:10.000000000 -0400
+@@ -22,6 +22,7 @@
+ class CLUCENE_CONTRIBS_EXPORT SnowballAnalyzer: public Analyzer {
+   TCHAR* language;
+   CLTCSetList* stopSet;
++  class SavedStreams;
+ 
+ public:
+   /** Builds the named analyzer with no stop words. */
+@@ -37,6 +38,8 @@
+       StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
+   TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
+   TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader, bool deleteReader);
++
++  TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
+ };
+ 
+ CL_NS_END2
--- a/textproc/clucene/files/patch-srccontribs-libCLucenesnowballlibstemmer__libstemmer.c
+++ b/textproc/clucene/files/patch-srccontribs-libCLucenesnowballlibstemmer__libstemmer.c
@@ -0,0 +1,24 @@
+--- src/contribs-lib/CLucene/snowball/libstemmer/libstemmer.c.orig	2011-03-16 20:21:07.000000000 -0400
+++ src/contribs-lib/CLucene/snowball/libstemmer/libstemmer.c	2013-03-29 18:54:39.000000000 -0400
+@@ -35,9 +35,8 @@
+ {
+     stemmer_encoding enc;
+     struct stemmer_modules * module;
+-    struct sb_stemmer * stemmer =
+-	    (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
+-    if (stemmer == NULL) return NULL;
++    struct sb_stemmer * stemmer;
++
+     enc = sb_getenc(charenc);
+     if (enc == ENC_UNKNOWN) return NULL;
+ 
+@@ -46,6 +45,9 @@
+     }
+     if (module->name == NULL) return NULL;
+     
++    stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
++    if (stemmer == NULL) return NULL;
++
+     stemmer->create = module->create;
+     stemmer->close = module->close;
+     stemmer->stem = module->stem;
--- a/textproc/clucene/files/patch-srccoreCLuceneutilBitSet.cpp
+++ b/textproc/clucene/files/patch-srccoreCLuceneutilBitSet.cpp
@@ -0,0 +1,67 @@
+--- src/core/CLucene/util/BitSet.cpp.orig	2011-03-16 20:21:07.000000000 -0400
+++ src/core/CLucene/util/BitSet.cpp	2013-03-29 17:57:05.000000000 -0400
+@@ -32,6 +32,25 @@
+     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
+ 
++const uint8_t BitSet::BYTE_OFFSETS[256] = {
++    8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++    5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++    6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
++
++
+ BitSet::BitSet( const BitSet& copy ) :
+ 	_size( copy._size ),
+ 	_count(-1)
+@@ -180,19 +199,32 @@
+     return                            factor * (4 + (8+40)*count()) < size();
+   }
+ 
+-  int32_t BitSet::nextSetBit(int32_t fromIndex) const {
++  int32_t BitSet::nextSetBit(int32_t fromIndex) const 
++  {
+       if (fromIndex < 0)
+           _CLTHROWT(CL_ERR_IndexOutOfBounds, _T("fromIndex < 0"));
+ 
+       if (fromIndex >= _size)
+           return -1;
+ 
+-      while (true) {
+-          if ((bits[fromIndex >> 3] & (1 << (fromIndex & 7))) != 0)
+-              return fromIndex;
+-          if (++fromIndex == _size)
+-              return -1;
++      int _max = ( _size+7 ) >> 3;
++
++      unsigned int i = (int)( fromIndex>>3 );
++      unsigned int subIndex = fromIndex & 0x7; // index within the byte
++      uint8_t byte = bits[i] >> subIndex;  // skip all the bits to the right of index
++
++      if ( byte != 0 ) 
++      {
++          return ( ( i<<3 ) + subIndex + BYTE_OFFSETS[ byte ] );
++      }
++
++      while( ++i < _max ) 
++      {
++          byte = bits[i];
++          if ( byte != 0 ) 
++              return ( ( i<<3 ) + BYTE_OFFSETS[ byte ] );
+       }
++      return -1;
+   }
+ 
+ CL_NS_END
--- a/textproc/clucene/files/patch-srccoreCLuceneutilBitSet.h
+++ b/textproc/clucene/files/patch-srccoreCLuceneutilBitSet.h
@@ -0,0 +1,10 @@
+--- src/core/CLucene/util/BitSet.h.orig	2011-03-16 20:21:07.000000000 -0400
+++ src/core/CLucene/util/BitSet.h	2013-03-29 17:57:05.000000000 -0400
+@@ -39,6 +39,7 @@
+   /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
+   bool isSparse();
+   static const uint8_t BYTE_COUNTS[256];
++  static const uint8_t BYTE_OFFSETS[256];
+ protected:
+ 	BitSet( const BitSet& copy );
+