From 630a0b255a09d846287b5beb57bf60dbcf38d21b Mon Sep 17 00:00:00 2001
From: Thierry Thomas <thierry@FreeBSD.org>
Date: Mon, 4 Dec 2006 21:45:23 +0000
Subject: [PATCH] Libtextcat is a library with functions that implement the
 classification technique described in Cavnar & Trenkle, "N-Gram-Based Text
 Categorization". It was primarily developed for language guessing, a task on
 which it is known to perform with near-perfect accuracy.

WWW: http://software.wise-guys.nl/libtextcat/
---
Review notes (this area is ignored when the patch is applied):
 - textproc/libtextcat/Makefile: "Date created" year corrected to 2006,
   documentation install guarded by NOPORTDOCS, regression-test now depends
   on build, and the bsd.port.mk include restored.
 - textproc/libtextcat/pkg-descr: dangling ", ." after the reference closed.
 - The "index" blob hashes of these two files were not regenerated.

 textproc/Makefile                             |  1 +
 textproc/libtextcat/Makefile                  | 46 ++++++++++
 textproc/libtextcat/distinfo                  |  3 +
 .../libtextcat/files/patch-src_Makefile.in    | 11 +++
 textproc/libtextcat/pkg-descr                 | 17 ++++
 textproc/libtextcat/pkg-plist                 | 85 +++++++++++++++++++
 6 files changed, 163 insertions(+)
 create mode 100644 textproc/libtextcat/Makefile
 create mode 100644 textproc/libtextcat/distinfo
 create mode 100644 textproc/libtextcat/files/patch-src_Makefile.in
 create mode 100644 textproc/libtextcat/pkg-descr
 create mode 100644 textproc/libtextcat/pkg-plist

diff --git a/textproc/Makefile b/textproc/Makefile
index bd5f2617fb30..47cc6e6a60d3 100644
--- a/textproc/Makefile
+++ b/textproc/Makefile
@@ -248,6 +248,7 @@
     SUBDIR += libstree
     SUBDIR += libtext-charwidth-perl
     SUBDIR += libtext-wrapi18n-perl
+    SUBDIR += libtextcat
     SUBDIR += libtranslate
     SUBDIR += libtre
     SUBDIR += libuninameslist
diff --git a/textproc/libtextcat/Makefile b/textproc/libtextcat/Makefile
new file mode 100644
index 000000000000..9dbb3a0bafad
--- /dev/null
+++ b/textproc/libtextcat/Makefile
@@ -0,0 +1,46 @@
+# New ports collection makefile for:	libtextcat
+# Date created:		Sat 18 nov 2006
+# Whom:			thierry@pompo.net
+#
+# $FreeBSD$
+#
+
+PORTNAME=	libtextcat
+PORTVERSION=	2.2
+CATEGORIES=	textproc
+MASTER_SITES=	http://software.wise-guys.nl/download/
+
+MAINTAINER=	thierry@FreeBSD.org
+COMMENT=	Language guessing by N-Gram-Based Text Categorization
+
+GNU_CONFIGURE=	yes
+CONFIGURE_TARGET=	--build=${ARCH}-portbld-freebsd${OSREL}
+USE_LDCONFIG=	yes
+
+PORTDOCS=	LICENSE README TODO
+
+# After the regular install step, also install the public header, the
+# language models with their configuration file, and the documentation
+# (the latter only when NOPORTDOCS is not defined).
+post-install:
+	${INSTALL_DATA} ${WRKSRC}/src/textcat.h ${PREFIX}/include/
+	${MKDIR} ${DATADIR}/LM
+	@${ECHO_MSG} "Installing language models provided in Gertjan van Noord's TextCat package"
+	(cd ${WRKSRC}/langclass/LM && \
+	${FIND} . -name "*.lm" -exec ${INSTALL_DATA} "{}" "${DATADIR}/LM/{}" \;)
+	${INSTALL_DATA} ${WRKSRC}/langclass/conf.txt "${DATADIR}"
+.if !defined(NOPORTDOCS)
+	${MKDIR} ${DOCSDIR}
+	${INSTALL_DATA} ${PORTDOCS:S|^|${WRKSRC}/|} ${DOCSDIR}
+.endif
+
+# Feed every sample text under langclass/ShortTexts to the test driver.
+# Needs the port to be built first: the driver is run out of ${WRKSRC}/src.
+regression-test: build
+	(cd ${WRKSRC}/langclass/ && \
+	for t in `${LS} ShortTexts/*.txt` ; do \
+		${ECHO_MSG} "Analyzing $$t..." ; \
+		../src/testtextcat conf.txt < $$t ; \
+	done)
+
+.include <bsd.port.mk>
diff --git a/textproc/libtextcat/distinfo b/textproc/libtextcat/distinfo
new file mode 100644
index 000000000000..00bb5f2dcbd5
--- /dev/null
+++ b/textproc/libtextcat/distinfo
@@ -0,0 +1,3 @@
+MD5 (libtextcat-2.2.tar.gz) = 128cfc86ed5953e57fe0f5ae98b62c2e
+SHA256 (libtextcat-2.2.tar.gz) = 5677badffc48a8d332e345ea4fe225e3577f53fc95deeec8306000b256829655
+SIZE (libtextcat-2.2.tar.gz) = 540999
diff --git a/textproc/libtextcat/files/patch-src_Makefile.in b/textproc/libtextcat/files/patch-src_Makefile.in
new file mode 100644
index 000000000000..835d7c67e038
--- /dev/null
+++ b/textproc/libtextcat/files/patch-src_Makefile.in
@@ -0,0 +1,11 @@
+--- src/Makefile.in.orig	Thu May 22 13:39:52 2003
++++ src/Makefile.in	Sat Nov 18 22:55:18 2006
+@@ -126,7 +126,7 @@
+ 
+ WARNS = -W -Wall -Wshadow -Wpointer-arith
+ IFLAGS = 
+-FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
++FLAGS = -g -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
+ VERBOSE = -DVERBOSE
+ AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS)
+ AM_LDFLAGS = -g
diff --git a/textproc/libtextcat/pkg-descr b/textproc/libtextcat/pkg-descr
new file mode 100644
index 000000000000..c0a7e7660e16
--- /dev/null
+++ b/textproc/libtextcat/pkg-descr
@@ -0,0 +1,17 @@
+Libtextcat is a library with functions that implement the classification
+technique described in Cavnar & Trenkle, "N-Gram-Based Text Categorization" [1].
+It was primarily developed for language guessing, a task on which it is known to
+perform with near-perfect accuracy.
+
+The central idea of the Cavnar & Trenkle technique is to calculate a
+"fingerprint" of a document with an unknown category, and compare this with the
+fingerprints of a number of documents of which the categories are known. The
+categories of the closest matches are output as the classification. A
+fingerprint is a list of the most frequent n-grams occurring in a document,
+ordered by frequency. Fingerprints are compared with a simple out-of-place
+metric.
+
+[1] The document that started it all: William B. Cavnar & John M. Trenkle (1994)
+N-Gram-Based Text Categorization.
+
+WWW: http://software.wise-guys.nl/libtextcat/
diff --git a/textproc/libtextcat/pkg-plist b/textproc/libtextcat/pkg-plist
new file mode 100644
index 000000000000..74a45fa7fd43
--- /dev/null
+++ b/textproc/libtextcat/pkg-plist
@@ -0,0 +1,85 @@
+bin/createfp
+include/textcat.h
+lib/libtextcat.a
+lib/libtextcat.la
+lib/libtextcat.so
+lib/libtextcat.so.0
+%%DATADIR%%/LM/afrikaans.lm
+%%DATADIR%%/LM/albanian.lm
+%%DATADIR%%/LM/amharic-utf.lm
+%%DATADIR%%/LM/arabic-iso8859_6.lm
+%%DATADIR%%/LM/arabic-windows1256.lm
+%%DATADIR%%/LM/armenian.lm
+%%DATADIR%%/LM/basque.lm
+%%DATADIR%%/LM/belarus-windows1251.lm
+%%DATADIR%%/LM/bosnian.lm
+%%DATADIR%%/LM/breton.lm
+%%DATADIR%%/LM/bulgarian-iso8859_5.lm
+%%DATADIR%%/LM/catalan.lm
+%%DATADIR%%/LM/chinese-big5.lm
+%%DATADIR%%/LM/chinese-gb2312.lm
+%%DATADIR%%/LM/croatian-ascii.lm
+%%DATADIR%%/LM/czech-iso8859_2.lm
+%%DATADIR%%/LM/danish.lm
+%%DATADIR%%/LM/drents.lm
+%%DATADIR%%/LM/dutch.lm
+%%DATADIR%%/LM/english.lm
+%%DATADIR%%/LM/esperanto.lm
+%%DATADIR%%/LM/estonian.lm
+%%DATADIR%%/LM/finnish.lm
+%%DATADIR%%/LM/french.lm
+%%DATADIR%%/LM/frisian.lm
+%%DATADIR%%/LM/georgian.lm
+%%DATADIR%%/LM/german.lm
+%%DATADIR%%/LM/greek-iso8859-7.lm
+%%DATADIR%%/LM/hebrew-iso8859_8.lm
+%%DATADIR%%/LM/hindi.lm
+%%DATADIR%%/LM/hungarian.lm
+%%DATADIR%%/LM/icelandic.lm
+%%DATADIR%%/LM/indonesian.lm
+%%DATADIR%%/LM/irish.lm
+%%DATADIR%%/LM/italian.lm
+%%DATADIR%%/LM/japanese-euc_jp.lm
+%%DATADIR%%/LM/japanese-shift_jis.lm
+%%DATADIR%%/LM/korean.lm
+%%DATADIR%%/LM/latin.lm
+%%DATADIR%%/LM/latvian.lm
+%%DATADIR%%/LM/lithuanian.lm
+%%DATADIR%%/LM/malay.lm
+%%DATADIR%%/LM/manx.lm
+%%DATADIR%%/LM/marathi.lm
+%%DATADIR%%/LM/middle_frisian.lm
+%%DATADIR%%/LM/mingo.lm
+%%DATADIR%%/LM/nepali.lm
+%%DATADIR%%/LM/norwegian.lm
+%%DATADIR%%/LM/persian.lm
+%%DATADIR%%/LM/polish.lm
+%%DATADIR%%/LM/portuguese.lm
+%%DATADIR%%/LM/quechua.lm
+%%DATADIR%%/LM/romanian.lm
+%%DATADIR%%/LM/rumantsch.lm
+%%DATADIR%%/LM/russian-iso8859_5.lm
+%%DATADIR%%/LM/russian-koi8_r.lm
+%%DATADIR%%/LM/russian-windows1251.lm
+%%DATADIR%%/LM/sanskrit.lm
+%%DATADIR%%/LM/scots.lm
+%%DATADIR%%/LM/scots_gaelic.lm
+%%DATADIR%%/LM/serbian-ascii.lm
+%%DATADIR%%/LM/slovak-ascii.lm
+%%DATADIR%%/LM/slovak-windows1250.lm
+%%DATADIR%%/LM/slovenian-ascii.lm
+%%DATADIR%%/LM/slovenian-iso8859_2.lm
+%%DATADIR%%/LM/spanish.lm
+%%DATADIR%%/LM/swahili.lm
+%%DATADIR%%/LM/swedish.lm
+%%DATADIR%%/LM/tagalog.lm
+%%DATADIR%%/LM/tamil.lm
+%%DATADIR%%/LM/thai.lm
+%%DATADIR%%/LM/turkish.lm
+%%DATADIR%%/LM/ukrainian-koi8_r.lm
+%%DATADIR%%/LM/vietnamese.lm
+%%DATADIR%%/LM/welsh.lm
+%%DATADIR%%/LM/yiddish-utf.lm
+%%DATADIR%%/conf.txt
+@dirrm %%DATADIR%%/LM
+@dirrm %%DATADIR%%