From 630a0b255a09d846287b5beb57bf60dbcf38d21b Mon Sep 17 00:00:00 2001
From: Thierry Thomas <thierry@FreeBSD.org>
Date: Mon, 4 Dec 2006 21:45:23 +0000
Subject: [PATCH] Libtextcat is a library with functions that implement the
 classification technique described in Cavnar & Trenkle, "N-Gram-Based Text
 Categorization". It was primarily developed for language guessing, a task on
 which it is known to perform with near-perfect accuracy.

WWW: http://software.wise-guys.nl/libtextcat/
---
 textproc/Makefile                             |  1 +
 textproc/libtextcat/Makefile                  | 39 +++++++++
 textproc/libtextcat/distinfo                  |  3 +
 .../libtextcat/files/patch-src_Makefile.in    | 11 +++
 textproc/libtextcat/pkg-descr                 | 17 ++++
 textproc/libtextcat/pkg-plist                 | 85 +++++++++++++++++++
 6 files changed, 156 insertions(+)
 create mode 100644 textproc/libtextcat/Makefile
 create mode 100644 textproc/libtextcat/distinfo
 create mode 100644 textproc/libtextcat/files/patch-src_Makefile.in
 create mode 100644 textproc/libtextcat/pkg-descr
 create mode 100644 textproc/libtextcat/pkg-plist

diff --git a/textproc/Makefile b/textproc/Makefile
index bd5f2617fb30..47cc6e6a60d3 100644
--- a/textproc/Makefile
+++ b/textproc/Makefile
@@ -248,6 +248,7 @@
     SUBDIR += libstree
     SUBDIR += libtext-charwidth-perl
     SUBDIR += libtext-wrapi18n-perl
+    SUBDIR += libtextcat
     SUBDIR += libtranslate
     SUBDIR += libtre
     SUBDIR += libuninameslist
diff --git a/textproc/libtextcat/Makefile b/textproc/libtextcat/Makefile
new file mode 100644
index 000000000000..9dbb3a0bafad
--- /dev/null
+++ b/textproc/libtextcat/Makefile
@@ -0,0 +1,39 @@
+# New ports collection makefile for:	libtextcat
+# Date created:		Sat 18 nov 2006
+# Whom:			thierry@pompo.net
+#
+# $FreeBSD$
+#
+
+PORTNAME=	libtextcat
+PORTVERSION=	2.2
+CATEGORIES=	textproc
+MASTER_SITES=	http://software.wise-guys.nl/download/
+
+MAINTAINER=	thierry@FreeBSD.org
+COMMENT=	Language guessing by N-Gram-Based Text Categorization
+
+GNU_CONFIGURE=	yes
+CONFIGURE_TARGET=	--build=${ARCH}-portbld-freebsd${OSREL}	# explicit --build triplet for the GNU configure run
+USE_LDCONFIG=	yes
+
+PORTDOCS=	LICENSE README TODO
+
+post-install:	# install the public header, language models, conf.txt and (optionally) docs
+	${INSTALL_DATA} ${WRKSRC}/src/textcat.h ${PREFIX}/include/
+	${MKDIR} ${DATADIR}/LM
+	@${ECHO_MSG} "Installing language models provided in Gertjan van Noord's TextCat package"
+	${INSTALL_DATA} ${WRKSRC}/langclass/LM/*.lm ${DATADIR}/LM/
+	${INSTALL_DATA} ${WRKSRC}/langclass/conf.txt "${DATADIR}"
+.if !defined(NOPORTDOCS)	# PORTDOCS are only tracked in the packing list when docs are enabled
+	${MKDIR} ${DOCSDIR} && ${INSTALL_DATA} ${PORTDOCS:S|^|${WRKSRC}/|} ${DOCSDIR}
+.endif
+
+regression-test: build	# classify every bundled sample text; needs the built testtextcat, stops at the first failure
+	(cd ${WRKSRC}/langclass/ &&		\
+	for t in ShortTexts/*.txt ; do		\
+	${ECHO_MSG} "Analyzing $$t..." ;	\
+	../src/testtextcat conf.txt < "$$t" || exit 1 ;	\
+	done)
+
+.include <bsd.port.mk>
diff --git a/textproc/libtextcat/distinfo b/textproc/libtextcat/distinfo
new file mode 100644
index 000000000000..00bb5f2dcbd5
--- /dev/null
+++ b/textproc/libtextcat/distinfo
@@ -0,0 +1,3 @@
+MD5 (libtextcat-2.2.tar.gz) = 128cfc86ed5953e57fe0f5ae98b62c2e
+SHA256 (libtextcat-2.2.tar.gz) = 5677badffc48a8d332e345ea4fe225e3577f53fc95deeec8306000b256829655
+SIZE (libtextcat-2.2.tar.gz) = 540999
diff --git a/textproc/libtextcat/files/patch-src_Makefile.in b/textproc/libtextcat/files/patch-src_Makefile.in
new file mode 100644
index 000000000000..835d7c67e038
--- /dev/null
+++ b/textproc/libtextcat/files/patch-src_Makefile.in
@@ -0,0 +1,11 @@
+--- src/Makefile.in.orig	Thu May 22 13:39:52 2003
++++ src/Makefile.in	Sat Nov 18 22:55:18 2006
+@@ -126,7 +126,7 @@
+ 
+ WARNS = -W -Wall -Wshadow -Wpointer-arith 
+ IFLAGS = 
+-FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
++FLAGS = -g -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
+ VERBOSE = -DVERBOSE
+ AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS)
+ AM_LDFLAGS = -g
diff --git a/textproc/libtextcat/pkg-descr b/textproc/libtextcat/pkg-descr
new file mode 100644
index 000000000000..c0a7e7660e16
--- /dev/null
+++ b/textproc/libtextcat/pkg-descr
@@ -0,0 +1,17 @@
+Libtextcat is a library with functions that implement the classification
+technique described in Cavnar & Trenkle, "N-Gram-Based Text Categorization" [1].
+It was primarily developed for language guessing, a task on which it is known to
+perform with near-perfect accuracy.
+
+The central idea of the Cavnar & Trenkle technique is to calculate a
+"fingerprint" of a document with an unknown category, and compare this with the
+fingerprints of a number of documents of which the categories are known. The
+categories of the closest matches are output as the classification. A
+fingerprint is a list of the most frequent n-grams occurring in a document,
+ordered by frequency. Fingerprints are compared with a simple out-of-place
+metric.
+
+[1] The document that started it all: William B. Cavnar & John M. Trenkle (1994)
+N-Gram-Based Text Categorization, <http://citeseer.ist.psu.edu/68861.html>.
+
+WWW: http://software.wise-guys.nl/libtextcat/
diff --git a/textproc/libtextcat/pkg-plist b/textproc/libtextcat/pkg-plist
new file mode 100644
index 000000000000..74a45fa7fd43
--- /dev/null
+++ b/textproc/libtextcat/pkg-plist
@@ -0,0 +1,85 @@
+bin/createfp
+include/textcat.h
+lib/libtextcat.a
+lib/libtextcat.la
+lib/libtextcat.so
+lib/libtextcat.so.0
+%%DATADIR%%/LM/afrikaans.lm
+%%DATADIR%%/LM/albanian.lm
+%%DATADIR%%/LM/amharic-utf.lm
+%%DATADIR%%/LM/arabic-iso8859_6.lm
+%%DATADIR%%/LM/arabic-windows1256.lm
+%%DATADIR%%/LM/armenian.lm
+%%DATADIR%%/LM/basque.lm
+%%DATADIR%%/LM/belarus-windows1251.lm
+%%DATADIR%%/LM/bosnian.lm
+%%DATADIR%%/LM/breton.lm
+%%DATADIR%%/LM/bulgarian-iso8859_5.lm
+%%DATADIR%%/LM/catalan.lm
+%%DATADIR%%/LM/chinese-big5.lm
+%%DATADIR%%/LM/chinese-gb2312.lm
+%%DATADIR%%/LM/croatian-ascii.lm
+%%DATADIR%%/LM/czech-iso8859_2.lm
+%%DATADIR%%/LM/danish.lm
+%%DATADIR%%/LM/drents.lm
+%%DATADIR%%/LM/dutch.lm
+%%DATADIR%%/LM/english.lm
+%%DATADIR%%/LM/esperanto.lm
+%%DATADIR%%/LM/estonian.lm
+%%DATADIR%%/LM/finnish.lm
+%%DATADIR%%/LM/french.lm
+%%DATADIR%%/LM/frisian.lm
+%%DATADIR%%/LM/georgian.lm
+%%DATADIR%%/LM/german.lm
+%%DATADIR%%/LM/greek-iso8859-7.lm
+%%DATADIR%%/LM/hebrew-iso8859_8.lm
+%%DATADIR%%/LM/hindi.lm
+%%DATADIR%%/LM/hungarian.lm
+%%DATADIR%%/LM/icelandic.lm
+%%DATADIR%%/LM/indonesian.lm
+%%DATADIR%%/LM/irish.lm
+%%DATADIR%%/LM/italian.lm
+%%DATADIR%%/LM/japanese-euc_jp.lm
+%%DATADIR%%/LM/japanese-shift_jis.lm
+%%DATADIR%%/LM/korean.lm
+%%DATADIR%%/LM/latin.lm
+%%DATADIR%%/LM/latvian.lm
+%%DATADIR%%/LM/lithuanian.lm
+%%DATADIR%%/LM/malay.lm
+%%DATADIR%%/LM/manx.lm
+%%DATADIR%%/LM/marathi.lm
+%%DATADIR%%/LM/middle_frisian.lm
+%%DATADIR%%/LM/mingo.lm
+%%DATADIR%%/LM/nepali.lm
+%%DATADIR%%/LM/norwegian.lm
+%%DATADIR%%/LM/persian.lm
+%%DATADIR%%/LM/polish.lm
+%%DATADIR%%/LM/portuguese.lm
+%%DATADIR%%/LM/quechua.lm
+%%DATADIR%%/LM/romanian.lm
+%%DATADIR%%/LM/rumantsch.lm
+%%DATADIR%%/LM/russian-iso8859_5.lm
+%%DATADIR%%/LM/russian-koi8_r.lm
+%%DATADIR%%/LM/russian-windows1251.lm
+%%DATADIR%%/LM/sanskrit.lm
+%%DATADIR%%/LM/scots.lm
+%%DATADIR%%/LM/scots_gaelic.lm
+%%DATADIR%%/LM/serbian-ascii.lm
+%%DATADIR%%/LM/slovak-ascii.lm
+%%DATADIR%%/LM/slovak-windows1250.lm
+%%DATADIR%%/LM/slovenian-ascii.lm
+%%DATADIR%%/LM/slovenian-iso8859_2.lm
+%%DATADIR%%/LM/spanish.lm
+%%DATADIR%%/LM/swahili.lm
+%%DATADIR%%/LM/swedish.lm
+%%DATADIR%%/LM/tagalog.lm
+%%DATADIR%%/LM/tamil.lm
+%%DATADIR%%/LM/thai.lm
+%%DATADIR%%/LM/turkish.lm
+%%DATADIR%%/LM/ukrainian-koi8_r.lm
+%%DATADIR%%/LM/vietnamese.lm
+%%DATADIR%%/LM/welsh.lm
+%%DATADIR%%/LM/yiddish-utf.lm
+%%DATADIR%%/conf.txt
+@dirrm %%DATADIR%%/LM
+@dirrm %%DATADIR%%