1
0
mirror of https://git.FreeBSD.org/ports.git synced 2024-11-23 00:43:28 +00:00

Libtextcat is a library with functions that implement the classification

technique described in Cavnar & Trenkle, "N-Gram-Based Text Categorization".
It was primarily developed for language guessing, a task on which it is known to
perform with near-perfect accuracy.

WWW: http://software.wise-guys.nl/libtextcat/
This commit is contained in:
Thierry Thomas 2006-12-04 21:45:23 +00:00
parent 465dc561a1
commit 630a0b255a
Notes: svn2git 2021-03-31 03:12:20 +00:00
svn path=/head/; revision=178873
6 changed files with 156 additions and 0 deletions

View File

@ -248,6 +248,7 @@
SUBDIR += libstree
SUBDIR += libtext-charwidth-perl
SUBDIR += libtext-wrapi18n-perl
SUBDIR += libtextcat
SUBDIR += libtranslate
SUBDIR += libtre
SUBDIR += libuninameslist

View File

@ -0,0 +1,39 @@
# New ports collection makefile for: libtextcat
# Date created: Sat 18 nov 2006
# Whom: thierry@pompo.net
#
# $FreeBSD$
#
# Port identity and the site the distfile is fetched from.
PORTNAME= libtextcat
PORTVERSION= 2.2
CATEGORIES= textproc
MASTER_SITES= http://software.wise-guys.nl/download/
MAINTAINER= thierry@FreeBSD.org
COMMENT= Language guessing by N-Gram-Based Text Categorization
# The distribution is configured with its GNU configure script; the build
# triplet is passed explicitly through --build.
GNU_CONFIGURE= yes
CONFIGURE_TARGET= --build=${ARCH}-portbld-freebsd${OSREL}
# The port installs a shared library (lib/libtextcat.so.0 in pkg-plist).
USE_LDCONFIG= yes
# Documentation files copied from ${WRKSRC} into ${DOCSDIR} by post-install.
PORTDOCS= LICENSE README TODO
# Install the extra files shipped by this port after the main install step:
# the public header, the language models (*.lm) with their conf.txt, and,
# unless NOPORTDOCS is defined, the documentation listed in PORTDOCS.
post-install:
	${INSTALL_DATA} ${WRKSRC}/src/textcat.h ${PREFIX}/include/
	${MKDIR} ${DATADIR}/LM
	@${ECHO_MSG} "Installing language models provided in Gertjan van Noord's TextCat package"
	(cd ${WRKSRC}/langclass/LM && \
	${FIND} . -name "*.lm" -exec ${INSTALL_DATA} "{}" "${DATADIR}/LM/{}" \;)
	${INSTALL_DATA} ${WRKSRC}/langclass/conf.txt "${DATADIR}"
# Honour NOPORTDOCS: PORTDOCS entries are only registered in the packing list
# when documentation is enabled, so installing them unconditionally would
# leave unregistered files behind on a docs-disabled install.
.if !defined(NOPORTDOCS)
	${MKDIR} ${DOCSDIR}
	${INSTALL_DATA} ${PORTDOCS:S|^|${WRKSRC}/|} ${DOCSDIR}
.endif
# Run the bundled testtextcat program over every sample text found in
# ${WRKSRC}/langclass/ShortTexts, using the conf.txt from that directory.
# Depends on "build" so that ../src/testtextcat exists before it is invoked.
regression-test: build
	(cd ${WRKSRC}/langclass/ && \
	for t in ShortTexts/*.txt ; do \
		${ECHO_MSG} "Analyzing $$t..." ; \
		../src/testtextcat conf.txt < "$$t" ; \
	done)
.include <bsd.port.mk>

View File

@ -0,0 +1,3 @@
MD5 (libtextcat-2.2.tar.gz) = 128cfc86ed5953e57fe0f5ae98b62c2e
SHA256 (libtextcat-2.2.tar.gz) = 5677badffc48a8d332e345ea4fe225e3577f53fc95deeec8306000b256829655
SIZE (libtextcat-2.2.tar.gz) = 540999

View File

@ -0,0 +1,11 @@
--- src/Makefile.in.orig Thu May 22 13:39:52 2003
+++ src/Makefile.in Sat Nov 18 22:55:18 2006
@@ -126,7 +126,7 @@
WARNS = -W -Wall -Wshadow -Wpointer-arith
IFLAGS =
-FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
+FLAGS = -g -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
VERBOSE = -DVERBOSE
AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS)
AM_LDFLAGS = -g

View File

@ -0,0 +1,17 @@
Libtextcat is a library with functions that implement the classification
technique described in Cavnar & Trenkle, "N-Gram-Based Text Categorization" [1].
It was primarily developed for language guessing, a task on which it is known to
perform with near-perfect accuracy.
The central idea of the Cavnar & Trenkle technique is to calculate a
"fingerprint" of a document with an unknown category, and compare this with the
fingerprints of a number of documents of which the categories are known. The
categories of the closest matches are output as the classification. A
fingerprint is a list of the most frequent n-grams occurring in a document,
ordered by frequency. Fingerprints are compared with a simple out-of-place
metric.
[1] The document that started it all: William B. Cavnar & John M. Trenkle (1994)
N-Gram-Based Text Categorization, <http://citeseer.ist.psu.edu/68861.html>.
WWW: http://software.wise-guys.nl/libtextcat/

View File

@ -0,0 +1,85 @@
bin/createfp
include/textcat.h
lib/libtextcat.a
lib/libtextcat.la
lib/libtextcat.so
lib/libtextcat.so.0
%%DATADIR%%/LM/afrikaans.lm
%%DATADIR%%/LM/albanian.lm
%%DATADIR%%/LM/amharic-utf.lm
%%DATADIR%%/LM/arabic-iso8859_6.lm
%%DATADIR%%/LM/arabic-windows1256.lm
%%DATADIR%%/LM/armenian.lm
%%DATADIR%%/LM/basque.lm
%%DATADIR%%/LM/belarus-windows1251.lm
%%DATADIR%%/LM/bosnian.lm
%%DATADIR%%/LM/breton.lm
%%DATADIR%%/LM/bulgarian-iso8859_5.lm
%%DATADIR%%/LM/catalan.lm
%%DATADIR%%/LM/chinese-big5.lm
%%DATADIR%%/LM/chinese-gb2312.lm
%%DATADIR%%/LM/croatian-ascii.lm
%%DATADIR%%/LM/czech-iso8859_2.lm
%%DATADIR%%/LM/danish.lm
%%DATADIR%%/LM/drents.lm
%%DATADIR%%/LM/dutch.lm
%%DATADIR%%/LM/english.lm
%%DATADIR%%/LM/esperanto.lm
%%DATADIR%%/LM/estonian.lm
%%DATADIR%%/LM/finnish.lm
%%DATADIR%%/LM/french.lm
%%DATADIR%%/LM/frisian.lm
%%DATADIR%%/LM/georgian.lm
%%DATADIR%%/LM/german.lm
%%DATADIR%%/LM/greek-iso8859-7.lm
%%DATADIR%%/LM/hebrew-iso8859_8.lm
%%DATADIR%%/LM/hindi.lm
%%DATADIR%%/LM/hungarian.lm
%%DATADIR%%/LM/icelandic.lm
%%DATADIR%%/LM/indonesian.lm
%%DATADIR%%/LM/irish.lm
%%DATADIR%%/LM/italian.lm
%%DATADIR%%/LM/japanese-euc_jp.lm
%%DATADIR%%/LM/japanese-shift_jis.lm
%%DATADIR%%/LM/korean.lm
%%DATADIR%%/LM/latin.lm
%%DATADIR%%/LM/latvian.lm
%%DATADIR%%/LM/lithuanian.lm
%%DATADIR%%/LM/malay.lm
%%DATADIR%%/LM/manx.lm
%%DATADIR%%/LM/marathi.lm
%%DATADIR%%/LM/middle_frisian.lm
%%DATADIR%%/LM/mingo.lm
%%DATADIR%%/LM/nepali.lm
%%DATADIR%%/LM/norwegian.lm
%%DATADIR%%/LM/persian.lm
%%DATADIR%%/LM/polish.lm
%%DATADIR%%/LM/portuguese.lm
%%DATADIR%%/LM/quechua.lm
%%DATADIR%%/LM/romanian.lm
%%DATADIR%%/LM/rumantsch.lm
%%DATADIR%%/LM/russian-iso8859_5.lm
%%DATADIR%%/LM/russian-koi8_r.lm
%%DATADIR%%/LM/russian-windows1251.lm
%%DATADIR%%/LM/sanskrit.lm
%%DATADIR%%/LM/scots.lm
%%DATADIR%%/LM/scots_gaelic.lm
%%DATADIR%%/LM/serbian-ascii.lm
%%DATADIR%%/LM/slovak-ascii.lm
%%DATADIR%%/LM/slovak-windows1250.lm
%%DATADIR%%/LM/slovenian-ascii.lm
%%DATADIR%%/LM/slovenian-iso8859_2.lm
%%DATADIR%%/LM/spanish.lm
%%DATADIR%%/LM/swahili.lm
%%DATADIR%%/LM/swedish.lm
%%DATADIR%%/LM/tagalog.lm
%%DATADIR%%/LM/tamil.lm
%%DATADIR%%/LM/thai.lm
%%DATADIR%%/LM/turkish.lm
%%DATADIR%%/LM/ukrainian-koi8_r.lm
%%DATADIR%%/LM/vietnamese.lm
%%DATADIR%%/LM/welsh.lm
%%DATADIR%%/LM/yiddish-utf.lm
%%DATADIR%%/conf.txt
@dirrm %%DATADIR%%/LM
@dirrm %%DATADIR%%