mirror of
https://git.FreeBSD.org/ports.git
synced 2024-11-23 00:43:28 +00:00
Libtextcat is a library with functions that implement the classification
technique described in Cavnar & Trenkle, "N-Gram-Based Text Categorization". It was primarily developed for language guessing, a task on which it is known to perform with near-perfect accuracy. WWW: http://software.wise-guys.nl/libtextcat/
This commit is contained in:
parent
465dc561a1
commit
630a0b255a
Notes:
svn2git
2021-03-31 03:12:20 +00:00
svn path=/head/; revision=178873
@ -248,6 +248,7 @@
|
||||
SUBDIR += libstree
|
||||
SUBDIR += libtext-charwidth-perl
|
||||
SUBDIR += libtext-wrapi18n-perl
|
||||
SUBDIR += libtextcat
|
||||
SUBDIR += libtranslate
|
||||
SUBDIR += libtre
|
||||
SUBDIR += libuninameslist
|
||||
|
39
textproc/libtextcat/Makefile
Normal file
39
textproc/libtextcat/Makefile
Normal file
@ -0,0 +1,39 @@
|
||||
# New ports collection makefile for: libtextcat
|
||||
# Date created: Sat 18 nov 2006
|
||||
# Whom: thierry@pompo.net
|
||||
#
|
||||
# $FreeBSD$
|
||||
#
|
||||
|
||||
# Port identity, fetch location and build knobs for textproc/libtextcat.
#  - PORTNAME/PORTVERSION: together they name the distfile recorded in
#    distinfo (libtextcat-2.2.tar.gz), fetched from MASTER_SITES.
#  - GNU_CONFIGURE/CONFIGURE_TARGET: the sources are configured with an
#    explicit --build triplet assembled from ARCH and OSREL.
#  - USE_LDCONFIG: the port installs a shared library (lib/libtextcat.so.0
#    in pkg-plist).
#  - PORTDOCS: the files that post-install copies from WRKSRC into DOCSDIR.
PORTNAME= libtextcat
|
||||
PORTVERSION= 2.2
|
||||
CATEGORIES= textproc
|
||||
MASTER_SITES= http://software.wise-guys.nl/download/
|
||||
|
||||
MAINTAINER= thierry@FreeBSD.org
|
||||
COMMENT= Language guessing by N-Gram-Based Text Categorization
|
||||
|
||||
GNU_CONFIGURE= yes
|
||||
CONFIGURE_TARGET= --build=${ARCH}-portbld-freebsd${OSREL}
|
||||
USE_LDCONFIG= yes
|
||||
|
||||
PORTDOCS= LICENSE README TODO
|
||||
|
||||
# Extra installation steps run after the regular install target:
#   1. the public header src/textcat.h goes to ${PREFIX}/include;
#   2. every *.lm language model found under langclass/LM, plus
#      langclass/conf.txt, goes below ${DATADIR};
#   3. the files listed in PORTDOCS go to ${DOCSDIR}, unless the user
#      asked for no documentation (NOPORTDOCS).
post-install:
	${INSTALL_DATA} ${WRKSRC}/src/textcat.h ${PREFIX}/include/
	${MKDIR} ${DATADIR}/LM
	@${ECHO_MSG} "Installing language models provided in Gertjan van Noord's TextCat package"
	(cd ${WRKSRC}/langclass/LM && \
		${FIND} . -name "*.lm" -exec ${INSTALL_DATA} "{}" "${DATADIR}/LM/{}" \;)
	${INSTALL_DATA} ${WRKSRC}/langclass/conf.txt "${DATADIR}"
# PORTDOCS entries are left out of the packing list when NOPORTDOCS is
# defined, so the documentation must not be installed in that case either;
# otherwise the files would be left behind on deinstall.
.if !defined(NOPORTDOCS)
	${MKDIR} ${DOCSDIR}
	${INSTALL_DATA} ${PORTDOCS:S|^|${WRKSRC}/|} ${DOCSDIR}
.endif
|
||||
|
||||
# Feed every sample text in langclass/ShortTexts to the testtextcat
# program, using the language-model configuration in langclass/conf.txt.
# The target depends on "build" because ../src/testtextcat is expected to
# be produced by the build step; without it the target fails on a freshly
# extracted work directory.
regression-test: build
	(cd ${WRKSRC}/langclass/ && \
	for t in ShortTexts/*.txt ; do \
		${ECHO_MSG} "Analyzing $$t..." ; \
		../src/testtextcat conf.txt < "$$t" ; \
	done)
|
||||
|
||||
.include <bsd.port.mk>
|
3
textproc/libtextcat/distinfo
Normal file
3
textproc/libtextcat/distinfo
Normal file
@ -0,0 +1,3 @@
|
||||
MD5 (libtextcat-2.2.tar.gz) = 128cfc86ed5953e57fe0f5ae98b62c2e
|
||||
SHA256 (libtextcat-2.2.tar.gz) = 5677badffc48a8d332e345ea4fe225e3577f53fc95deeec8306000b256829655
|
||||
SIZE (libtextcat-2.2.tar.gz) = 540999
|
11
textproc/libtextcat/files/patch-src_Makefile.in
Normal file
11
textproc/libtextcat/files/patch-src_Makefile.in
Normal file
@ -0,0 +1,11 @@
|
||||
--- src/Makefile.in.orig Thu May 22 13:39:52 2003
|
||||
+++ src/Makefile.in Sat Nov 18 22:55:18 2006
|
||||
@@ -126,7 +126,7 @@
|
||||
|
||||
WARNS = -W -Wall -Wshadow -Wpointer-arith
|
||||
IFLAGS =
|
||||
-FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
|
||||
+FLAGS = -g -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
|
||||
VERBOSE = -DVERBOSE
|
||||
AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS)
|
||||
AM_LDFLAGS = -g
|
17
textproc/libtextcat/pkg-descr
Normal file
17
textproc/libtextcat/pkg-descr
Normal file
@ -0,0 +1,17 @@
|
||||
Libtextcat is a library with functions that implement the classification
|
||||
technique described in Cavnar & Trenkle, "N-Gram-Based Text Categorization" [1].
|
||||
It was primarily developed for language guessing, a task on which it is known to
|
||||
perform with near-perfect accuracy.
|
||||
|
||||
The central idea of the Cavnar & Trenkle technique is to calculate a
|
||||
"fingerprint" of a document with an unknown category, and compare this with the
|
||||
fingerprints of a number of documents of which the categories are known. The
|
||||
categories of the closest matches are output as the classification. A
|
||||
fingerprint is a list of the most frequent n-grams occurring in a document,
|
||||
ordered by frequency. Fingerprints are compared with a simple out-of-place
|
||||
metric.
|
||||
|
||||
[1] The document that started it all: William B. Cavnar & John M. Trenkle (1994)
|
||||
N-Gram-Based Text Categorization, <http://citeseer.ist.psu.edu/68861.html>.
|
||||
|
||||
WWW: http://software.wise-guys.nl/libtextcat/
|
85
textproc/libtextcat/pkg-plist
Normal file
85
textproc/libtextcat/pkg-plist
Normal file
@ -0,0 +1,85 @@
|
||||
bin/createfp
|
||||
include/textcat.h
|
||||
lib/libtextcat.a
|
||||
lib/libtextcat.la
|
||||
lib/libtextcat.so
|
||||
lib/libtextcat.so.0
|
||||
%%DATADIR%%/LM/afrikaans.lm
|
||||
%%DATADIR%%/LM/albanian.lm
|
||||
%%DATADIR%%/LM/amharic-utf.lm
|
||||
%%DATADIR%%/LM/arabic-iso8859_6.lm
|
||||
%%DATADIR%%/LM/arabic-windows1256.lm
|
||||
%%DATADIR%%/LM/armenian.lm
|
||||
%%DATADIR%%/LM/basque.lm
|
||||
%%DATADIR%%/LM/belarus-windows1251.lm
|
||||
%%DATADIR%%/LM/bosnian.lm
|
||||
%%DATADIR%%/LM/breton.lm
|
||||
%%DATADIR%%/LM/bulgarian-iso8859_5.lm
|
||||
%%DATADIR%%/LM/catalan.lm
|
||||
%%DATADIR%%/LM/chinese-big5.lm
|
||||
%%DATADIR%%/LM/chinese-gb2312.lm
|
||||
%%DATADIR%%/LM/croatian-ascii.lm
|
||||
%%DATADIR%%/LM/czech-iso8859_2.lm
|
||||
%%DATADIR%%/LM/danish.lm
|
||||
%%DATADIR%%/LM/drents.lm
|
||||
%%DATADIR%%/LM/dutch.lm
|
||||
%%DATADIR%%/LM/english.lm
|
||||
%%DATADIR%%/LM/esperanto.lm
|
||||
%%DATADIR%%/LM/estonian.lm
|
||||
%%DATADIR%%/LM/finnish.lm
|
||||
%%DATADIR%%/LM/french.lm
|
||||
%%DATADIR%%/LM/frisian.lm
|
||||
%%DATADIR%%/LM/georgian.lm
|
||||
%%DATADIR%%/LM/german.lm
|
||||
%%DATADIR%%/LM/greek-iso8859-7.lm
|
||||
%%DATADIR%%/LM/hebrew-iso8859_8.lm
|
||||
%%DATADIR%%/LM/hindi.lm
|
||||
%%DATADIR%%/LM/hungarian.lm
|
||||
%%DATADIR%%/LM/icelandic.lm
|
||||
%%DATADIR%%/LM/indonesian.lm
|
||||
%%DATADIR%%/LM/irish.lm
|
||||
%%DATADIR%%/LM/italian.lm
|
||||
%%DATADIR%%/LM/japanese-euc_jp.lm
|
||||
%%DATADIR%%/LM/japanese-shift_jis.lm
|
||||
%%DATADIR%%/LM/korean.lm
|
||||
%%DATADIR%%/LM/latin.lm
|
||||
%%DATADIR%%/LM/latvian.lm
|
||||
%%DATADIR%%/LM/lithuanian.lm
|
||||
%%DATADIR%%/LM/malay.lm
|
||||
%%DATADIR%%/LM/manx.lm
|
||||
%%DATADIR%%/LM/marathi.lm
|
||||
%%DATADIR%%/LM/middle_frisian.lm
|
||||
%%DATADIR%%/LM/mingo.lm
|
||||
%%DATADIR%%/LM/nepali.lm
|
||||
%%DATADIR%%/LM/norwegian.lm
|
||||
%%DATADIR%%/LM/persian.lm
|
||||
%%DATADIR%%/LM/polish.lm
|
||||
%%DATADIR%%/LM/portuguese.lm
|
||||
%%DATADIR%%/LM/quechua.lm
|
||||
%%DATADIR%%/LM/romanian.lm
|
||||
%%DATADIR%%/LM/rumantsch.lm
|
||||
%%DATADIR%%/LM/russian-iso8859_5.lm
|
||||
%%DATADIR%%/LM/russian-koi8_r.lm
|
||||
%%DATADIR%%/LM/russian-windows1251.lm
|
||||
%%DATADIR%%/LM/sanskrit.lm
|
||||
%%DATADIR%%/LM/scots.lm
|
||||
%%DATADIR%%/LM/scots_gaelic.lm
|
||||
%%DATADIR%%/LM/serbian-ascii.lm
|
||||
%%DATADIR%%/LM/slovak-ascii.lm
|
||||
%%DATADIR%%/LM/slovak-windows1250.lm
|
||||
%%DATADIR%%/LM/slovenian-ascii.lm
|
||||
%%DATADIR%%/LM/slovenian-iso8859_2.lm
|
||||
%%DATADIR%%/LM/spanish.lm
|
||||
%%DATADIR%%/LM/swahili.lm
|
||||
%%DATADIR%%/LM/swedish.lm
|
||||
%%DATADIR%%/LM/tagalog.lm
|
||||
%%DATADIR%%/LM/tamil.lm
|
||||
%%DATADIR%%/LM/thai.lm
|
||||
%%DATADIR%%/LM/turkish.lm
|
||||
%%DATADIR%%/LM/ukrainian-koi8_r.lm
|
||||
%%DATADIR%%/LM/vietnamese.lm
|
||||
%%DATADIR%%/LM/welsh.lm
|
||||
%%DATADIR%%/LM/yiddish-utf.lm
|
||||
%%DATADIR%%/conf.txt
|
||||
@dirrm %%DATADIR%%/LM
|
||||
@dirrm %%DATADIR%%
|
Loading…
Reference in New Issue
Block a user