textproc/py-sentencepiece: New port: Unsupervised text tokenizer for Neural Network-based text generation

2024-11-19 00:13:33 +00:00 · 2023-01-24 23:45:57 -08:00 · 2023-01-24 23:45:57 -08:00 · 6d884b207a
commit 6d884b207a
parent d8ced9147b
4 changed files with 37 additions and 0 deletions
--- a/textproc/Makefile
+++ b/textproc/Makefile
@ -1496,6 +1496,7 @@
    SUBDIR += py-rst2html5
    SUBDIR += py-rstfmt
    SUBDIR += py-scour
+    SUBDIR += py-sentencepiece
    SUBDIR += py-simplebayes
    SUBDIR += py-smartypants
    SUBDIR += py-snowballstemmer
--- a/textproc/py-sentencepiece/Makefile
+++ b/textproc/py-sentencepiece/Makefile
@ -0,0 +1,26 @@
+PORTNAME=	sentencepiece
+DISTVERSIONPREFIX=	v
+DISTVERSION=	0.1.97
+CATEGORIES=	textproc python # machine-learning
+PKGNAMEPREFIX=	${PYTHON_PKGNAMEPREFIX}

+MAINTAINER=	yuri@FreeBSD.org
+COMMENT=	Unsupervised text tokenizer for Neural Network-based text generation
+WWW=		https://github.com/google/sentencepiece

+LICENSE=	APACHE20
+# WRKSRC is the python/ subdirectory (see WRKSRC_SUBDIR below), so the
+# license file is referenced one directory above it.
+LICENSE_FILE=	${WRKSRC}/../LICENSE

+# Depend on libsentencepiece.so as provided by textproc/sentencepiece.
+LIB_DEPENDS=	libsentencepiece.so:textproc/sentencepiece

+USES=		compiler:c++17-lang pkgconfig python
+USE_PYTHON=	distutils autoplist pytest

+USE_GITHUB=	yes
+GH_ACCOUNT=	google

+# Build from the python/ subdirectory of the GitHub distfile.
+WRKSRC_SUBDIR=	python

+# Put the staged site-packages directory on PYTHONPATH for the test run.
+TEST_ENV=	${MAKE_ENV} PYTHONPATH=${STAGEDIR}${PYTHONPREFIX_SITELIBDIR}

+.include <bsd.port.mk>
--- a/textproc/py-sentencepiece/distinfo
+++ b/textproc/py-sentencepiece/distinfo
@ -0,0 +1,3 @@
+TIMESTAMP = 1673860778
+SHA256 (google-sentencepiece-v0.1.97_GH0.tar.gz) = 41c3a07f315e3ac87605460c8bb8d739955bc8e7f478caec4017ef9b7d78669b
+SIZE (google-sentencepiece-v0.1.97_GH0.tar.gz) = 11945436
--- a/textproc/py-sentencepiece/pkg-descr
+++ b/textproc/py-sentencepiece/pkg-descr
@ -0,0 +1,7 @@
+SentencePiece is an unsupervised text tokenizer and detokenizer mainly for
+Neural Network-based text generation systems where the vocabulary size is
+predetermined prior to the neural model training. SentencePiece implements
+subword units (e.g., byte-pair-encoding (BPE)) and unigram language model
+with the extension of direct training from raw sentences. SentencePiece
+allows us to make a purely end-to-end system that does not depend on
+language-specific pre/postprocessing.