mirror of
https://git.FreeBSD.org/ports.git
synced 2024-11-19 00:13:33 +00:00
textproc/py-sentencepiece: New port: Unsupervised text tokenizer for Neural Network-based text generation
This commit is contained in:
parent
d8ced9147b
commit
6d884b207a
@@ -1496,6 +1496,7 @@
    SUBDIR += py-rst2html5
    SUBDIR += py-rstfmt
    SUBDIR += py-scour
    SUBDIR += py-sentencepiece
    SUBDIR += py-simplebayes
    SUBDIR += py-smartypants
    SUBDIR += py-snowballstemmer
|
26
textproc/py-sentencepiece/Makefile
Normal file
26
textproc/py-sentencepiece/Makefile
Normal file
@ -0,0 +1,26 @@
# Port Makefile for py-sentencepiece: Python bindings for the
# sentencepiece unsupervised text tokenizer.
PORTNAME=	sentencepiece
DISTVERSIONPREFIX=	v
DISTVERSION=	0.1.97
CATEGORIES=	textproc # machine-learning
PKGNAMEPREFIX=	${PYTHON_PKGNAMEPREFIX}

MAINTAINER=	yuri@FreeBSD.org
COMMENT=	Unsupervised text tokenizer for Neural Network-based text generation
WWW=		https://github.com/google/sentencepiece

LICENSE=	APACHE20
# LICENSE sits one level above WRKSRC because WRKSRC_SUBDIR=python
# points the work directory at the python/ subdirectory of the repo.
LICENSE_FILE=	${WRKSRC}/../LICENSE

# Links against the C++ library from the textproc/sentencepiece port.
LIB_DEPENDS=	libsentencepiece.so:textproc/sentencepiece

USES=		compiler:c++17-lang pkgconfig python
USE_PYTHON=	distutils autoplist pytest

USE_GITHUB=	yes
GH_ACCOUNT=	google

# Build only the Python bindings subdirectory of the upstream repo.
WRKSRC_SUBDIR=	python

# Let the test target import the freshly staged module.
TEST_ENV=	${MAKE_ENV} PYTHONPATH=${STAGEDIR}${PYTHONPREFIX_SITELIBDIR}

.include <bsd.port.mk>
3
textproc/py-sentencepiece/distinfo
Normal file
3
textproc/py-sentencepiece/distinfo
Normal file
@ -0,0 +1,3 @@
TIMESTAMP = 1673860778
SHA256 (google-sentencepiece-v0.1.97_GH0.tar.gz) = 41c3a07f315e3ac87605460c8bb8d739955bc8e7f478caec4017ef9b7d78669b
SIZE (google-sentencepiece-v0.1.97_GH0.tar.gz) = 11945436
7
textproc/py-sentencepiece/pkg-descr
Normal file
7
textproc/py-sentencepiece/pkg-descr
Normal file
@ -0,0 +1,7 @@
SentencePiece is an unsupervised text tokenizer and detokenizer mainly for
Neural Network-based text generation systems where the vocabulary size is
predetermined prior to the neural model training. SentencePiece implements
subword units (e.g., byte-pair-encoding (BPE)) and unigram language model
with the extension of direct training from raw sentences. SentencePiece
allows us to make a purely end-to-end system that does not depend on
language-specific pre/postprocessing.
Loading…
Reference in New Issue
Block a user