mirror of https://git.FreeBSD.org/ports.git
synced 2024-11-18 00:10:04 +00:00
misc/py-datasets: New port: HuggingFace community-driven open-source library of datasets
parent 6da75bae39
commit a7b24d1d42
misc/Makefile
@@ -417,6 +417,7 @@
     SUBDIR += py-colorbrewer
     SUBDIR += py-colored
     SUBDIR += py-crudini
+    SUBDIR += py-datasets
     SUBDIR += py-detecta
     SUBDIR += py-dictdiffer
     SUBDIR += py-eemeter

misc/py-datasets/Makefile (new file)
@@ -0,0 +1,64 @@
PORTNAME=	datasets
DISTVERSION=	2.20.0
CATEGORIES=	misc python # machine-learning
MASTER_SITES=	PYPI
PKGNAMEPREFIX=	${PYTHON_PKGNAMEPREFIX}

MAINTAINER=	yuri@FreeBSD.org
COMMENT=	HuggingFace community-driven open-source library of datasets
WWW=		https://huggingface.co/docs/datasets/index

LICENSE=	MIT
LICENSE_FILE=	${WRKSRC}/LICENSE

BUILD_DEPENDS=	${PYTHON_PKGNAMEPREFIX}pyproject_hooks>0:devel/py-pyproject_hooks@${PY_FLAVOR} \
		${PY_SETUPTOOLS} \
		${PYTHON_PKGNAMEPREFIX}wheel>0:devel/py-wheel@${PY_FLAVOR}
RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}aiohttp>0:www/py-aiohttp@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}dill>0.3.0<0.3.9:devel/py-dill@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}filelock>0:sysutils/py-filelock@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}fsspec>=2023.1.0:devel/py-fsspec@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}huggingface-hub>0.21.2:misc/py-huggingface-hub@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}multiprocess>0:devel/py-multiprocess@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}numpy>=1.17:math/py-numpy@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}packaging>0:devel/py-packaging@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}pandas>0:math/py-pandas@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}pyarrow>=15.0.0:databases/py-pyarrow@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}pyyaml>=5.1:devel/py-pyyaml@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}requests>=2.32.2:www/py-requests@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}tqdm>=4.66.3:misc/py-tqdm@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}xxhash>0:devel/py-xxhash@${PY_FLAVOR}
RUN_DEPENDS+=	${PYTHON_PKGNAMEPREFIX}librosa>0:audio/py-librosa@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}SoundFile>=0.12.1:audio/py-SoundFile@${PY_FLAVOR}
RUN_DEPENDS+=	${PY_PILLOW}
TEST_DEPENDS=	${PYTHON_PKGNAMEPREFIX}absl-py>=0:devel/py-absl-py@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}elasticsearch>0:textproc/py-elasticsearch@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}faiss>=1.6.4:math/py-faiss@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}jax>=0.3.14:math/py-jax@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}jiwer>0:misc/py-jiwer@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}joblib>=1.3.0:devel/py-joblib@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}lz4>=0:archivers/py-lz4@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}polars>=0.20.0:misc/py-polars@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}protobuf>=4.0.0:devel/py-protobuf@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}pytest-datadir>=0:devel/py-pytest-datadir@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}pytest-xdist>=0:devel/py-pytest-xdist@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}pytest>=0:devel/py-pytest@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}pytorch>=2.0.0:misc/py-pytorch@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}rarfile>=4.0:archivers/py-rarfile@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}s3fs>=2021.11.1:devel/py-s3fs@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}tiktoken>=0:textproc/py-tiktoken@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}typing-extensions>=4.6.1:devel/py-typing-extensions@${PY_FLAVOR} \
		${PYTHON_PKGNAMEPREFIX}zstandard>=0:archivers/py-zstandard@${PY_FLAVOR}
# missing TEST_DEPENDS: jaxlib, joblibspark, py7zr, pyspark, tensorflow

USES=		python
USE_PYTHON=	pep517 concurrent autoplist pytest

TEST_ENV=	${MAKE_ENV} PYTHONPATH=${STAGEDIR}${PYTHONPREFIX_SITELIBDIR}

NO_ARCH=	yes

pre-test: # prevent failure due to missing pyspark
	@${RM} ${WRKSRC}/tests/packaged_modules/test_spark.py

.include <bsd.port.mk>

misc/py-datasets/distinfo (new file)
@@ -0,0 +1,3 @@
TIMESTAMP = 1722803032
SHA256 (datasets-2.20.0.tar.gz) = 3c4dbcd27e0f642b9d41d20ff2efa721a5e04b32b2ca4009e0fc9139e324553f
SIZE (datasets-2.20.0.tar.gz) = 2225757

misc/py-datasets/files/patch-setup.py (new file)
@@ -0,0 +1,11 @@
--- setup.py.orig	2024-08-05 18:50:31 UTC
+++ setup.py
@@ -115,8 +115,6 @@ REQUIRED_PKGS = [
     # Backend and serialization.
     # Minimum 15.0.0 to be able to cast dictionary types to their underlying types
     "pyarrow>=15.0.0",
-    # As long as we allow pyarrow < 14.0.1, to fix vulnerability CVE-2023-47248
-    "pyarrow-hotfix",
     # For smart caching dataset processing
     "dill>=0.3.0,<0.3.9",  # tmp pin until dill has official support for determinism see https://github.com/uqfoundation/dill/issues/19
     # For performance gains with apache arrow

misc/py-datasets/files/patch-src_datasets_features_features.py (new file)
@@ -0,0 +1,10 @@
--- src/datasets/features/features.py.orig	2024-08-05 18:52:07 UTC
+++ src/datasets/features/features.py
@@ -32,7 +32,6 @@ import pyarrow.types
 import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.types
-import pyarrow_hotfix  # noqa: F401  # to fix vulnerability on pyarrow<14.0.1
 from pandas.api.extensions import ExtensionArray as PandasExtensionArray
 from pandas.api.extensions import ExtensionDtype as PandasExtensionDtype
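
Both patches drop pyarrow_hotfix: the hotfix only matters for pyarrow older
than 14.0.1 (CVE-2023-47248), and this port already requires pyarrow>=15.0.0.
A minimal illustrative sketch of the version guard the removal relies on (the
snippet below is not part of the port, just a demonstration of the reasoning):

    # Illustrative only: pyarrow_hotfix is needed solely on pyarrow < 14.0.1.
    import pyarrow as pa
    from packaging.version import Version

    if Version(pa.__version__) < Version("14.0.1"):
        # Only here would the removed "import pyarrow_hotfix" be required
        # to mitigate CVE-2023-47248.
        raise RuntimeError(f"pyarrow {pa.__version__} needs pyarrow-hotfix")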

misc/py-datasets/pkg-descr (new file)
@@ -0,0 +1,9 @@
Datasets is a library for easily accessing and sharing datasets for Audio,
Computer Vision, and Natural Language Processing (NLP) tasks.

Load a dataset in a single line of code, and use our powerful data processing
methods to quickly get your dataset ready for training in a deep learning model.
Backed by the Apache Arrow format, process large datasets with zero-copy reads
without any memory constraints for optimal speed and efficiency. We also feature
a deep integration with the Hugging Face Hub, allowing you to easily load and
share a dataset with the wider machine learning community.
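
As a concrete illustration of the one-line loading the description mentions,
a minimal sketch against the public datasets API ("rotten_tomatoes" is merely
an example of a dataset hosted on the Hub, not something this port ships):

    # Minimal sketch of the datasets API summarized above.
    from datasets import load_dataset

    # Load a dataset from the Hugging Face Hub in a single line of code.
    ds = load_dataset("rotten_tomatoes", split="train")

    # Arrow-backed processing: derive a new column via map() without
    # materializing the whole dataset in memory at once.
    ds = ds.map(lambda row: {"n_words": len(row["text"].split())})
    print(ds[0]["text"], ds[0]["n_words"])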