184 lines
3.5 KiB
Nix
184 lines
3.5 KiB
Nix
# Package expression for docling, built with buildPythonPackage from the
# GitHub sources of docling-project/docling at tag "v${version}".
#
# The argument set is grouped the same way the attributes below consume it:
# build system, runtime dependencies, optional (extras) dependencies, tests.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build system
  poetry-core,

  # dependencies
  accelerate,
  beautifulsoup4,
  certifi,
  docling-core,
  docling-ibm-models,
  docling-parse,
  easyocr,
  filetype,
  huggingface-hub,
  lxml,
  marko,
  # ocrmac # not yet packaged
  onnxruntime,
  openpyxl,
  pandas,
  pillow,
  pluggy,
  pydantic,
  pydantic-settings,
  pylatexenc,
  pypdfium2,
  python-docx,
  python-pptx,
  rapidocr-onnxruntime,
  requests,
  rtree,
  scipy,
  tesserocr,
  tqdm,
  transformers,
  typer,

  # optional dependencies
  # mkdocs-click # not yet packaged
  mkdocs-jupyter,
  mkdocs-material,
  mkdocstrings,

  # tests
  pytestCheckHook,
  writableTmpDirAsHomeHook,
}:

buildPythonPackage rec {
  pname = "docling";
  version = "2.41.0";
  pyproject = true;

  # `tag` is derived from `version`; the hash must be updated together with it.
  src = fetchFromGitHub {
    owner = "docling-project";
    repo = "docling";
    tag = "v${version}";
    hash = "sha256-GD052HCqBLs+KUkOUOVdlXxS6+PD2pthGtz+zdQ6QnM=";
  };

  build-system = [
    poetry-core
  ];

  dependencies = [
    accelerate
    beautifulsoup4
    certifi
    docling-core
    docling-ibm-models
    docling-parse
    easyocr
    filetype
    huggingface-hub
    lxml
    marko
    # ocrmac # not yet packaged
    onnxruntime
    openpyxl
    pandas
    pillow
    pluggy
    pydantic
    pydantic-settings
    pylatexenc
    pypdfium2
    python-docx
    python-pptx
    rapidocr-onnxruntime
    requests
    rtree
    scipy
    tesserocr
    tqdm
    transformers
    typer
  ];

  # NOTE(review): presumably the packaged pillow falls outside upstream's
  # declared version range - confirm this is still needed on version bumps.
  pythonRelaxDeps = [
    "pillow"
  ];

  # One attribute per upstream extra. `ocrmac` is kept as an empty list so the
  # extra still exists even though its only dependency is not packaged yet.
  optional-dependencies = {
    ocrmac = [
      # ocrmac # not yet packaged
    ];
    rapidocr = [
      onnxruntime
      rapidocr-onnxruntime
    ];
    tesserocr = [
      tesserocr
    ];

    docs = [
      # mkdocs-click # not yet packaged
      mkdocs-jupyter
      mkdocs-material
      mkdocstrings
      # griffe-pydantic
    ];
  };

  nativeCheckInputs = [
    pytestCheckHook
    writableTmpDirAsHomeHook
  ];

  pythonImportsCheck = [
    "docling"
  ];

  # Each group below is preceded by the failure that motivated disabling it.
  disabledTests = [
    "test_e2e_pdfs_conversions" # AssertionError: ## TableFormer: Table Structure Understanding with Transf
    "test_e2e_conversions" # RuntimeError: Tesseract is not available

    # AssertionError
    # assert doc.export_to_markdown() == pair[1], f"Error in case {idx}"
    "test_ordered_lists"

    # AssertionError: export to md
    "test_e2e_html_conversions"

    # AssertionError: assert 'Unordered li...d code block:' == 'Unordered li...d code block:'
    "test_convert_valid"

    # AssertionError: Markdown file mismatch against groundtruth pftaps057006474.md
    "test_patent_groundtruth"

    # huggingface_hub.errors.LocalEntryNotFoundError: An error happened
    "test_cli_convert"
    "test_code_and_formula_conversion"
    "test_picture_classifier"
    "test_convert_path"
    "test_convert_stream"
    "test_compare_legacy_output"
    "test_ocr_coverage_threshold"
    "test_formula_conversion_with_page_range"

    # requires network access
    "test_page_range"
    "test_parser_backends"
    "test_confidence"
    "test_e2e_webp_conversions"
    "test_asr_pipeline_conversion"

    # AssertionError: pred_itxt==true_itxt
    "test_e2e_valid_csv_conversions"
  ];

  meta = {
    description = "Get your documents ready for gen AI";
    # Same GitHub organisation as `src.owner`, so the links match the fetched source.
    homepage = "https://github.com/docling-project/docling";
    changelog = "https://github.com/docling-project/docling/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ happysalada ];
    mainProgram = "docling";
  };
}
|