# Nix package expression for llama-index-core (nixpkgs buildPythonPackage).

{
  lib,
  aiohttp,
  aiosqlite,
  banks,
  buildPythonPackage,
  dataclasses-json,
  deprecated,
  dirtyjson,
  fetchFromGitHub,
  filetype,
  fsspec,
  hatchling,
  jsonpath-ng,
  llama-index-workflows,
  llamaindex-py-client,
  nest-asyncio,
  networkx,
  nltk-data,
  nltk,
  numpy,
  openai,
  pandas,
  pillow,
  pytest-asyncio,
  pytest-mock,
  pytestCheckHook,
  pythonOlder,
  pyvis,
  pyyaml,
  requests,
  spacy,
  sqlalchemy,
  tenacity,
  tiktoken,
  tree-sitter,
  typing-inspect,
}:

buildPythonPackage rec {
  pname = "llama-index-core";
  version = "0.12.46";
  pyproject = true;

  disabled = pythonOlder "3.9";

  src = fetchFromGitHub {
    owner = "run-llama";
    repo = "llama_index";
    tag = "v${version}";
    hash = "sha256-B1i5zabacapc/ipPTQtQzLVZql5ifqxfFoDhaBR+eYc=";
  };

  # The upstream repository is a monorepo; build only the llama-index-core
  # subdirectory (its name matches pname).
  sourceRoot = "${src.name}/${pname}";

  # When `llama-index` is imported, it uses `nltk` to look for the following files and tries to
  # download them if they aren't present.
  # https://github.com/run-llama/llama_index/blob/6efa53cebd5c8ccf363582c932fffde44d61332e/llama-index-core/llama_index/core/utils.py#L59-L67
  # Setting `NLTK_DATA` to a writable path can also solve this problem, but it needs to be done in
  # every package that depends on `llama-index-core` for `pythonImportsCheck` not to fail, so this
  # solution seems more elegant.
  postPatch = ''
    mkdir -p llama_index/core/_static/nltk_cache/corpora/stopwords/
    cp -r ${nltk-data.stopwords}/corpora/stopwords/* llama_index/core/_static/nltk_cache/corpora/stopwords/
    mkdir -p llama_index/core/_static/nltk_cache/tokenizers/punkt/
    cp -r ${nltk-data.punkt}/tokenizers/punkt/* llama_index/core/_static/nltk_cache/tokenizers/punkt/
  '';

  # Loosen upstream's overly strict version pins on these dependencies.
  pythonRelaxDeps = [
    "setuptools"
    "tenacity"
  ];

  build-system = [ hatchling ];

  dependencies = [
    aiohttp
    aiosqlite
    banks
    dataclasses-json
    deprecated
    dirtyjson
    filetype
    fsspec
    jsonpath-ng
    llama-index-workflows
    llamaindex-py-client
    nest-asyncio
    networkx
    nltk
    numpy
    openai
    pandas
    pillow
    pyvis
    pyyaml
    requests
    spacy
    sqlalchemy
    tenacity
    tiktoken
    typing-inspect
  ];

  nativeCheckInputs = [
    pytest-asyncio
    pytest-mock
    pytestCheckHook
    tree-sitter
  ];

  pythonImportsCheck = [ "llama_index" ];

  disabledTestPaths = [
    # Tests require network access
    "tests/agent/"
    "tests/callbacks/"
    "tests/chat_engine/"
    "tests/evaluation/"
    "tests/indices/"
    "tests/ingestion/"
    "tests/memory/"
    "tests/multi_modal_llms/"
    "tests/node_parser/"
    "tests/objects/"
    "tests/playground/"
    "tests/postprocessor/"
    "tests/query_engine/"
    "tests/question_gen/"
    "tests/response_synthesizers/"
    "tests/retrievers/"
    "tests/schema/"
    "tests/selectors/"
    "tests/test_utils.py"
    "tests/text_splitter/"
    "tests/token_predictor/"
    "tests/tools/"
  ];

  disabledTests = [
    # Tests require network access
    "test_context_extraction_basic"
    "test_context_extraction_custom_prompt"
    "test_context_extraction_oversized_document"
    "test_document_block_from_b64"
    "test_document_block_from_bytes"
    "test_document_block_from_path"
    "test_document_block_from_url"
    "test_from_namespaced_persist_dir"
    "test_from_persist_dir"
    "test_mimetype_raw_data"
    "test_multiple_documents_context"
    "test_resource"
    # asyncio.exceptions.InvalidStateError: invalid state
    "test_workflow_context_to_dict_mid_run"
    "test_SimpleDirectoryReader"
    # RuntimeError
    "test_str"
  ];

  meta = {
    description = "Data framework for your LLM applications";
    homepage = "https://github.com/run-llama/llama_index/";
    changelog = "https://github.com/run-llama/llama_index/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ fab ];
  };
}