python3Packages.vllm: 0.6.2 -> 0.7.1 (#379165)

commit ab36ef17c8
Pavol Rusnak, 2025-02-04 15:09:02 +01:00, committed by GitHub
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
8 changed files with 361 additions and 66 deletions

pkgs/development/python-modules/mistral-common/default.nix

@@ -6,6 +6,7 @@
numpy,
pydantic,
jsonschema,
+ opencv-python-headless,
sentencepiece,
typing-extensions,
tiktoken,
@@ -37,6 +38,7 @@ buildPythonPackage rec {
numpy
pydantic
jsonschema
+ opencv-python-headless
sentencepiece
typing-extensions
tiktoken
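vllm 0.7.x consumes images through mistral_common[opencv], which is presumably why opencv-python-headless appears both here and in vllm's own dependency list below.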

pkgs/development/python-modules/vllm/0001-setup.py-don-t-ask-for-hipcc-version.patch (deleted)

@@ -1,24 +0,0 @@
From f6a7748bee79fc2e1898968fef844daacfa7860b Mon Sep 17 00:00:00 2001
From: SomeoneSerge <else@someonex.net>
Date: Wed, 31 Jul 2024 12:02:53 +0000
Subject: [PATCH 1/2] setup.py: don't ask for hipcc --version
---
setup.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/setup.py b/setup.py
index 72ef26f1..01e006f9 100644
--- a/setup.py
+++ b/setup.py
@@ -279,6 +279,7 @@ def _install_punica() -> bool:
def get_hipcc_rocm_version():
+ return "0.0" # `hipcc --version` misbehaves ("unresolved paths") inside the nix sandbox
# Run the hipcc --version command
result = subprocess.run(['hipcc', '--version'],
stdout=subprocess.PIPE,
--
2.45.1

pkgs/development/python-modules/vllm/0003-propagate-pythonpath.patch (new)

@@ -0,0 +1,12 @@
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index f5a02a5b..e830f987 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -482,6 +482,7 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T:
returned = subprocess.run(
[sys.executable, "-m", "vllm.model_executor.models.registry"],
input=input_bytes,
+ env={'PYTHONPATH': ':'.join(sys.path)},
capture_output=True)
# check if the subprocess is successful

pkgs/development/python-modules/vllm/0004-drop-lsmod.patch (new)

@@ -0,0 +1,18 @@
--- a/setup.py
+++ b/setup.py
@@ -340,14 +340,7 @@ def _is_hpu() -> bool:
out = subprocess.run(["hl-smi"], capture_output=True, check=True)
is_hpu_available = out.returncode == 0
except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
- if sys.platform.startswith("linux"):
- try:
- output = subprocess.check_output(
- 'lsmod | grep habanalabs | wc -l', shell=True)
- is_hpu_available = int(output) > 0
- except (ValueError, FileNotFoundError, PermissionError,
- subprocess.CalledProcessError):
- pass
+ is_hpu_available = False
return is_hpu_available
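The deleted fallback shelled out to lsmod to look for the habanalabs kernel module; that probe cannot work inside the Nix build sandbox (and is impure in any case), so HPU detection is simply pinned to False.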

pkgs/development/python-modules/vllm/default.nix

@@ -5,14 +5,21 @@
buildPythonPackage,
pythonRelaxDepsHook,
fetchFromGitHub,
+ symlinkJoin,
+ autoAddDriverRunpath,
+ # build system
+ packaging,
+ setuptools,
+ wheel,
+ # dependencies
which,
ninja,
cmake,
- packaging,
- setuptools,
+ setuptools-scm,
torch,
outlines,
- wheel,
psutil,
ray,
pandas,
@@ -21,43 +28,174 @@
numpy,
transformers,
xformers,
+ xgrammar,
fastapi,
uvicorn,
pydantic,
aioprometheus,
pynvml,
openai,
pyzmq,
tiktoken,
torchaudio,
torchvision,
py-cpuinfo,
lm-format-enforcer,
prometheus-fastapi-instrumentator,
cupy,
writeShellScript,
gguf,
einops,
importlib-metadata,
partial-json-parser,
compressed-tensors,
mistral-common,
msgspec,
numactl,
tokenizers,
oneDNN,
+ blake3,
+ depyf,
+ opencv-python-headless,
config,
cudaSupport ? config.cudaSupport,
cudaPackages ? { },
# Has to be either rocm or cuda, default to the free one
- rocmSupport ? !config.cudaSupport,
+ rocmSupport ? config.rocmSupport,
rocmPackages ? { },
gpuTargets ? [ ],
}@args:
let
inherit (lib)
lists
strings
trivial
;
inherit (cudaPackages) cudaFlags;
shouldUsePkg =
pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;
# see CMakeLists.txt, grepping for GIT_TAG near cutlass
# https://github.com/vllm-project/vllm/blob/${version}/CMakeLists.txt
cutlass = fetchFromGitHub {
owner = "NVIDIA";
repo = "cutlass";
rev = "refs/tags/v3.5.0";
sha256 = "sha256-D/s7eYsa5l/mfx73tE4mnFcTQdYqGmXa9d9TCryw4e4=";
tag = "v3.7.0";
hash = "sha256-GUTRXmv3DiM/GN5Bvv2LYovMLKZMlMhoKv4O0g627gs=";
};
vllm-flash-attn = stdenv.mkDerivation rec {
pname = "vllm-flash-attn";
version = "2.6.2";
# see CMakeLists.txt, grepping for GIT_TAG near vllm-flash-attn
# https://github.com/vllm-project/vllm/blob/${version}/CMakeLists.txt
src = fetchFromGitHub {
owner = "vllm-project";
repo = "flash-attention";
rev = "d4e09037abf588af1ec47d0e966b237ee376876c";
hash = "sha256-KFEsZlrwvCgvPzQ/pCLWcnbGq89mWE3yTDdtJSV9MII=";
};
dontConfigure = true;
# vllm-flash-attn normally relies on `git submodule update` to fetch cutlass
buildPhase = ''
rm -rf csrc/cutlass
ln -sf ${cutlass} csrc/cutlass
'';
installPhase = ''
cp -rva . $out
'';
};
cpuSupport = !cudaSupport && !rocmSupport;
# https://github.com/pytorch/pytorch/blob/v2.4.0/torch/utils/cpp_extension.py#L1953
supportedTorchCudaCapabilities =
let
real = [
"3.5"
"3.7"
"5.0"
"5.2"
"5.3"
"6.0"
"6.1"
"6.2"
"7.0"
"7.2"
"7.5"
"8.0"
"8.6"
"8.7"
"8.9"
"9.0"
"9.0a"
];
ptx = lists.map (x: "${x}+PTX") real;
in
real ++ ptx;
# NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
# of the first list *from* the second list. That means:
# lists.subtractLists a b = b - a
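# For example: lists.subtractLists [ "9.0a" ] [ "8.6" "9.0a" ] evaluates to [ "8.6" ].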
# For CUDA
supportedCudaCapabilities = lists.intersectLists cudaFlags.cudaCapabilities supportedTorchCudaCapabilities;
unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities cudaFlags.cudaCapabilities;
isCudaJetson = cudaSupport && cudaPackages.cudaFlags.isJetsonBuild;
# Use trivial.throwIf to fail evaluation if no supported GPU targets remain, listing the unsupported ones that were requested.
gpuArchWarner =
supported: unsupported:
trivial.throwIf (supported == [ ]) (
"No supported GPU targets specified. Requested GPU targets: "
+ strings.concatStringsSep ", " unsupported
) supported;
# Create the gpuTargetString.
gpuTargetString = strings.concatStringsSep ";" (
if gpuTargets != [ ] then
# If gpuTargets is specified, it always takes priority.
gpuTargets
else if cudaSupport then
gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
else if rocmSupport then
rocmPackages.clr.gpuTargets
else
throw "No GPU targets specified"
);
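# e.g. with cudaCapabilities = [ "8.6" "8.9" ] and no explicit gpuTargets, this evaluates to "8.6;8.9".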
mergedCudaLibraries = with cudaPackages; [
cuda_cudart # cuda_runtime.h, -lcudart
cuda_cccl
libcusparse # cusparse.h
libcusolver # cusolverDn.h
cuda_nvtx
cuda_nvrtc
libcublas
];
# Some packages are not available on all platforms
nccl = shouldUsePkg (cudaPackages.nccl or null);
getAllOutputs = p: [
(lib.getBin p)
(lib.getLib p)
(lib.getDev p)
];
in
buildPythonPackage rec {
pname = "vllm";
version = "0.6.2";
version = "0.7.1";
pyproject = true;
stdenv = if cudaSupport then cudaPackages.backendStdenv else args.stdenv;
@@ -65,30 +203,54 @@ buildPythonPackage rec {
src = fetchFromGitHub {
owner = "vllm-project";
repo = pname;
rev = "refs/tags/v${version}";
hash = "sha256-zUkqAPPhDRdN9rDQ2biCl1B+trV0xIHXub++v9zsQGo=";
tag = "v${version}";
hash = "sha256-CImXKMEv+jHqngvcr8W6fQLiCo1mqmcZ0Ho0bfAgfbg=";
};
patches = [
- ./0001-setup.py-don-t-ask-for-hipcc-version.patch
./0002-setup.py-nix-support-respect-cmakeFlags.patch
+ ./0003-propagate-pythonpath.patch
+ ./0004-drop-lsmod.patch
];
# Ignore the python version check because it hard-codes minor versions and
# lags behind `ray`'s python interpreter support
-  postPatch = ''
-    substituteInPlace CMakeLists.txt \
-      --replace-fail \
-        'set(PYTHON_SUPPORTED_VERSIONS' \
-        'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
-  '';
+  postPatch =
+    ''
+      substituteInPlace CMakeLists.txt \
+        --replace-fail \
+          'set(PYTHON_SUPPORTED_VERSIONS' \
+          'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
-  nativeBuildInputs = [
-    cmake
-    ninja
-    pythonRelaxDepsHook
-    which
-  ] ++ lib.optionals rocmSupport [ rocmPackages.hipcc ];
+      # Relax torch dependency manually because the nonstandard requirements format
+      # is not caught by pythonRelaxDeps
+      substituteInPlace requirements*.txt pyproject.toml \
+        --replace-warn 'torch==2.5.1' 'torch==${lib.getVersion torch}' \
+        --replace-warn 'torch == 2.5.1' 'torch == ${lib.getVersion torch}'
+    ''
+    + lib.optionalString (nccl == null) ''
+      # On platforms where NCCL is not supported (e.g. Jetson), substitute Gloo (provided by Torch)
+      substituteInPlace vllm/distributed/parallel_state.py \
+        --replace-fail '"nccl"' '"gloo"'
+    '';
+  nativeBuildInputs =
+    [
+      cmake
+      ninja
+      pythonRelaxDepsHook
+      which
+    ]
+    ++ lib.optionals rocmSupport [
+      rocmPackages.hipcc
+    ]
+    ++ lib.optionals cudaSupport [
+      cudaPackages.cuda_nvcc
+      autoAddDriverRunpath
+    ]
+    ++ lib.optionals isCudaJetson [
+      cudaPackages.autoAddCudaCompatRunpath
+    ];
build-system = [
packaging
@@ -97,18 +259,22 @@ buildPythonPackage rec {
];
buildInputs =
-    (lib.optionals cudaSupport (
-      with cudaPackages;
-      [
-        cuda_cudart # cuda_runtime.h, -lcudart
-        cuda_cccl
-        libcusparse # cusparse.h
-        libcusolver # cusolverDn.h
-        cuda_nvcc
-        cuda_nvtx
-        libcublas
-      ]
-    ))
+    [
+      setuptools-scm
+      torch
+    ]
+    ++ (lib.optionals cpuSupport ([
+      numactl
+      oneDNN
+    ]))
+    ++ (lib.optionals cudaSupport (
+      mergedCudaLibraries
+      ++ (with cudaPackages; [
+        nccl
+        cudnn
+        libcufile
+      ])
+    ))
++ (lib.optionals rocmSupport (
with rocmPackages;
[
@@ -123,10 +289,13 @@ buildPythonPackage rec {
dependencies =
[
aioprometheus
+ blake3
+ depyf
fastapi
lm-format-enforcer
numpy
openai
+ opencv-python-headless
outlines
pandas
prometheus-fastapi-instrumentator
@@ -138,27 +307,64 @@ buildPythonPackage rec {
ray
sentencepiece
tiktoken
tokenizers
msgspec
gguf
einops
importlib-metadata
partial-json-parser
compressed-tensors
mistral-common
torch
torchaudio
torchvision
transformers
uvicorn
xformers
+ xgrammar
]
++ uvicorn.optional-dependencies.standard
++ aioprometheus.optional-dependencies.starlette
++ lib.optionals cudaSupport [
cupy
pynvml
];
dontUseCmakeConfigure = true;
-  cmakeFlags = [ (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}") ];
+  cmakeFlags =
+    [
+      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
+      (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn}")
+    ]
+    ++ lib.optionals cudaSupport [
+      (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
+      (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.cudaFlags.cmakeCudaArchitecturesString}")
+      (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
+        name = "cuda-merged-${cudaPackages.cudaVersion}";
+        paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
+      }}")
+      (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
+      (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
+      (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
+    ]
+    ++ lib.optionals cpuSupport [
+      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
+    ];
env =
-    lib.optionalAttrs cudaSupport { CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}"; }
+    lib.optionalAttrs cudaSupport {
+      VLLM_TARGET_DEVICE = "cuda";
+      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
+    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
+    // lib.optionalAttrs cpuSupport {
+      VLLM_TARGET_DEVICE = "cpu";
+    };
pythonRelaxDeps = true;
@@ -174,8 +380,8 @@ buildPythonPackage rec {
happysalada
lach
];
-    # RuntimeError: Unknown runtime environment
-    broken = true;
-    # broken = !cudaSupport && !rocmSupport;
+    # CPU support relies on unpackaged dependency intel_extension_for_pytorch
+    broken = cpuSupport;
};
}
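To exercise the new device selection locally, a minimal sketch (assuming a nixpkgs checkout; the config flags are the standard nixpkgs ones, the expression itself is not part of this commit):

let
  # Pick the CUDA branch; rocmSupport = true would select ROCm instead,
  # and leaving both off yields the (marked broken) CPU build.
  pkgs = import ./. {
    config = {
      allowUnfree = true;
      cudaSupport = true;
    };
  };
in
pkgs.python3Packages.vllm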

pkgs/development/python-modules/xgrammar/default.nix (new)

@@ -0,0 +1,77 @@
{
lib,
stdenv,
buildPythonPackage,
fetchPypi,
python,
pythonOlder,
pythonAtLeast,
pydantic,
sentencepiece,
tiktoken,
torch,
transformers,
triton,
}:
let
pyShortVersion = "cp" + builtins.replaceStrings [ "." ] [ "" ] python.pythonVersion;
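# e.g. Python 3.12 -> "cp312"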
platforms = rec {
aarch64-darwin = "macosx_13_0_arm64";
x86_64-darwin = "macosx_10_15_x86_64";
x86_64-linux = "manylinux_2_27_x86_64.manylinux_2_28_x86_64";
};
platform = platforms.${stdenv.system} or (throw "Unsupported system: ${stdenv.system}");
# hashes retrieved via the following command
# curl https://pypi.org/pypi/xgrammar/${version}/json | jq -r '.urls[] | "\(.digests.sha256) \(.filename)"'
hashes = rec {
cp39-aarch64-darwin = "12dd579a7073c14981e01aeee566d20e60001bf90af23024e0e6692a770ff535";
cp39-x86_64-darwin = "035ec93306543b99bf2141dcc7f1a6dd0c255753fc8b5a2b5f3289a59fed8e37";
cp39-x86_64-linux = "3b3975dcf4b3ed7b16bbe3c068738b09847f841793e1c5e1b4a07dff36bbdc37";
cp310-aarch64-darwin = "93bb6c10cbdf1a2bda3b458d97b47436657d780f98dccf3d266e17e13568c0a9";
cp310-x86_64-darwin = "5ed31db2669dc499d9d29bb16f30b3395332ff9d0fb80b759697190a5ef5258b";
cp310-x86_64-linux = "9c6f571121e4af45e3b5dc55f3dadd751cffff1f85f1c6fc5c4276db2bbed222";
cp311-aarch64-darwin = "b293443725eddad31cf7b407bb24d5f3156c4b12a2c8041743cb7068a69fadcb";
cp311-x86_64-darwin = "b2106bceb2ce313628af915f2c2b1c9865612026dd3c9feddbfcc69e4ee6c971";
cp311-x86_64-linux = "7934c968371d55759cac35be3b218cdf4b13f323f535ea0faa233240bab803b9";
cp312-aarch64-darwin = "561f8d4307db8cf5d3c3b3ff46eda6d95379f6e801278dbf9153a9d5e8b6126c";
cp312-x86_64-darwin = "6ac3cbb0a82a3a9d07f0739f63b2e26cbef7855149d236057dcc7fee74b37970";
cp312-x86_64-linux = "1854d0fe6b908a3d2d42251a62e627224dbf6035a4322b844b1b5a277e3d0461";
};
hash =
hashes."${pyShortVersion}-${stdenv.system}"
or (throw "Unsupported Python version: ${python.pythonVersion}");
in
buildPythonPackage rec {
pname = "xgrammar";
version = "0.1.11";
format = "wheel";
disabled = pythonOlder "3.9" || pythonAtLeast "3.13";
src = fetchPypi {
inherit pname version format;
dist = pyShortVersion;
python = pyShortVersion;
abi = pyShortVersion;
platform = platform;
sha256 = hash;
};
pythonImportsCheck = [ "xgrammar" ];
dependencies = [
pydantic
sentencepiece
tiktoken
torch
transformers
triton
];
meta = with lib; {
description = "Efficient, Flexible and Portable Structured Generation";
homepage = "https://xgrammar.mlc.ai";
license = licenses.asl20;
};
}

pkgs/top-level/all-packages.nix

@@ -10744,6 +10744,8 @@ with pkgs;
openexr = openexr_3;
};
+ vllm = with python3Packages; toPythonApplication vllm;
vte-gtk4 = vte.override {
gtkVersion = "4";
};

pkgs/top-level/python-packages.nix

@@ -18365,6 +18365,8 @@ self: super: with self; {
inherit (pkgs) xgboost;
};
+ xgrammar = callPackage ../development/python-modules/xgrammar { };
xhtml2pdf = callPackage ../development/python-modules/xhtml2pdf { };
xiaomi-ble = callPackage ../development/python-modules/xiaomi-ble { };