python3Packages.vllm: 0.6.2 -> 0.7.1 (#379165)

Pavol Rusnak, 2025-02-04 15:09:02 +01:00, committed by GitHub
commit ab36ef17c8
GPG Key ID: B5690EEEBB952194
8 changed files with 361 additions and 66 deletions

View File

@@ -6,6 +6,7 @@
   numpy,
   pydantic,
   jsonschema,
+  opencv-python-headless,
   sentencepiece,
   typing-extensions,
   tiktoken,
@@ -37,6 +38,7 @@ buildPythonPackage rec {
     numpy
     pydantic
     jsonschema
+    opencv-python-headless
     sentencepiece
     typing-extensions
     tiktoken

View File

@ -1,24 +0,0 @@
From f6a7748bee79fc2e1898968fef844daacfa7860b Mon Sep 17 00:00:00 2001
From: SomeoneSerge <else@someonex.net>
Date: Wed, 31 Jul 2024 12:02:53 +0000
Subject: [PATCH 1/2] setup.py: don't ask for hipcc --version
---
setup.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/setup.py b/setup.py
index 72ef26f1..01e006f9 100644
--- a/setup.py
+++ b/setup.py
@@ -279,6 +279,7 @@ def _install_punica() -> bool:
def get_hipcc_rocm_version():
+ return "0.0" # `hipcc --version` misbehaves ("unresolved paths") inside the nix sandbox
# Run the hipcc --version command
result = subprocess.run(['hipcc', '--version'],
stdout=subprocess.PIPE,
--
2.45.1
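
Note: the removed patch used a common trick for sandboxed builds: short-circuit a build-time toolchain probe that cannot run inside the Nix sandbox and return a harmless stub. A minimal Python sketch of that pattern, mirroring the patched function rather than anything in vllm 0.7.1:

    import subprocess

    def get_hipcc_rocm_version():
        # `hipcc --version` misbehaves ("unresolved paths") inside the nix
        # sandbox, so return a stub instead of probing the toolchain.
        return "0.0"
        # The original probe below is left unreachable:
        result = subprocess.run(["hipcc", "--version"],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        return result.stdout.decode()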

View File

@@ -0,0 +1,12 @@
+diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
+index f5a02a5b..e830f987 100644
+--- a/vllm/model_executor/models/registry.py
++++ b/vllm/model_executor/models/registry.py
+@@ -482,6 +482,7 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T:
+     returned = subprocess.run(
+         [sys.executable, "-m", "vllm.model_executor.models.registry"],
+         input=input_bytes,
++        env={'PYTHONPATH': ':'.join(sys.path)},
+         capture_output=True)
+
+     # check if the subprocess is successful
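Note: passing `env=` to subprocess.run() replaces the child's entire environment, and a Nix-built vllm resolves its dependencies through sys.path entries pointing into the store; serializing the parent's sys.path into PYTHONPATH lets the re-executed interpreter see the same modules. A self-contained sketch of the mechanism (the `-c` payload is illustrative):

    import subprocess
    import sys

    # Once `env` is overridden, the child only inherits the parent's module
    # search path if we forward it explicitly.
    result = subprocess.run(
        [sys.executable, "-c", "import sys; print(sys.path)"],
        env={"PYTHONPATH": ":".join(sys.path)},
        capture_output=True,
    )
    print(result.stdout.decode())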

View File

@@ -0,0 +1,18 @@
+--- a/setup.py
++++ b/setup.py
+@@ -340,14 +340,7 @@ def _is_hpu() -> bool:
+         out = subprocess.run(["hl-smi"], capture_output=True, check=True)
+         is_hpu_available = out.returncode == 0
+     except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
+-        if sys.platform.startswith("linux"):
+-            try:
+-                output = subprocess.check_output(
+-                    'lsmod | grep habanalabs | wc -l', shell=True)
+-                is_hpu_available = int(output) > 0
+-            except (ValueError, FileNotFoundError, PermissionError,
+-                    subprocess.CalledProcessError):
+-                pass
++        is_hpu_available = False
+     return is_hpu_available
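
Note: the lsmod fallback is dropped, presumably because grepping lsmod output inspects the build host's loaded kernel modules, which is both unavailable and impure inside the Nix sandbox. A behaviourally equivalent sketch of the patched detection (the surrounding try/except scaffolding is assumed from context):

    import subprocess

    def _is_hpu() -> bool:
        is_hpu_available = False
        try:
            # Trust the `hl-smi` management tool if it exists on PATH ...
            out = subprocess.run(["hl-smi"], capture_output=True, check=True)
            is_hpu_available = out.returncode == 0
        except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
            # ... and otherwise assume no HPU instead of probing the host kernel.
            is_hpu_available = False
        return is_hpu_available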

View File

@@ -5,14 +5,21 @@
   buildPythonPackage,
   pythonRelaxDepsHook,
   fetchFromGitHub,
+  symlinkJoin,
+  autoAddDriverRunpath,
+
+  # build system
+  packaging,
+  setuptools,
+  wheel,
+
+  # dependencies
   which,
   ninja,
   cmake,
-  packaging,
-  setuptools,
+  setuptools-scm,
   torch,
   outlines,
-  wheel,
   psutil,
   ray,
   pandas,
@@ -21,43 +28,174 @@
   numpy,
   transformers,
   xformers,
+  xgrammar,
   fastapi,
   uvicorn,
   pydantic,
   aioprometheus,
-  pynvml,
   openai,
   pyzmq,
   tiktoken,
+  torchaudio,
   torchvision,
   py-cpuinfo,
   lm-format-enforcer,
   prometheus-fastapi-instrumentator,
   cupy,
-  writeShellScript,
+  gguf,
+  einops,
+  importlib-metadata,
+  partial-json-parser,
+  compressed-tensors,
+  mistral-common,
+  msgspec,
+  numactl,
+  tokenizers,
+  oneDNN,
+  blake3,
+  depyf,
+  opencv-python-headless,
 
   config,
   cudaSupport ? config.cudaSupport,
   cudaPackages ? { },
-  # Has to be either rocm or cuda, default to the free one
-  rocmSupport ? !config.cudaSupport,
+  rocmSupport ? config.rocmSupport,
   rocmPackages ? { },
   gpuTargets ? [ ],
 }@args:
 
 let
+  inherit (lib)
+    lists
+    strings
+    trivial
+    ;
+  inherit (cudaPackages) cudaFlags;
+
+  shouldUsePkg =
+    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;
+
+  # see CMakeLists.txt, grepping for GIT_TAG near cutlass
+  # https://github.com/vllm-project/vllm/blob/${version}/CMakeLists.txt
   cutlass = fetchFromGitHub {
     owner = "NVIDIA";
     repo = "cutlass";
-    rev = "refs/tags/v3.5.0";
-    sha256 = "sha256-D/s7eYsa5l/mfx73tE4mnFcTQdYqGmXa9d9TCryw4e4=";
+    tag = "v3.7.0";
+    hash = "sha256-GUTRXmv3DiM/GN5Bvv2LYovMLKZMlMhoKv4O0g627gs=";
   };
+
+  vllm-flash-attn = stdenv.mkDerivation rec {
+    pname = "vllm-flash-attn";
+    version = "2.6.2";
+
+    # see CMakeLists.txt, grepping for GIT_TAG near vllm-flash-attn
+    # https://github.com/vllm-project/vllm/blob/${version}/CMakeLists.txt
+    src = fetchFromGitHub {
+      owner = "vllm-project";
+      repo = "flash-attention";
+      rev = "d4e09037abf588af1ec47d0e966b237ee376876c";
+      hash = "sha256-KFEsZlrwvCgvPzQ/pCLWcnbGq89mWE3yTDdtJSV9MII=";
+    };
+
+    dontConfigure = true;
+
+    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass
+    buildPhase = ''
+      rm -rf csrc/cutlass
+      ln -sf ${cutlass} csrc/cutlass
+    '';
+
+    installPhase = ''
+      cp -rva . $out
+    '';
+  };
+
+  cpuSupport = !cudaSupport && !rocmSupport;
+
+  # https://github.com/pytorch/pytorch/blob/v2.4.0/torch/utils/cpp_extension.py#L1953
+  supportedTorchCudaCapabilities =
+    let
+      real = [
+        "3.5"
+        "3.7"
+        "5.0"
+        "5.2"
+        "5.3"
+        "6.0"
+        "6.1"
+        "6.2"
+        "7.0"
+        "7.2"
+        "7.5"
+        "8.0"
+        "8.6"
+        "8.7"
+        "8.9"
+        "9.0"
+        "9.0a"
+      ];
+      ptx = lists.map (x: "${x}+PTX") real;
+    in
+    real ++ ptx;
+
+  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
+  #   of the first list *from* the second list. That means:
+  #   lists.subtractLists a b = b - a
+
+  # For CUDA
+  supportedCudaCapabilities = lists.intersectLists cudaFlags.cudaCapabilities supportedTorchCudaCapabilities;
+  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities cudaFlags.cudaCapabilities;
+
+  isCudaJetson = cudaSupport && cudaPackages.cudaFlags.isJetsonBuild;
+
+  # Throw an evaluation error (via trivial.throwIf) when none of the requested GPU targets is supported,
+  # listing the unsupported ones in the message.
+  gpuArchWarner =
+    supported: unsupported:
+    trivial.throwIf (supported == [ ]) (
+      "No supported GPU targets specified. Requested GPU targets: "
+      + strings.concatStringsSep ", " unsupported
+    ) supported;
+
+  # Create the gpuTargetString.
+  gpuTargetString = strings.concatStringsSep ";" (
+    if gpuTargets != [ ] then
+      # If gpuTargets is specified, it always takes priority.
+      gpuTargets
+    else if cudaSupport then
+      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
+    else if rocmSupport then
+      rocmPackages.clr.gpuTargets
+    else
+      throw "No GPU targets specified"
+  );
+
+  mergedCudaLibraries = with cudaPackages; [
+    cuda_cudart # cuda_runtime.h, -lcudart
+    cuda_cccl
+    libcusparse # cusparse.h
+    libcusolver # cusolverDn.h
+    cuda_nvtx
+    cuda_nvrtc
+    libcublas
+  ];
+
+  # Some packages are not available on all platforms
+  nccl = shouldUsePkg (cudaPackages.nccl or null);
+
+  getAllOutputs = p: [
+    (lib.getBin p)
+    (lib.getLib p)
+    (lib.getDev p)
+  ];
 in
 
 buildPythonPackage rec {
   pname = "vllm";
-  version = "0.6.2";
+  version = "0.7.1";
   pyproject = true;
 
   stdenv = if cudaSupport then cudaPackages.backendStdenv else args.stdenv;
@@ -65,30 +203,54 @@ buildPythonPackage rec {
   src = fetchFromGitHub {
     owner = "vllm-project";
     repo = pname;
-    rev = "refs/tags/v${version}";
-    hash = "sha256-zUkqAPPhDRdN9rDQ2biCl1B+trV0xIHXub++v9zsQGo=";
+    tag = "v${version}";
+    hash = "sha256-CImXKMEv+jHqngvcr8W6fQLiCo1mqmcZ0Ho0bfAgfbg=";
   };
 
   patches = [
-    ./0001-setup.py-don-t-ask-for-hipcc-version.patch
     ./0002-setup.py-nix-support-respect-cmakeFlags.patch
+    ./0003-propagate-pythonpath.patch
+    ./0004-drop-lsmod.patch
   ];
 
   # Ignore the python version check because it hard-codes minor versions and
   # lags behind `ray`'s python interpreter support
-  postPatch = ''
-    substituteInPlace CMakeLists.txt \
-      --replace-fail \
-        'set(PYTHON_SUPPORTED_VERSIONS' \
-        'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
-  '';
+  postPatch =
+    ''
+      substituteInPlace CMakeLists.txt \
+        --replace-fail \
+          'set(PYTHON_SUPPORTED_VERSIONS' \
+          'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
+
+      # Relax torch dependency manually because the nonstandard requirements format
+      # is not caught by pythonRelaxDeps
+      substituteInPlace requirements*.txt pyproject.toml \
+        --replace-warn 'torch==2.5.1' 'torch==${lib.getVersion torch}' \
+        --replace-warn 'torch == 2.5.1' 'torch == ${lib.getVersion torch}'
+    ''
+    + lib.optionalString (nccl == null) ''
+      # On platforms where NCCL is not supported (e.g. Jetson), substitute Gloo (provided by Torch)
+      substituteInPlace vllm/distributed/parallel_state.py \
+        --replace-fail '"nccl"' '"gloo"'
+    '';
 
-  nativeBuildInputs = [
-    cmake
-    ninja
-    pythonRelaxDepsHook
-    which
-  ] ++ lib.optionals rocmSupport [ rocmPackages.hipcc ];
+  nativeBuildInputs =
+    [
+      cmake
+      ninja
+      pythonRelaxDepsHook
+      which
+    ]
+    ++ lib.optionals rocmSupport [
+      rocmPackages.hipcc
+    ]
+    ++ lib.optionals cudaSupport [
+      cudaPackages.cuda_nvcc
+      autoAddDriverRunpath
+    ]
+    ++ lib.optionals isCudaJetson [
+      cudaPackages.autoAddCudaCompatRunpath
+    ];
 
   build-system = [
     packaging
@@ -97,18 +259,22 @@ buildPythonPackage rec {
   ];
 
   buildInputs =
-    (lib.optionals cudaSupport (
-      with cudaPackages;
-      [
-        cuda_cudart # cuda_runtime.h, -lcudart
-        cuda_cccl
-        libcusparse # cusparse.h
-        libcusolver # cusolverDn.h
-        cuda_nvcc
-        cuda_nvtx
-        libcublas
-      ]
-    ))
+    [
+      setuptools-scm
+      torch
+    ]
+    ++ (lib.optionals cpuSupport ([
+      numactl
+      oneDNN
+    ]))
+    ++ (
+      lib.optionals cudaSupport mergedCudaLibraries
+      ++ (with cudaPackages; [
+        nccl
+        cudnn
+        libcufile
+      ])
+    )
     ++ (lib.optionals rocmSupport (
       with rocmPackages;
       [
@@ -123,10 +289,13 @@ buildPythonPackage rec {
   dependencies =
     [
       aioprometheus
+      blake3
+      depyf
       fastapi
       lm-format-enforcer
       numpy
       openai
+      opencv-python-headless
       outlines
       pandas
       prometheus-fastapi-instrumentator
@@ -138,27 +307,64 @@ buildPythonPackage rec {
       ray
       sentencepiece
       tiktoken
+      tokenizers
+      msgspec
+      gguf
+      einops
+      importlib-metadata
+      partial-json-parser
+      compressed-tensors
+      mistral-common
       torch
+      torchaudio
       torchvision
       transformers
       uvicorn
       xformers
+      xgrammar
     ]
     ++ uvicorn.optional-dependencies.standard
     ++ aioprometheus.optional-dependencies.starlette
     ++ lib.optionals cudaSupport [
       cupy
-      pynvml
     ];
 
   dontUseCmakeConfigure = true;
-  cmakeFlags = [ (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}") ];
+  cmakeFlags =
+    [
+      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
+      (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn}")
+    ]
+    ++ lib.optionals cudaSupport [
+      (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
+      (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.cudaFlags.cmakeCudaArchitecturesString}")
+      (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
+        name = "cuda-merged-${cudaPackages.cudaVersion}";
+        paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
+      }}")
+      (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
+      (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
+      (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
+    ]
+    ++ lib.optionals cpuSupport [
+      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
+    ];
 
   env =
-    lib.optionalAttrs cudaSupport { CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}"; }
+    lib.optionalAttrs cudaSupport {
+      VLLM_TARGET_DEVICE = "cuda";
+      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
+    }
     // lib.optionalAttrs rocmSupport {
+      VLLM_TARGET_DEVICE = "rocm";
       # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
       PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
       ROCM_HOME = "${rocmPackages.clr}";
+    }
+    // lib.optionalAttrs cpuSupport {
+      VLLM_TARGET_DEVICE = "cpu";
     };
 
   pythonRelaxDeps = true;
@@ -174,8 +380,8 @@ buildPythonPackage rec {
       happysalada
       lach
     ];
-    # RuntimeError: Unknown runtime environment
-    broken = true;
-    # broken = !cudaSupport && !rocmSupport;
+    # CPU support relies on unpackaged dependency `intel_extension_for_pytorch`
+    broken = cpuSupport;
   };
 }
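
Note on the GPU-target plumbing added above: the requested CUDA capabilities are intersected with the list Torch can compile for, the leftovers are collected for the error message, an empty intersection aborts evaluation, and the survivors are joined with ";" and handed to TORCH_CUDA_ARCH_LIST. The same logic in Python, with illustrative capability lists standing in for cudaFlags.cudaCapabilities:

    supported_torch = {"7.0", "7.2", "7.5", "8.0", "8.6", "8.7", "8.9", "9.0"}
    requested = ["8.6", "9.0", "10.0"]  # illustrative request

    supported = [c for c in requested if c in supported_torch]
    unsupported = [c for c in requested if c not in supported_torch]

    if not supported:
        raise ValueError("No supported GPU targets specified. "
                         "Requested GPU targets: " + ", ".join(unsupported))

    gpu_target_string = ";".join(supported)  # "8.6;9.0" -> TORCH_CUDA_ARCH_LIST
    print(gpu_target_string)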

View File

@@ -0,0 +1,77 @@
+{
+  lib,
+  stdenv,
+  buildPythonPackage,
+  fetchPypi,
+  python,
+  pythonOlder,
+  pythonAtLeast,
+  pydantic,
+  sentencepiece,
+  tiktoken,
+  torch,
+  transformers,
+  triton,
+}:
+
+let
+  pyShortVersion = "cp" + builtins.replaceStrings [ "." ] [ "" ] python.pythonVersion;
+
+  platforms = rec {
+    aarch64-darwin = "macosx_13_0_arm64";
+    x86_64-darwin = "macosx_10_15_x86_64";
+    x86_64-linux = "manylinux_2_27_x86_64.manylinux_2_28_x86_64";
+  };
+  platform = platforms.${stdenv.system} or (throw "Unsupported system: ${stdenv.system}");
+
+  # hashes retrieved via the following command
+  # curl https://pypi.org/pypi/xgrammar/${version}/json | jq -r '.urls[] | "\(.digests.sha256) \(.filename)"'
+  hashes = rec {
+    cp39-aarch64-darwin = "12dd579a7073c14981e01aeee566d20e60001bf90af23024e0e6692a770ff535";
+    cp39-x86_64-darwin = "035ec93306543b99bf2141dcc7f1a6dd0c255753fc8b5a2b5f3289a59fed8e37";
+    cp39-x86_64-linux = "3b3975dcf4b3ed7b16bbe3c068738b09847f841793e1c5e1b4a07dff36bbdc37";
+    cp310-aarch64-darwin = "93bb6c10cbdf1a2bda3b458d97b47436657d780f98dccf3d266e17e13568c0a9";
+    cp310-x86_64-darwin = "5ed31db2669dc499d9d29bb16f30b3395332ff9d0fb80b759697190a5ef5258b";
+    cp310-x86_64-linux = "9c6f571121e4af45e3b5dc55f3dadd751cffff1f85f1c6fc5c4276db2bbed222";
+    cp311-aarch64-darwin = "b293443725eddad31cf7b407bb24d5f3156c4b12a2c8041743cb7068a69fadcb";
+    cp311-x86_64-darwin = "b2106bceb2ce313628af915f2c2b1c9865612026dd3c9feddbfcc69e4ee6c971";
+    cp311-x86_64-linux = "7934c968371d55759cac35be3b218cdf4b13f323f535ea0faa233240bab803b9";
+    cp312-aarch64-darwin = "561f8d4307db8cf5d3c3b3ff46eda6d95379f6e801278dbf9153a9d5e8b6126c";
+    cp312-x86_64-darwin = "6ac3cbb0a82a3a9d07f0739f63b2e26cbef7855149d236057dcc7fee74b37970";
+    cp312-x86_64-linux = "1854d0fe6b908a3d2d42251a62e627224dbf6035a4322b844b1b5a277e3d0461";
+  };
+  hash =
+    hashes."${pyShortVersion}-${stdenv.system}"
+      or (throw "Unsupported Python version: ${python.pythonVersion}");
+in
+buildPythonPackage rec {
+  pname = "xgrammar";
+  version = "0.1.11";
+  format = "wheel";
+
+  disabled = pythonOlder "3.9" || pythonAtLeast "3.13";
+
+  src = fetchPypi {
+    inherit pname version format;
+    dist = pyShortVersion;
+    python = pyShortVersion;
+    abi = pyShortVersion;
+    platform = platform;
+    sha256 = hash;
+  };
+
+  pythonImportsCheck = [ "xgrammar" ];
+
+  dependencies = [
+    pydantic
+    sentencepiece
+    tiktoken
+    torch
+    transformers
+    triton
+  ];
+
+  meta = with lib; {
+    description = "Efficient, Flexible and Portable Structured Generation";
+    homepage = "https://xgrammar.mlc.ai";
+    license = licenses.asl20;
+  };
+}
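
Note: the per-wheel hash table above has to be refreshed by hand on every version bump. The curl/jq one-liner quoted in the file can be reproduced with only the Python standard library, printing one "sha256 filename" pair per published wheel:

    import json
    import urllib.request

    version = "0.1.11"
    url = f"https://pypi.org/pypi/xgrammar/{version}/json"

    with urllib.request.urlopen(url) as resp:
        data = json.load(resp)

    # Mirrors the jq filter: '.urls[] | "\(.digests.sha256) \(.filename)"'
    for entry in data["urls"]:
        print(entry["digests"]["sha256"], entry["filename"])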

View File

@@ -10744,6 +10744,8 @@ with pkgs;
     openexr = openexr_3;
   };
 
+  vllm = with python3Packages; toPythonApplication vllm;
+
   vte-gtk4 = vte.override {
     gtkVersion = "4";
   };

View File

@@ -18365,6 +18365,8 @@ self: super: with self; {
     inherit (pkgs) xgboost;
   };
 
+  xgrammar = callPackage ../development/python-modules/xgrammar { };
+
   xhtml2pdf = callPackage ../development/python-modules/xhtml2pdf { };
 
   xiaomi-ble = callPackage ../development/python-modules/xiaomi-ble { };