python3Packages.vllm: 0.6.2 -> 0.7.1 (#379165)

commit ab36ef17c8
Pavol Rusnak, 2025-02-04 15:09:02 +01:00, committed by GitHub
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
8 changed files with 361 additions and 66 deletions

pkgs/development/python-modules/mistral-common/default.nix

@@ -6,6 +6,7 @@
numpy,
pydantic,
jsonschema,
+ opencv-python-headless,
sentencepiece,
typing-extensions,
tiktoken,
@@ -37,6 +38,7 @@ buildPythonPackage rec {
numpy
pydantic
jsonschema
+ opencv-python-headless
sentencepiece
typing-extensions
tiktoken
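vllm 0.7.x consumes images through mistral_common[opencv], which is presumably why opencv-python-headless appears both here and in vllm's own dependency list below.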

pkgs/development/python-modules/vllm/0001-setup.py-don-t-ask-for-hipcc-version.patch (deleted)

@@ -1,24 +0,0 @@
From f6a7748bee79fc2e1898968fef844daacfa7860b Mon Sep 17 00:00:00 2001
From: SomeoneSerge <else@someonex.net>
Date: Wed, 31 Jul 2024 12:02:53 +0000
Subject: [PATCH 1/2] setup.py: don't ask for hipcc --version
---
setup.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/setup.py b/setup.py
index 72ef26f1..01e006f9 100644
--- a/setup.py
+++ b/setup.py
@@ -279,6 +279,7 @@ def _install_punica() -> bool:
def get_hipcc_rocm_version():
+ return "0.0" # `hipcc --version` misbehaves ("unresolved paths") inside the nix sandbox
# Run the hipcc --version command
result = subprocess.run(['hipcc', '--version'],
stdout=subprocess.PIPE,
--
2.45.1

pkgs/development/python-modules/vllm/0003-propagate-pythonpath.patch (new)

@@ -0,0 +1,12 @@
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index f5a02a5b..e830f987 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -482,6 +482,7 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T:
returned = subprocess.run(
[sys.executable, "-m", "vllm.model_executor.models.registry"],
input=input_bytes,
+ env={'PYTHONPATH': ':'.join(sys.path)},
capture_output=True)
# check if the subprocess is successful

pkgs/development/python-modules/vllm/0004-drop-lsmod.patch (new)

@@ -0,0 +1,18 @@
--- a/setup.py
+++ b/setup.py
@@ -340,14 +340,7 @@ def _is_hpu() -> bool:
out = subprocess.run(["hl-smi"], capture_output=True, check=True)
is_hpu_available = out.returncode == 0
except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
- if sys.platform.startswith("linux"):
- try:
- output = subprocess.check_output(
- 'lsmod | grep habanalabs | wc -l', shell=True)
- is_hpu_available = int(output) > 0
- except (ValueError, FileNotFoundError, PermissionError,
- subprocess.CalledProcessError):
- pass
+ is_hpu_available = False
return is_hpu_available
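The deleted fallback shelled out to lsmod to look for the habanalabs kernel module; that probe cannot work inside the Nix build sandbox (and is impure in any case), so HPU detection is simply pinned to False.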

pkgs/development/python-modules/vllm/default.nix

@@ -5,14 +5,21 @@
buildPythonPackage,
pythonRelaxDepsHook,
fetchFromGitHub,
+ symlinkJoin,
+ autoAddDriverRunpath,
+ # build system
+ packaging,
+ setuptools,
+ wheel,
+ # dependencies
which,
ninja,
cmake,
- packaging,
- setuptools,
+ setuptools-scm,
torch,
outlines,
- wheel,
psutil,
ray,
pandas,
@@ -21,43 +28,174 @@
numpy,
transformers,
xformers,
+ xgrammar,
fastapi,
uvicorn,
pydantic,
aioprometheus,
pynvml,
openai,
pyzmq,
tiktoken,
torchaudio,
torchvision,
py-cpuinfo,
lm-format-enforcer,
prometheus-fastapi-instrumentator,
cupy,
writeShellScript,
gguf,
einops,
importlib-metadata,
partial-json-parser,
compressed-tensors,
mistral-common,
msgspec,
numactl,
tokenizers,
oneDNN,
+ blake3,
+ depyf,
+ opencv-python-headless,
config,
cudaSupport ? config.cudaSupport,
cudaPackages ? { },
# Has to be either rocm or cuda, default to the free one
- rocmSupport ? !config.cudaSupport,
+ rocmSupport ? config.rocmSupport,
rocmPackages ? { },
gpuTargets ? [ ],
}@args:
let
inherit (lib)
lists
strings
trivial
;
inherit (cudaPackages) cudaFlags;
shouldUsePkg =
pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;
# see CMakeLists.txt, grepping for GIT_TAG near cutlass
# https://github.com/vllm-project/vllm/blob/${version}/CMakeLists.txt
cutlass = fetchFromGitHub {
owner = "NVIDIA";
repo = "cutlass";
rev = "refs/tags/v3.5.0";
sha256 = "sha256-D/s7eYsa5l/mfx73tE4mnFcTQdYqGmXa9d9TCryw4e4=";
tag = "v3.7.0";
hash = "sha256-GUTRXmv3DiM/GN5Bvv2LYovMLKZMlMhoKv4O0g627gs=";
};
vllm-flash-attn = stdenv.mkDerivation rec {
pname = "vllm-flash-attn";
version = "2.6.2";
# see CMakeLists.txt, grepping for GIT_TAG near vllm-flash-attn
# https://github.com/vllm-project/vllm/blob/${version}/CMakeLists.txt
src = fetchFromGitHub {
owner = "vllm-project";
repo = "flash-attention";
rev = "d4e09037abf588af1ec47d0e966b237ee376876c";
hash = "sha256-KFEsZlrwvCgvPzQ/pCLWcnbGq89mWE3yTDdtJSV9MII=";
};
dontConfigure = true;
# vllm-flash-attn normally relies on `git submodule update` to fetch cutlass
buildPhase = ''
rm -rf csrc/cutlass
ln -sf ${cutlass} csrc/cutlass
'';
installPhase = ''
cp -rva . $out
'';
};
cpuSupport = !cudaSupport && !rocmSupport;
# https://github.com/pytorch/pytorch/blob/v2.4.0/torch/utils/cpp_extension.py#L1953
supportedTorchCudaCapabilities =
let
real = [
"3.5"
"3.7"
"5.0"
"5.2"
"5.3"
"6.0"
"6.1"
"6.2"
"7.0"
"7.2"
"7.5"
"8.0"
"8.6"
"8.7"
"8.9"
"9.0"
"9.0a"
];
ptx = lists.map (x: "${x}+PTX") real;
in
real ++ ptx;
# NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
# of the first list *from* the second list. That means:
# lists.subtractLists a b = b - a
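# For example: lists.subtractLists [ "9.0a" ] [ "8.6" "9.0a" ] evaluates to [ "8.6" ].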
# For CUDA
supportedCudaCapabilities = lists.intersectLists cudaFlags.cudaCapabilities supportedTorchCudaCapabilities;
unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities cudaFlags.cudaCapabilities;
isCudaJetson = cudaSupport && cudaPackages.cudaFlags.isJetsonBuild;
# Use trivial.throwIf to fail evaluation if no supported GPU targets remain, listing the unsupported ones that were requested.
gpuArchWarner =
supported: unsupported:
trivial.throwIf (supported == [ ]) (
"No supported GPU targets specified. Requested GPU targets: "
+ strings.concatStringsSep ", " unsupported
) supported;
# Create the gpuTargetString.
gpuTargetString = strings.concatStringsSep ";" (
if gpuTargets != [ ] then
# If gpuTargets is specified, it always takes priority.
gpuTargets
else if cudaSupport then
gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
else if rocmSupport then
rocmPackages.clr.gpuTargets
else
throw "No GPU targets specified"
);
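# e.g. with cudaCapabilities = [ "8.6" "8.9" ] and no explicit gpuTargets, this evaluates to "8.6;8.9".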
mergedCudaLibraries = with cudaPackages; [
cuda_cudart # cuda_runtime.h, -lcudart
cuda_cccl
libcusparse # cusparse.h
libcusolver # cusolverDn.h
cuda_nvtx
cuda_nvrtc
libcublas
];
# Some packages are not available on all platforms
nccl = shouldUsePkg (cudaPackages.nccl or null);
getAllOutputs = p: [
(lib.getBin p)
(lib.getLib p)
(lib.getDev p)
];
in
buildPythonPackage rec {
pname = "vllm";
version = "0.6.2";
version = "0.7.1";
pyproject = true;
stdenv = if cudaSupport then cudaPackages.backendStdenv else args.stdenv;
@@ -65,30 +203,54 @@ buildPythonPackage rec {
src = fetchFromGitHub {
owner = "vllm-project";
repo = pname;
rev = "refs/tags/v${version}";
hash = "sha256-zUkqAPPhDRdN9rDQ2biCl1B+trV0xIHXub++v9zsQGo=";
tag = "v${version}";
hash = "sha256-CImXKMEv+jHqngvcr8W6fQLiCo1mqmcZ0Ho0bfAgfbg=";
};
patches = [
- ./0001-setup.py-don-t-ask-for-hipcc-version.patch
./0002-setup.py-nix-support-respect-cmakeFlags.patch
+ ./0003-propagate-pythonpath.patch
+ ./0004-drop-lsmod.patch
];
# Ignore the python version check because it hard-codes minor versions and
# lags behind `ray`'s python interpreter support
-  postPatch = ''
-    substituteInPlace CMakeLists.txt \
-      --replace-fail \
-        'set(PYTHON_SUPPORTED_VERSIONS' \
-        'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
-  '';
+  postPatch =
+    ''
+      substituteInPlace CMakeLists.txt \
+        --replace-fail \
+          'set(PYTHON_SUPPORTED_VERSIONS' \
+          'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
-  nativeBuildInputs = [
-    cmake
-    ninja
-    pythonRelaxDepsHook
-    which
-  ] ++ lib.optionals rocmSupport [ rocmPackages.hipcc ];
+      # Relax torch dependency manually because the nonstandard requirements format
+      # is not caught by pythonRelaxDeps
+      substituteInPlace requirements*.txt pyproject.toml \
+        --replace-warn 'torch==2.5.1' 'torch==${lib.getVersion torch}' \
+        --replace-warn 'torch == 2.5.1' 'torch == ${lib.getVersion torch}'
+    ''
+    + lib.optionalString (nccl == null) ''
+      # On platforms where NCCL is not supported (e.g. Jetson), substitute Gloo (provided by Torch)
+      substituteInPlace vllm/distributed/parallel_state.py \
+        --replace-fail '"nccl"' '"gloo"'
+    '';
+  nativeBuildInputs =
+    [
+      cmake
+      ninja
+      pythonRelaxDepsHook
+      which
+    ]
+    ++ lib.optionals rocmSupport [
+      rocmPackages.hipcc
+    ]
+    ++ lib.optionals cudaSupport [
+      cudaPackages.cuda_nvcc
+      autoAddDriverRunpath
+    ]
+    ++ lib.optionals isCudaJetson [
+      cudaPackages.autoAddCudaCompatRunpath
+    ];
build-system = [
packaging
@@ -97,18 +259,22 @@ buildPythonPackage rec {
];
buildInputs =
-    (lib.optionals cudaSupport (
-      with cudaPackages;
-      [
-        cuda_cudart # cuda_runtime.h, -lcudart
-        cuda_cccl
-        libcusparse # cusparse.h
-        libcusolver # cusolverDn.h
-        cuda_nvcc
-        cuda_nvtx
-        libcublas
-      ]
-    ))
+    [
+      setuptools-scm
+      torch
+    ]
+    ++ (lib.optionals cpuSupport ([
+      numactl
+      oneDNN
+    ]))
+    ++ (lib.optionals cudaSupport (
+      mergedCudaLibraries
+      ++ (with cudaPackages; [
+        nccl
+        cudnn
+        libcufile
+      ])
+    ))
++ (lib.optionals rocmSupport (
with rocmPackages;
[
@@ -123,10 +289,13 @@ buildPythonPackage rec {
dependencies =
[
aioprometheus
+ blake3
+ depyf
fastapi
lm-format-enforcer
numpy
openai
+ opencv-python-headless
outlines
pandas
prometheus-fastapi-instrumentator
@@ -138,27 +307,64 @@ buildPythonPackage rec {
ray
sentencepiece
tiktoken
tokenizers
msgspec
gguf
einops
importlib-metadata
partial-json-parser
compressed-tensors
mistral-common
torch
torchaudio
torchvision
transformers
uvicorn
xformers
+ xgrammar
]
++ uvicorn.optional-dependencies.standard
++ aioprometheus.optional-dependencies.starlette
++ lib.optionals cudaSupport [
cupy
pynvml
];
dontUseCmakeConfigure = true;
-  cmakeFlags = [ (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}") ];
+  cmakeFlags =
+    [
+      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
+      (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn}")
+    ]
+    ++ lib.optionals cudaSupport [
+      (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
+      (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.cudaFlags.cmakeCudaArchitecturesString}")
+      (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
+        name = "cuda-merged-${cudaPackages.cudaVersion}";
+        paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
+      }}")
+      (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
+      (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
+      (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
+    ]
+    ++ lib.optionals cpuSupport [
+      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
+    ];
env =
-    lib.optionalAttrs cudaSupport { CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}"; }
+    lib.optionalAttrs cudaSupport {
+      VLLM_TARGET_DEVICE = "cuda";
+      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
+    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
+    // lib.optionalAttrs cpuSupport {
+      VLLM_TARGET_DEVICE = "cpu";
+    };
pythonRelaxDeps = true;
@@ -174,8 +380,8 @@ buildPythonPackage rec {
happysalada
lach
];
-    # RuntimeError: Unknown runtime environment
-    broken = true;
-    # broken = !cudaSupport && !rocmSupport;
+    # CPU support relies on unpackaged dependency intel_extension_for_pytorch
+    broken = cpuSupport;
};
}
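To exercise the new device selection locally, a minimal sketch (assuming a nixpkgs checkout; the config flags are the standard nixpkgs ones, the expression itself is not part of this commit):

let
  # Pick the CUDA branch; rocmSupport = true would select ROCm instead,
  # and leaving both off yields the (marked broken) CPU build.
  pkgs = import ./. {
    config = {
      allowUnfree = true;
      cudaSupport = true;
    };
  };
in
pkgs.python3Packages.vllm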

pkgs/development/python-modules/xgrammar/default.nix (new)

@@ -0,0 +1,77 @@
{
lib,
stdenv,
buildPythonPackage,
fetchPypi,
python,
pythonOlder,
pythonAtLeast,
pydantic,
sentencepiece,
tiktoken,
torch,
transformers,
triton,
}:
let
pyShortVersion = "cp" + builtins.replaceStrings [ "." ] [ "" ] python.pythonVersion;
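# e.g. Python 3.12 -> "cp312"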
platforms = rec {
aarch64-darwin = "macosx_13_0_arm64";
x86_64-darwin = "macosx_10_15_x86_64";
x86_64-linux = "manylinux_2_27_x86_64.manylinux_2_28_x86_64";
};
platform = platforms.${stdenv.system} or (throw "Unsupported system: ${stdenv.system}");
# hashes retrieved via the following command
# curl https://pypi.org/pypi/xgrammar/${version}/json | jq -r '.urls[] | "\(.digests.sha256) \(.filename)"'
hashes = rec {
cp39-aarch64-darwin = "12dd579a7073c14981e01aeee566d20e60001bf90af23024e0e6692a770ff535";
cp39-x86_64-darwin = "035ec93306543b99bf2141dcc7f1a6dd0c255753fc8b5a2b5f3289a59fed8e37";
cp39-x86_64-linux = "3b3975dcf4b3ed7b16bbe3c068738b09847f841793e1c5e1b4a07dff36bbdc37";
cp310-aarch64-darwin = "93bb6c10cbdf1a2bda3b458d97b47436657d780f98dccf3d266e17e13568c0a9";
cp310-x86_64-darwin = "5ed31db2669dc499d9d29bb16f30b3395332ff9d0fb80b759697190a5ef5258b";
cp310-x86_64-linux = "9c6f571121e4af45e3b5dc55f3dadd751cffff1f85f1c6fc5c4276db2bbed222";
cp311-aarch64-darwin = "b293443725eddad31cf7b407bb24d5f3156c4b12a2c8041743cb7068a69fadcb";
cp311-x86_64-darwin = "b2106bceb2ce313628af915f2c2b1c9865612026dd3c9feddbfcc69e4ee6c971";
cp311-x86_64-linux = "7934c968371d55759cac35be3b218cdf4b13f323f535ea0faa233240bab803b9";
cp312-aarch64-darwin = "561f8d4307db8cf5d3c3b3ff46eda6d95379f6e801278dbf9153a9d5e8b6126c";
cp312-x86_64-darwin = "6ac3cbb0a82a3a9d07f0739f63b2e26cbef7855149d236057dcc7fee74b37970";
cp312-x86_64-linux = "1854d0fe6b908a3d2d42251a62e627224dbf6035a4322b844b1b5a277e3d0461";
};
hash =
hashes."${pyShortVersion}-${stdenv.system}"
or (throw "Unsupported Python version: ${python.pythonVersion}");
in
buildPythonPackage rec {
pname = "xgrammar";
version = "0.1.11";
format = "wheel";
disabled = pythonOlder "3.9" || pythonAtLeast "3.13";
src = fetchPypi {
inherit pname version format;
dist = pyShortVersion;
python = pyShortVersion;
abi = pyShortVersion;
platform = platform;
sha256 = hash;
};
pythonImportsCheck = [ "xgrammar" ];
dependencies = [
pydantic
sentencepiece
tiktoken
torch
transformers
triton
];
meta = with lib; {
description = "Efficient, Flexible and Portable Structured Generation";
homepage = "https://xgrammar.mlc.ai";
license = licenses.asl20;
};
}

pkgs/top-level/all-packages.nix

@@ -10744,6 +10744,8 @@ with pkgs;
openexr = openexr_3;
};
+ vllm = with python3Packages; toPythonApplication vllm;
vte-gtk4 = vte.override {
gtkVersion = "4";
};

pkgs/top-level/python-packages.nix

@@ -18365,6 +18365,8 @@ self: super: with self; {
inherit (pkgs) xgboost;
};
+ xgrammar = callPackage ../development/python-modules/xgrammar { };
xhtml2pdf = callPackage ../development/python-modules/xhtml2pdf { };
xiaomi-ble = callPackage ../development/python-modules/xiaomi-ble { };