diff --git a/pkgs/development/python-modules/warp-lang/default.nix b/pkgs/development/python-modules/warp-lang/default.nix
index b544581e9055..20b91f411d74 100644
--- a/pkgs/development/python-modules/warp-lang/default.nix
+++ b/pkgs/development/python-modules/warp-lang/default.nix
@@ -1,20 +1,24 @@
 {
-  config,
-  lib,
-  stdenv,
+  autoAddDriverRunpath,
   buildPythonPackage,
-  fetchurl,
-  fetchFromGitHub,
-  replaceVars,
-  build,
-  setuptools,
-  numpy,
-  llvmPackages,
+  config,
   cudaPackages,
-  unittestCheckHook,
+  fetchFromGitHub,
+  fetchurl,
   jax,
+  lib,
+  llvmPackages,
+  numpy,
+  pkgsBuildHost,
+  python,
+  replaceVars,
+  runCommand,
+  setuptools,
+  stdenv,
   torch,
-  nix-update-script,
+  warp-lang, # Self-reference to this package for passthru.tests
+  writableTmpDirAsHomeHook,
+  writeShellApplication,
 
   # Use standalone LLVM-based JIT compiler and CPU device support
   standaloneSupport ? true,
@@ -25,63 +29,69 @@
   # Build Warp with MathDx support (requires CUDA support)
   # Most linear-algebra tile operations like tile_cholesky(), tile_fft(),
   # and tile_matmul() require Warp to be built with the MathDx library.
-  libmathdxSupport ? cudaSupport && stdenv.hostPlatform.isLinux,
-}:
-
+  # libmathdxSupport ? cudaSupport && stdenv.hostPlatform.isLinux,
+  libmathdxSupport ? cudaSupport,
+}@args:
+assert libmathdxSupport -> cudaSupport;
 let
+  effectiveStdenv = if cudaSupport then cudaPackages.backendStdenv else args.stdenv;
+  stdenv = builtins.throw "Use effectiveStdenv instead of stdenv directly, as it may be replaced by cudaPackages.backendStdenv";
+
   version = "1.7.2.post1";
 
-  libmathdx = stdenv.mkDerivation (finalAttrs: {
+  libmathdx = effectiveStdenv.mkDerivation (finalAttrs: {
+    # NOTE: The version used should match the version Warp requires:
+    # https://github.com/NVIDIA/warp/blob/4ad209076ce09668b18dedc74dce0d5cf8b9e409/deps/libmathdx-deps.packman.xml
     pname = "libmathdx";
-    version = "0.2.0";
+    version = "0.1.2";
+
+    outputs = [
+      "out"
+      "static"
+    ];
 
     src =
       let
-        inherit (stdenv.hostPlatform) system;
-        selectSystem = attrs: attrs.${system} or (throw "Unsupported system: ${system}");
-
-        suffix = selectSystem {
-          x86_64-linux = "Linux-x86_64";
-          aarch64-linux = "Linux-aarch64";
-          x86_64-windows = "win32-x86_64";
-        };
-
-        # nix-hash --type sha256 --to-sri $(nix-prefetch-url "https://...")
-        hash = selectSystem {
-          x86_64-linux = "sha256-Lk+PxWFvyQGRClFdmyuo4y7HBdR7pigOhMyEzajqbmg=";
-          aarch64-linux = "sha256-6tH9YH98kSvDiut9rQEU5potEpeKqma/QtrCHLxwRLo=";
-          x86_64-windows = "sha256-B8qwj7UzOXEDZh2oT3ip1qW0uqtygMsyfcbhh5Dgc8U=";
+        baseURL = "https://developer.download.nvidia.com/compute/cublasdx/redist/cublasdx";
+        name = lib.concatStringsSep "-" [
+          finalAttrs.pname
+          "Linux"
+          effectiveStdenv.hostPlatform.parsed.cpu.name
+          finalAttrs.version
+        ];
+        hashes = {
+          aarch64-linux = "sha256-7HEXfzxPF62q/7pdZidj4eO09u588yxcpSu/bWot/9A=";
+          x86_64-linux = "sha256-MImBFv+ooRSUqdL/YEe/bJIcVBnHMCk7SLS5eSeh0cQ=";
         };
       in
-      fetchurl {
-        url = "https://developer.nvidia.com/downloads/compute/cublasdx/redist/cublasdx/libmathdx-${suffix}-${finalAttrs.version}.tar.gz";
-        inherit hash;
-      };
-
-    unpackPhase = ''
-      runHook preUnpack
-
-      mkdir unpacked
-      cd unpacked
-      tar -xzf $src
-      export sourceRoot=$(pwd)
-
-      runHook postUnpack
-    '';
+      lib.mapNullable (
+        hash:
+        fetchurl {
+          inherit hash name;
+          url = "${baseURL}/${name}.tar.gz";
+        }
+      ) (hashes.${effectiveStdenv.hostPlatform.system} or null);
 
+    dontUnpack = true;
     dontConfigure = true;
     dontBuild = true;
 
+    # NOTE: The leading component is stripped because the 0.1.2 release is within the `libmathdx` directory.
     installPhase = ''
       runHook preInstall
 
-      cp -rT "$sourceRoot" "$out"
+      mkdir -p "$out"
+      tar -xzf "$src" --strip-components=1 -C "$out"
+
+      mkdir -p "$static"
+      moveToOutput "lib/libmathdx_static.a" "$static"
 
       runHook postInstall
     '';
 
     meta = {
       description = "library used to integrate cuBLASDx and cuFFTDx into Warp";
+      homepage = "https://developer.nvidia.com/cublasdx-downloads";
       sourceProvenance = with lib.sourceTypes; [ binaryNativeCode ];
       license = with lib.licenses; [
         # By downloading and using the software, you agree to fully
@@ -104,7 +114,10 @@ let
         # license:
         mit
       ];
-      platforms = with lib.platforms; linux ++ [ "x86_64-windows" ];
+      platforms = [
+        "aarch64-linux"
+        "x86_64-linux"
+      ];
       maintainers = with lib.maintainers; [ yzx9 ];
     };
   });
@@ -114,6 +127,13 @@ buildPythonPackage {
   inherit version;
   pyproject = true;
 
+  # TODO(@connorbaker): Some CUDA setup hook is failing when __structuredAttrs is false,
+  # causing a bunch of missing math symbols (like expf) when linking against the static library
+  # provided by NVCC.
+  __structuredAttrs = true;
+
+  stdenv = effectiveStdenv;
+
   src = fetchFromGitHub {
     owner = "NVIDIA";
     repo = "warp";
@@ -122,7 +142,7 @@ buildPythonPackage {
   };
 
   patches =
-    lib.optionals stdenv.hostPlatform.isDarwin [
+    lib.optionals effectiveStdenv.hostPlatform.isDarwin [
       (replaceVars ./darwin-libcxx.patch {
         LIBCXX_DEV = llvmPackages.libcxx.dev;
         LIBCXX_LIB = llvmPackages.libcxx;
@@ -140,22 +160,69 @@ buildPythonPackage {
     ];
 
   postPatch =
-    lib.optionalString (!stdenv.cc.isGNU) ''
-      substituteInPlace warp/build_dll.py \
-        --replace-fail "g++" "${lib.getExe stdenv.cc}"
+    # Patch build_dll.py to use our gencode flags rather than NVIDIA's very broad defaults.
+    # NOTE: After 1.7.2, patching will need to be updated like this:
+    # https://github.com/ConnorBaker/cuda-packages/blob/2fc8ba8c37acee427a94cdd1def55c2ec701ad82/pkgs/development/python-modules/warp/default.nix#L56-L65
+    lib.optionalString cudaSupport ''
+      nixLog "patching $PWD/warp/build_dll.py to use our gencode flags"
+      substituteInPlace "$PWD/warp/build_dll.py" \
+        --replace-fail \
+          'nvcc_opts = gencode_opts + [' \
+          'nvcc_opts = [ ${
+            lib.concatMapStringsSep ", " (gencodeString: ''"${gencodeString}"'') cudaPackages.flags.gencode
+          }, '
+    ''
+    # Patch build_dll.py to use dynamic libraries rather than static ones.
+    # NOTE: We do not patch the `nvptxcompiler_static` path because it is not available as a dynamic library.
+    + lib.optionalString cudaSupport ''
+      nixLog "patching $PWD/warp/build_dll.py to use dynamic libraries"
+      substituteInPlace "$PWD/warp/build_dll.py" \
+        --replace-fail \
+          '-lcudart_static' \
+          '-lcudart' \
+        --replace-fail \
+          '-lnvrtc_static' \
+          '-lnvrtc' \
+        --replace-fail \
+          '-lnvrtc-builtins_static' \
+          '-lnvrtc-builtins' \
+        --replace-fail \
+          '-lnvJitLink_static' \
+          '-lnvJitLink' \
+        --replace-fail \
+          '-lmathdx_static' \
+          '-lmathdx'
+    ''
+    + ''
+      nixLog "patching $PWD/warp/build_dll.py to use our C++ compiler"
+      substituteInPlace "$PWD/warp/build_dll.py" \
+        --replace-fail "g++" "c++"
     ''
     # Broken tests on aarch64. Since unittest doesn't support disabling a
     # single test, and pytest isn't compatible, we patch the test file directly
     # instead.
     #
     # See: https://github.com/NVIDIA/warp/issues/552
-    + lib.optionalString stdenv.hostPlatform.isAarch64 ''
-      substituteInPlace warp/tests/test_fem.py \
-        --replace-fail "add_function_test(TestFem, \"test_integrate_gradient\", test_integrate_gradient, devices=devices)" ""
+    + lib.optionalString effectiveStdenv.hostPlatform.isAarch64 ''
+      nixLog "patching $PWD/warp/tests/test_fem.py to disable broken tests on aarch64"
+      substituteInPlace "$PWD/warp/tests/test_fem.py" \
+        --replace-fail \
+          'add_function_test(TestFem, "test_integrate_gradient", test_integrate_gradient, devices=devices)' \
+          ""
+    ''
+    # These tests fail on CPU and CUDA.
+    + ''
+      nixLog "patching $PWD/warp/tests/test_reload.py to disable broken tests"
+      substituteInPlace "$PWD/warp/tests/test_reload.py" \
+        --replace-fail \
+          'add_function_test(TestReload, "test_reload", test_reload, devices=devices)' \
+          "" \
+        --replace-fail \
+          'add_function_test(TestReload, "test_reload_references", test_reload_references, devices=get_test_devices("basic"))' \
+          ""
     '';
 
   build-system = [
-    build
     setuptools
   ];
 
@@ -163,11 +230,11 @@ buildPythonPackage {
     numpy
   ];
 
-  nativeBuildInputs = lib.optionals libmathdxSupport [
-    libmathdx
-    cudaPackages.libcublas
-    cudaPackages.libcufft
-    cudaPackages.libnvjitlink
+  # NOTE: While normally we wouldn't include autoAddDriverRunpath for packages built from source, since Warp
+  # will be loading GPU drivers at runtime, we need to inject the path to our video drivers.
+  nativeBuildInputs = lib.optionals cudaSupport [
+    autoAddDriverRunpath
+    cudaPackages.cuda_nvcc
   ];
 
   buildInputs =
@@ -177,10 +244,18 @@ buildPythonPackage {
       llvmPackages.libcxx
     ]
     ++ lib.optionals cudaSupport [
-      cudaPackages.cudatoolkit
+      (lib.getOutput "static" cudaPackages.cuda_nvcc) # dependency on nvptxcompiler_static; no dynamic version available
+      cudaPackages.cuda_cccl
       cudaPackages.cuda_cudart
       cudaPackages.cuda_nvcc
       cudaPackages.cuda_nvrtc
+    ]
+    ++ lib.optionals libmathdxSupport [
+      libmathdx
+      cudaPackages.libcublas
+      cudaPackages.libcufft
+      cudaPackages.libcusolver
+      cudaPackages.libnvjitlink
     ];
 
   preBuild =
@@ -190,7 +265,8 @@ buildPythonPackage {
           "--no_standalone"
         ]
         ++ lib.optionals cudaSupport [
-          "--cuda_path=${cudaPackages.cudatoolkit}"
+          # NOTE: The `cuda_path` argument is the directory which contains `bin/nvcc` (i.e., the bin output).
+          "--cuda_path=${lib.getBin pkgsBuildHost.cudaPackages.cuda_nvcc}"
         ]
         ++ lib.optionals libmathdxSupport [
           "--libmathdx"
@@ -203,34 +279,102 @@ buildPythonPackage {
       buildOptionString = lib.concatStringsSep " " buildOptions;
     in
     ''
-      python build_lib.py ${buildOptionString}
+      nixLog "running $PWD/build_lib.py to create components necessary to build the wheel"
+      "${python.pythonOnBuildForHost.interpreter}" "$PWD/build_lib.py" ${buildOptionString}
     '';
 
   pythonImportsCheck = [
     "warp"
   ];
 
-  # Many unit tests fail with segfaults on aarch64-linux, especially in the sim
-  # and grad modules. However, other functionality generally works, so we don't
-  # mark the package as broken.
-  #
-  # See: https://www.github.com/NVIDIA/warp/issues/{356,372,552}
-  doCheck = !(stdenv.hostPlatform.isAarch64 && stdenv.hostPlatform.isLinux);
+  # See passthru.tests.
+  doCheck = false;
 
-  nativeCheckInputs = [
-    unittestCheckHook
-    (jax.override { inherit cudaSupport; })
-    (torch.override { inherit cudaSupport; })
+  passthru = {
+    # Make libmathdx available for introspection.
+    inherit libmathdx;
 
-    # # Disable paddlepaddle interop tests: malloc(): unaligned tcache chunk detected
-    #  (paddlepaddle.override { inherit cudaSupport; })
-  ];
+    # Scripts which provide test packages and implement test logic.
+    testers.unit-tests = writeShellApplication {
+      name = "warp-lang-unit-tests";
+      runtimeInputs = [
+        # Use the references from args
+        (python.withPackages (_: [
+          warp-lang
+          jax
+          torch
+        ]))
+        # Disable paddlepaddle interop tests: malloc(): unaligned tcache chunk detected
+        #  (paddlepaddle.override { inherit cudaSupport; })
+      ];
+      text = ''
+        python3 -m warp.tests
+      '';
+    };
 
-  preCheck = ''
-    export WARP_CACHE_PATH=$(mktemp -d) # warp.config.kernel_cache_dir
-  '';
+    # Tests run within the Nix sandbox.
+    tests =
+      let
+        mkUnitTests =
+          {
+            cudaSupport,
+            libmathdxSupport,
+          }:
+          let
+            name =
+              "warp-lang-unit-tests-cpu" # CPU is baseline
+              + lib.optionalString cudaSupport "-cuda"
+              + lib.optionalString libmathdxSupport "-libmathdx";
 
-  passthru.updateScript = nix-update-script { };
+            warp-lang' = warp-lang.override {
+              inherit cudaSupport libmathdxSupport;
+              # Make sure the warp-lang provided through callPackage is replaced with the override we're making.
+              warp-lang = warp-lang';
+            };
+          in
+          runCommand name
+            {
+              nativeBuildInputs = [
+                warp-lang'.passthru.testers.unit-tests
+                writableTmpDirAsHomeHook
+              ];
+              requiredSystemFeatures = lib.optionals cudaSupport [ "cuda" ];
+              # Many unit tests fail with segfaults on aarch64-linux, especially in the sim
+              # and grad modules. However, other functionality generally works, so we don't
+              # mark the package as broken.
+              #
+              # See: https://www.github.com/NVIDIA/warp/issues/{356,372,552}
+              meta.broken = effectiveStdenv.hostPlatform.isAarch64 && effectiveStdenv.hostPlatform.isLinux;
+            }
+            ''
+              nixLog "running ${name}"
+
+              if warp-lang-unit-tests; then
+                nixLog "${name} passed"
+                touch "$out"
+              else
+                nixErrorLog "${name} failed"
+                exit 1
+              fi
+            '';
+      in
+      {
+        cpu = mkUnitTests {
+          cudaSupport = false;
+          libmathdxSupport = false;
+        };
+        cuda = {
+          cudaOnly = mkUnitTests {
+            cudaSupport = true;
+            libmathdxSupport = false;
+          };
+          cudaWithLibmathDx = mkUnitTests {
+            cudaSupport = true;
+            libmathdxSupport = true;
+          };
+        };
+      };
+  };
 
   meta = {
     description = "Python framework for high performance GPU simulation and graphics";