feat: enable deepgemm jit for fp8 block-scale on SM90 (#1969)

djmmoss · web-flow · commit bf03ad450d23 · 2025-10-25T23:26:36.000-07:00
## 📌 Description Enable JIT compilation for the FP8 DeepGEMM kernels. NVRTC is currently disabled, so NVCC is used by default. ## 🚀 Pull Request Checklist Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete. ### ✅ Pre-commit Checks - [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method). - [x] I have installed the hooks with `pre-commit install`. - [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues. > If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/). ## 🧪 Tests - [x] Tests have been added or updated as needed. - [x] All tests are passing (`unittest`, etc.).  ## Summary by CodeRabbit * **Refactor** * JIT include directory discovery now uses the flashinfer-python package instead of the previous package. * Updated resolved include path to the flashinfer data location. * Runtime compilation now consistently uses NVCC; the prior environment-variable toggle was removed. * Updated warning text when the expected package installation cannot be found.  --------- Signed-off-by: Duncan Moss <djm.moss@gmail.com>
diff --git a/csrc/nv_internal/tensorrt_llm/deep_gemm/compiler.cuh b/csrc/nv_internal/tensorrt_llm/deep_gemm/compiler.cuh
@@ -125,7 +125,7 @@ std::vector<std::filesystem::path> getJitIncludeDirs() {
   static std::vector<std::filesystem::path> includeDirs;
   if (includeDirs.empty()) {
     // Command to execute
-    char const* cmd = "pip show tensorrt_llm 2>/dev/null";
+    char const* cmd = "pip show flashinfer-python 2>/dev/null";
 
     // Buffer to store the output
     std::array<char, 128> buffer;
@@ -174,15 +174,11 @@ std::vector<std::filesystem::path> getJitIncludeDirs() {
         location.erase(location.find_last_not_of(" \n\r\t") + 1);
 
         // Set the include directory based on the package location
-        includeDirs.push_back(std::filesystem::path(location) / "tensorrt_llm" / "include");
-
-        if (!kJitUseNvcc) {
-          includeDirs.push_back(std::filesystem::path(location) / "tensorrt_llm" / "include" /
-                                "cuda" / "include");
-        }
+        includeDirs.push_back(std::filesystem::path(location) / "flashinfer" / "data" / "csrc" /
+                              "nv_internal" / "tensorrt_llm");
       }
     } else {
-      TLLM_LOG_WARNING("Failed to find TensorRT LLM installation, DeepGEMM will be disabled.");
+      TLLM_LOG_WARNING("Failed to find FlashInfer installation, DeepGEMM will be disabled.");
     }
   }
   return includeDirs;
diff --git a/csrc/nv_internal/tensorrt_llm/deep_gemm/runtime.cuh b/csrc/nv_internal/tensorrt_llm/deep_gemm/runtime.cuh
@@ -36,8 +36,13 @@ static bool kJitDebugging = []() {
 }();
 
 static bool kJitUseNvcc = []() {
-  char const* env_var = getenv("TRTLLM_DG_JIT_USE_NVCC");
-  return env_var && (std::string(env_var) == "1" || std::string(env_var) == "true");
+  // char const* env_var = getenv("TRTLLM_DG_JIT_USE_NVCC");
+  // return env_var && (std::string(env_var) == "1" || std::string(env_var) == "true");
+  // always use nvcc
+  // TODO: Enable nvrtc -- need these headers:
+  // [TensorRT-LLM][INFO] Compilation log:
+  // kernel.cu(16): catastrophic error: cannot open source file "cuda_bf16.h"
+  return true;
 }();
 
 static bool kJitDumpCubin = []() {