upd

sgl-project · Dec 31, 2024 · b4fdb37 · b4fdb37
1 parent 7ca0b2e
commit b4fdb37
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 0 deletions.
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
@@ -57,6 +57,7 @@
     monkey_patch_vllm_gguf_config,
     monkey_patch_vllm_p2p_access_check,
     set_cpu_offload_max_bytes,
+    set_cuda_arch,
 )
 
 logger = logging.getLogger(__name__)
@@ -245,6 +246,8 @@ def load_model(self):
                 if torch.cuda.get_device_capability()[1] < 5:
                     raise RuntimeError("SGLang only supports sm75 and above.")
 
+        set_cuda_arch()
+
         # Prepare the model config
         self.load_config = LoadConfig(
             load_format=self.server_args.load_format,

diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
@@ -96,6 +96,13 @@ def is_flashinfer_available():
     return torch.cuda.is_available() and torch.version.cuda
 
 
+def set_cuda_arch():  # Export the current GPU's compute capability as TORCH_CUDA_ARCH_LIST.
+    if is_flashinfer_available():  # No-op when this check is falsy; the env var is then left untouched.
+        capability = torch.cuda.get_device_capability()  # (major, minor) of the current CUDA device
+        arch = f"{capability[0]}.{capability[1]}"  # e.g. (7, 5) -> "7.5"
+        os.environ["TORCH_CUDA_ARCH_LIST"] = f"{arch}{'+PTX' if arch == '9.0' else ''}"  # only "9.0" gets the "+PTX" suffix; overwrites any existing value
+
+
 def is_ipv6(address):
     try:
         ipaddress.IPv6Address(address)