From ae576d530ff964619d41fbc2e8ac473d410eca8d Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Fri, 26 Sep 2025 16:18:33 +0800
Subject: [PATCH 1/7] clean flashinfer cache for scaffolding case

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 tests/integration/defs/test_e2e.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index 3defa73aac2..9dfe7d6ce57 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -829,6 +829,20 @@ def get_tmp_file():
     return tempfile.mkstemp()
 
 
+@pytest.fixture
+def clear_flashinfer_cache():
+    """Clear flashinfer cache directory before test execution"""
+    flashinfer_cache_path = Path.home() / ".cache" / "flashinfer"
+    if flashinfer_cache_path.exists():
+        shutil.rmtree(flashinfer_cache_path)
+        print(f"Cleared flashinfer cache: {flashinfer_cache_path}")
+    yield
+    # Cleanup after test if needed
+    if flashinfer_cache_path.exists():
+        shutil.rmtree(flashinfer_cache_path)
+        print(f"Cleaned up flashinfer cache after test: {flashinfer_cache_path}")
+
+
 @pytest.fixture
 def temp_extra_llm_api_options_file(request):
     if request.node.callspec.params['use_extra_config']:
@@ -3352,7 +3366,7 @@ def test_ptp_star_attention_example(llm_root, llm_venv, model_name, model_path,
 @pytest.mark.parametrize("model_name,model_path", [
     ("DeepSeek-R1-Distill-Qwen-7B", "DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B"),
 ])
-def test_ptp_scaffolding(llm_root, llm_venv, model_name, model_path):
+def test_ptp_scaffolding(llm_root, llm_venv, model_name, model_path, clear_flashinfer_cache):
     print(f"Testing scaffolding {model_name}.")
     example_root = Path(os.path.join(llm_root, "examples", "scaffolding"))
     input_file = Path(os.path.join(example_root, "test.jsonl"))

From 7dd042948567c4ec3a9a4dd9f3a5c21d2ea9ec73 Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Fri, 26 Sep 2025 16:20:57 +0800
Subject: [PATCH 2/7] add clean cache on long context

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 tests/integration/defs/examples/test_llama.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/tests/integration/defs/examples/test_llama.py b/tests/integration/defs/examples/test_llama.py
index 60505a4d56c..996329d1c4d 100644
--- a/tests/integration/defs/examples/test_llama.py
+++ b/tests/integration/defs/examples/test_llama.py
@@ -20,6 +20,7 @@
 import shutil
 import subprocess
 from copy import deepcopy
+from pathlib import Path
 
 import defs.ci_profiler
 import pytest
@@ -63,6 +64,20 @@
 INPUT_TEXT_2 = "Born in north-east France, Soyer trained as a"
 
 
+@pytest.fixture
+def clear_flashinfer_cache():
+    """Clear flashinfer cache directory before test execution"""
+    flashinfer_cache_path = Path.home() / ".cache" / "flashinfer"
+    if flashinfer_cache_path.exists():
+        shutil.rmtree(flashinfer_cache_path)
+        print(f"Cleared flashinfer cache: {flashinfer_cache_path}")
+    yield
+    # Cleanup after test if needed
+    if flashinfer_cache_path.exists():
+        shutil.rmtree(flashinfer_cache_path)
+        print(f"Cleaned up flashinfer cache after test: {flashinfer_cache_path}")
+
+
 @pytest.mark.parametrize("num_beams", [5, 7],
                          ids=["num_beams_4", "num_beams_7"])
 @pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
@@ -2964,7 +2979,7 @@ def test_llm_llama_v3_8b_1048k_long_context_ppl(llama_example_root,
                                                 llama_model_root,
                                                 llm_venv, engine_dir,
                                                 cmodel_dir, llm_datasets_root,
-                                                dataset_name):
+                                                dataset_name, clear_flashinfer_cache):
     "Build & run llama-3-8B-1048k on long context ppl."
     if dataset_name == "SlimPajama-6B" and get_device_memory() < 50000:
         pytest.skip("GPU memory is insufficient.")

From ead40047efdbf57f7547deec790f12157f39ae08 Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Tue, 21 Oct 2025 14:14:39 +0800
Subject: [PATCH 3/7] unify clear_flashinfer_cache fixture in conftest

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 tests/integration/defs/conftest.py            | 15 +++++++++++++++
 tests/integration/defs/examples/test_llama.py | 17 +----------------
 tests/integration/defs/test_e2e.py            | 17 ++---------------
 3 files changed, 18 insertions(+), 31 deletions(-)

diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py
index 333941b57cf..fa90a874cf2 100644
--- a/tests/integration/defs/conftest.py
+++ b/tests/integration/defs/conftest.py
@@ -2563,3 +2563,18 @@ def torch_empty_cache() -> None:
     """
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
+
+
+@pytest.fixture
+def clear_flashinfer_cache():
+    """Clear flashinfer cache directory before test execution"""
+    flashinfer_cache_path = Path.home() / ".cache" / "flashinfer"
+    if flashinfer_cache_path.exists():
+        shutil.rmtree(flashinfer_cache_path)
+        print(f"Cleared flashinfer cache: {flashinfer_cache_path}")
+    yield
+    # Cleanup after test if needed
+    if flashinfer_cache_path.exists():
+        shutil.rmtree(flashinfer_cache_path)
+        print(
+            f"Cleaned up flashinfer cache after test: {flashinfer_cache_path}")
diff --git a/tests/integration/defs/examples/test_llama.py b/tests/integration/defs/examples/test_llama.py
index 996329d1c4d..60505a4d56c 100644
--- a/tests/integration/defs/examples/test_llama.py
+++ b/tests/integration/defs/examples/test_llama.py
@@ -20,7 +20,6 @@
 import shutil
 import subprocess
 from copy import deepcopy
-from pathlib import Path
 
 import defs.ci_profiler
 import pytest
@@ -64,20 +63,6 @@
 INPUT_TEXT_2 = "Born in north-east France, Soyer trained as a"
 
 
-@pytest.fixture
-def clear_flashinfer_cache():
-    """Clear flashinfer cache directory before test execution"""
-    flashinfer_cache_path = Path.home() / ".cache" / "flashinfer"
-    if flashinfer_cache_path.exists():
-        shutil.rmtree(flashinfer_cache_path)
-        print(f"Cleared flashinfer cache: {flashinfer_cache_path}")
-    yield
-    # Cleanup after test if needed
-    if flashinfer_cache_path.exists():
-        shutil.rmtree(flashinfer_cache_path)
-        print(f"Cleaned up flashinfer cache after test: {flashinfer_cache_path}")
-
-
 @pytest.mark.parametrize("num_beams", [5, 7],
                          ids=["num_beams_4", "num_beams_7"])
 @pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
@@ -2979,7 +2964,7 @@ def test_llm_llama_v3_8b_1048k_long_context_ppl(llama_example_root,
                                                 llama_model_root,
                                                 llm_venv, engine_dir,
                                                 cmodel_dir, llm_datasets_root,
-                                                dataset_name, clear_flashinfer_cache):
+                                                dataset_name):
     "Build & run llama-3-8B-1048k on long context ppl."
     if dataset_name == "SlimPajama-6B" and get_device_memory() < 50000:
         pytest.skip("GPU memory is insufficient.")
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index 9dfe7d6ce57..14b13728587 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -829,20 +829,6 @@ def get_tmp_file():
     return tempfile.mkstemp()
 
 
-@pytest.fixture
-def clear_flashinfer_cache():
-    """Clear flashinfer cache directory before test execution"""
-    flashinfer_cache_path = Path.home() / ".cache" / "flashinfer"
-    if flashinfer_cache_path.exists():
-        shutil.rmtree(flashinfer_cache_path)
-        print(f"Cleared flashinfer cache: {flashinfer_cache_path}")
-    yield
-    # Cleanup after test if needed
-    if flashinfer_cache_path.exists():
-        shutil.rmtree(flashinfer_cache_path)
-        print(f"Cleaned up flashinfer cache after test: {flashinfer_cache_path}")
-
-
 @pytest.fixture
 def temp_extra_llm_api_options_file(request):
     if request.node.callspec.params['use_extra_config']:
@@ -3366,7 +3352,8 @@ def test_ptp_star_attention_example(llm_root, llm_venv, model_name, model_path,
 @pytest.mark.parametrize("model_name,model_path", [
     ("DeepSeek-R1-Distill-Qwen-7B", "DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B"),
 ])
-def test_ptp_scaffolding(llm_root, llm_venv, model_name, model_path, clear_flashinfer_cache):
+def test_ptp_scaffolding(llm_root, llm_venv, model_name, model_path,
+                         clear_flashinfer_cache):
     print(f"Testing scaffolding {model_name}.")
     example_root = Path(os.path.join(llm_root, "examples", "scaffolding"))
     input_file = Path(os.path.join(example_root, "test.jsonl"))

From 8ab6c730338d7df3e7746519b04c194407ebeeec Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Wed, 22 Oct 2025 10:29:20 +0800
Subject: [PATCH 4/7] update test config

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 tests/integration/defs/test_e2e.py                           | 4 ++--
 tests/integration/test_lists/qa/llm_function_core.txt        | 2 +-
 tests/integration/test_lists/qa/llm_function_core_sanity.txt | 2 +-
 tests/integration/test_lists/qa/llm_function_nim.txt         | 2 +-
 tests/integration/test_lists/waives.txt                      | 1 -
 5 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index 14b13728587..6d2b1523bb0 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -2262,7 +2262,7 @@ def test_ptp_quickstart_advanced_deepseek_r1_w4afp8_8gpus(
 
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.parametrize("model_name,model_path,gpu_count", [
-    ("Llama3.1-70B-BF16", "llama-3.1-model/Meta-Llama-3.1-70B", 2),
+    ("Llama3.1-70B-BF16", "llama-3.1-model/Meta-Llama-3.1-70B", 8),
     ("Mixtral-8x7B-BF16", "Mixtral-8x7B-v0.1", 8),
     pytest.param('Llama3.1-70B-FP8',
                  'llama-3.1-model/Llama-3.1-70B-Instruct-FP8',
@@ -2293,7 +2293,7 @@ def test_ptp_quickstart_advanced_multi_gpus(llm_root, llm_venv, model_name,
         pytest.skip(f"Not enough GPUs for {model_name}")
     example_root = Path(os.path.join(llm_root, "examples", "llm-api"))
     mapping = {
-        "Llama3.1-70B-BF16": 91.0,
+        "Llama3.1-70B-BF16": 21.0,
         "Mixtral-8x7B-BF16": 16.5,
         "Llama3.1-70B-FP8": 58.5,
         "Llama3.1-405B-FP8": 63.2,
diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt
index 0d3985d4fde..d851cae5432 100644
--- a/tests/integration/test_lists/qa/llm_function_core.txt
+++ b/tests/integration/test_lists/qa/llm_function_core.txt
@@ -634,7 +634,7 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Llama-3.2-11B-Vision] test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B] test_e2e.py::test_ptp_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] -test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B-2] +test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B-8] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8-2] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8-8] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Mixtral-8x7B-BF16-Mixtral-8x7B-v0.1-8] diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt index 4ad8bea44c1..fd8b9cc979f 100644 --- a/tests/integration/test_lists/qa/llm_function_core_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_core_sanity.txt @@ -210,7 +210,7 @@ test_e2e.py::test_openai_consistent_chat test_e2e.py::test_openai_multi_chat_example test_e2e.py::test_ptp_quickstart test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8-8] -test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B-2] +test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B-8] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8-2] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Mixtral-8x7B-BF16-Mixtral-8x7B-v0.1-8] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Mixtral-8x7B-NVFP4-nvfp4-quantized/Mixtral-8x7B-Instruct-v0.1-8] diff --git a/tests/integration/test_lists/qa/llm_function_nim.txt b/tests/integration/test_lists/qa/llm_function_nim.txt index 5b6f9b72639..eac2254214b 100644 --- a/tests/integration/test_lists/qa/llm_function_nim.txt +++ b/tests/integration/test_lists/qa/llm_function_nim.txt @@ -418,7 +418,7 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8] test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B] test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B] -test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B-2] +test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B-8] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8-2] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8-8] test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[Mixtral-8x7B-BF16-Mixtral-8x7B-v0.1-8] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 4e2bb48655d..640ff64855e 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -313,7 +313,6 @@ full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8 
 full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True] SKIP (https://nvbugs/5512734)
 full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=True] SKIP (https://nvbugs/5483534)
 full:A100/test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video-False] SKIP (https://nvbugs/5453725)
-test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B] SKIP (https://nvbugs/5517260)
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5509024)
 test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-False] SKIP (https://nvbugs/5509024)
 test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-True] SKIP (https://nvbugs/5509024)

From c98a1ad096962cf8744bd9116f2e9e5f6c3b32f8 Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Wed, 22 Oct 2025 11:43:58 +0800
Subject: [PATCH 5/7] update waive list

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 tests/integration/defs/test_e2e.py      | 2 +-
 tests/integration/test_lists/waives.txt | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index 6d2b1523bb0..45e4b0dcd4f 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -2293,7 +2293,7 @@ def test_ptp_quickstart_advanced_multi_gpus(llm_root, llm_venv, model_name,
         pytest.skip(f"Not enough GPUs for {model_name}")
     example_root = Path(os.path.join(llm_root, "examples", "llm-api"))
     mapping = {
-        "Llama3.1-70B-BF16": 21.0,
+        "Llama3.1-70B-BF16": 24.6,
         "Mixtral-8x7B-BF16": 16.5,
         "Llama3.1-70B-FP8": 58.5,
         "Llama3.1-405B-FP8": 63.2,
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 640ff64855e..6286626159b 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -346,3 +346,6 @@ triton_server/test_triton_llm.py::test_llmapi_backend[1-0-enableDecoupleMode-ten
 cpp/test_e2e.py::test_benchmarks[gpt-80] SKIP (https://nvbugs/5601670)
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_empty_batch[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5601682)
 disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5587574)
+full:H20-3e/accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (slow I/O)
+full:H20-3e/accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency] SKIP (slow I/O)
+full:H20-3e/test_e2e.py::test_ptp_quickstart_advanced_multi_gpus[DeepSeek-V3-671B-FP8-DeepSeek-V3-0324-8] SKIP (slow I/O)

From 87383e620bea711eb6873f370d74d812d29f84be Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Wed, 22 Oct 2025 12:59:22 +0800
Subject: [PATCH 6/7] cherry-pick main conftest fix

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 tests/integration/defs/conftest.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py
index fa90a874cf2..d7084775253 100644
--- a/tests/integration/defs/conftest.py
+++ b/tests/integration/defs/conftest.py
@@ -15,6 +15,7 @@
 
 # -*- coding: utf-8 -*-
 import datetime
+import gc
 import os
 import platform
 import re
@@ -2562,6 +2563,7 @@ def torch_empty_cache() -> None:
     Manually empty the torch CUDA cache before each test, to reduce risk of OOM errors.
     """
     if torch.cuda.is_available():
+        gc.collect()
         torch.cuda.empty_cache()
 
 

From a46b18ece945834892f9cc93df8ef38e349cb928 Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Fri, 24 Oct 2025 12:23:20 +0800
Subject: [PATCH 7/7] remove clear flashinfer cache

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 tests/integration/defs/conftest.py | 15 ---------------
 tests/integration/defs/test_e2e.py |  3 +--
 2 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py
index d7084775253..0962c24b636 100644
--- a/tests/integration/defs/conftest.py
+++ b/tests/integration/defs/conftest.py
@@ -2565,18 +2565,3 @@ def torch_empty_cache() -> None:
     if torch.cuda.is_available():
         gc.collect()
         torch.cuda.empty_cache()
-
-
-@pytest.fixture
-def clear_flashinfer_cache():
-    """Clear flashinfer cache directory before test execution"""
-    flashinfer_cache_path = Path.home() / ".cache" / "flashinfer"
-    if flashinfer_cache_path.exists():
-        shutil.rmtree(flashinfer_cache_path)
-        print(f"Cleared flashinfer cache: {flashinfer_cache_path}")
-    yield
-    # Cleanup after test if needed
-    if flashinfer_cache_path.exists():
-        shutil.rmtree(flashinfer_cache_path)
-        print(
-            f"Cleaned up flashinfer cache after test: {flashinfer_cache_path}")
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index 45e4b0dcd4f..ad4f3e6a621 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -3352,8 +3352,7 @@ def test_ptp_star_attention_example(llm_root, llm_venv, model_name, model_path,
 @pytest.mark.parametrize("model_name,model_path", [
     ("DeepSeek-R1-Distill-Qwen-7B", "DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B"),
 ])
-def test_ptp_scaffolding(llm_root, llm_venv, model_name, model_path,
-                         clear_flashinfer_cache):
+def test_ptp_scaffolding(llm_root, llm_venv, model_name, model_path):
     print(f"Testing scaffolding {model_name}.")
     example_root = Path(os.path.join(llm_root, "examples", "scaffolding"))
     input_file = Path(os.path.join(example_root, "test.jsonl"))