Remove the `debug` flag from examples and tests (#3618)

lanluo-nvidia · web-flow · commit 56e6867e312e · 2025-06-25T14:24:59.000-07:00
diff --git a/examples/apps/flux_demo.py b/examples/apps/flux_demo.py
@@ -107,7 +107,6 @@ def forward_loop(mod):
         "enabled_precisions": enabled_precisions,
         "truncate_double": True,
         "min_block_size": 1,
-        "debug": False,
         "use_python_runtime": True,
         "immutable_weights": False,
         "offload_module_to_cpu": True,
diff --git a/examples/distributed_inference/data_parallel_gpt2.py b/examples/distributed_inference/data_parallel_gpt2.py
@@ -15,11 +15,10 @@
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 import torch
+import torch_tensorrt
 from accelerate import PartialState
 from transformers import AutoTokenizer, GPT2LMHeadModel
 
-import torch_tensorrt
-
 tokenizer = AutoTokenizer.from_pretrained("gpt2")
 
 # Set input prompts for different devices
@@ -42,7 +41,6 @@
     options={
         "truncate_long_and_double": True,
         "enabled_precisions": {torch.float16},
-        "debug": True,
     },
     dynamic=False,
 )
diff --git a/examples/distributed_inference/data_parallel_stable_diffusion.py b/examples/distributed_inference/data_parallel_stable_diffusion.py
@@ -14,11 +14,10 @@
 # Imports and Model Definition
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 import torch
+import torch_tensorrt
 from accelerate import PartialState
 from diffusers import DiffusionPipeline
 
-import torch_tensorrt
-
 model_id = "CompVis/stable-diffusion-v1-4"
 
 # Instantiate Stable Diffusion Pipeline with FP16 weights
@@ -41,7 +40,6 @@
     options={
         "truncate_long_and_double": True,
         "precision": torch.float16,
-        "debug": True,
         "use_python_runtime": True,
     },
     dynamic=False,
diff --git a/examples/dynamo/aot_plugin.py b/examples/dynamo/aot_plugin.py
@@ -163,7 +163,6 @@ def forward(self, X: torch.Tensor) -> torch.Tensor:
         model_trt = torch_tensorrt.compile(
             my_model,
             inputs=trt_inputs,
-            debug=True,
             min_block_size=1,
         )
         print("Model compiled successfully!")
diff --git a/examples/dynamo/auto_generate_converters.py b/examples/dynamo/auto_generate_converters.py
@@ -174,9 +174,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
 n = torch.full((64, 64), 3, device="cuda", dtype=torch.float)
 
 with torch_tensorrt.logging.errors():
-    model_trt = torch_tensorrt.compile(
-        my_model, inputs=[m, n], debug=True, min_block_size=1
-    )
+    model_trt = torch_tensorrt.compile(my_model, inputs=[m, n], min_block_size=1)
     for i in range(300):
         res = model_trt(m, n)
         assert torch.allclose(res, my_model(m, n))
diff --git a/examples/dynamo/auto_generate_plugins.py b/examples/dynamo/auto_generate_plugins.py
@@ -144,9 +144,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
 n = torch.randint(0, 5, (64, 64), device="cuda", dtype=torch.float)
 
 with torch_tensorrt.logging.errors():
-    model_trt = torch_tensorrt.compile(
-        my_model, inputs=[m, n], debug=True, min_block_size=1
-    )
+    model_trt = torch_tensorrt.compile(my_model, inputs=[m, n], min_block_size=1)
     for i in range(300):
         res = model_trt(m, n)
         assert torch.allclose(res, my_model(m, n))
diff --git a/examples/dynamo/cross_runtime_compilation_for_windows.py b/examples/dynamo/cross_runtime_compilation_for_windows.py
@@ -71,7 +71,6 @@
             "cross runtime compiled model for windows can only be compiled in Linux system"
         )
     compile_spec = {
-        "debug": True,
         "min_block_size": 1,
     }
     torchtrt.cross_compile_for_windows(
diff --git a/examples/dynamo/custom_kernel_plugins.py b/examples/dynamo/custom_kernel_plugins.py
@@ -276,7 +276,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 #       Node: torch.ops.torchtrt_ex.triton_circular_pad.default, with layer location: __/triton_circular_pad
 #       Note: Some of the above nodes may be supported, but were not included in a TRT graph by the partitioner
 #
-#       Compiled with: CompilationSettings(enabled_precisions={<dtype.f32: 7>}, debug=False, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=<EngineCapability.STANDARD: 1>, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=True, hardware_compatible=False)
+#       Compiled with: CompilationSettings(enabled_precisions={<dtype.f32: 7>}, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=<EngineCapability.STANDARD: 1>, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=True, hardware_compatible=False)
 #
 #         Graph Structure:
 #
@@ -581,7 +581,7 @@ def circular_padding_converter(
 #
 #       The graph consists of 2 Total Operators, of which 2 operators are supported, 100.0% coverage
 #
-#       Compiled with: CompilationSettings(enabled_precisions={<dtype.f32: 7>}, debug=True, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=<EngineCapability.STANDARD: 1>, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=False, hardware_compatible=False)
+#       Compiled with: CompilationSettings(enabled_precisions={<dtype.f32: 7>}, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=<EngineCapability.STANDARD: 1>, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=False, hardware_compatible=False)
 #
 #         Graph Structure:
 #
diff --git a/examples/dynamo/engine_caching_bert_example.py b/examples/dynamo/engine_caching_bert_example.py
@@ -50,7 +50,6 @@ def compile_bert(iterations=3):
             "use_python_runtime": False,
             "enabled_precisions": {torch.float},
             "truncate_double": True,
-            "debug": False,
             "min_block_size": 1,
             "immutable_weights": False,
             "cache_built_engines": cache_built_engines,
diff --git a/examples/dynamo/engine_caching_example.py b/examples/dynamo/engine_caching_example.py
@@ -39,7 +39,6 @@
 
 model = models.resnet18(pretrained=True).eval().to("cuda")
 enabled_precisions = {torch.float}
-debug = False
 min_block_size = 1
 use_python_runtime = False
 
@@ -95,7 +94,6 @@ def torch_compile(iterations=3):
             options={
                 "use_python_runtime": True,
                 "enabled_precisions": enabled_precisions,
-                "debug": debug,
                 "min_block_size": min_block_size,
                 "immutable_weights": False,
                 "cache_built_engines": cache_built_engines,
@@ -155,7 +153,6 @@ def dynamo_compile(iterations=3):
             tuple(inputs),
             use_python_runtime=use_python_runtime,
             enabled_precisions=enabled_precisions,
-            debug=debug,
             min_block_size=min_block_size,
             immutable_weights=False,
             cache_built_engines=cache_built_engines,
@@ -266,7 +263,6 @@ def torch_compile_my_cache(iterations=3):
             options={
                 "use_python_runtime": True,
                 "enabled_precisions": enabled_precisions,
-                "debug": debug,
                 "min_block_size": min_block_size,
                 "immutable_weights": False,
                 "cache_built_engines": cache_built_engines,
diff --git a/examples/dynamo/llama2_flashinfer_rmsnorm.py b/examples/dynamo/llama2_flashinfer_rmsnorm.py
@@ -249,7 +249,6 @@ def replace_rmsnorm(
         disable_tf32=True,
         use_explicit_typing=False,
         use_fp32_acc=True,
-        # debug=True,
     )
 
 input_ids = input_ids.to(DEVICE)
diff --git a/examples/dynamo/mutable_torchtrt_module_example.py b/examples/dynamo/mutable_torchtrt_module_example.py
@@ -78,7 +78,6 @@
     settings = {
         "use_python_runtime": True,
         "enabled_precisions": {torch.float16},
-        "debug": False,
         "immutable_weights": False,
     }
 
@@ -180,7 +179,7 @@ def forward(self, a, b, c={}):
     },  # a's shape does not change so we give it an empty dict
 }
 # Export the model first with custom dynamic shape constraints
-model = torch_trt.MutableTorchTensorRTModule(model, debug=True, min_block_size=1)
+model = torch_trt.MutableTorchTensorRTModule(model, min_block_size=1)
 model.set_expected_dynamic_shape_range(args_dynamic_shapes, kwarg_dynamic_shapes)
 # Compile
 model(*inputs, **kwargs)
@@ -211,7 +210,6 @@ def forward(self, a, b, c={}):
     model,
     use_python_runtime=True,
     enabled_precisions={torch.float},
-    debug=True,
     min_block_size=1,
     immutable_weights=False,
     cache_built_engines=True,
diff --git a/examples/dynamo/refit_engine_example.py b/examples/dynamo/refit_engine_example.py
@@ -56,7 +56,6 @@
 model = models.resnet18(pretrained=False).eval().to("cuda")
 exp_program = torch.export.export(model, tuple(inputs))
 enabled_precisions = {torch.float}
-debug = False
 workspace_size = 20 << 30
 min_block_size = 0
 use_python_runtime = False
@@ -66,7 +65,6 @@
     tuple(inputs),
     use_python_runtime=use_python_runtime,
     enabled_precisions=enabled_precisions,
-    debug=debug,
     min_block_size=min_block_size,
     torch_executed_ops=torch_executed_ops,
     immutable_weights=False,
diff --git a/examples/dynamo/torch_compile_advanced_usage.py b/examples/dynamo/torch_compile_advanced_usage.py
@@ -73,7 +73,6 @@ def forward(self, x: torch.Tensor, y: torch.Tensor):
 # py/torch_tensorrt/dynamo/_settings.py
 backend_kwargs = {
     "enabled_precisions": {torch.half},
-    "debug": True,
     "min_block_size": 2,
     "torch_executed_ops": {"torch.ops.aten.sub.Tensor"},
     "optimization_level": 4,
diff --git a/examples/dynamo/torch_compile_resnet_example.py b/examples/dynamo/torch_compile_resnet_example.py
@@ -28,8 +28,6 @@
 # Enabled precision for TensorRT optimization
 enabled_precisions = {torch.half}
 
-# Whether to print verbose logs
-debug = True
 
 # Workspace size for TensorRT
 workspace_size = 20 << 30
@@ -51,7 +49,6 @@
     ir="torch_compile",
     inputs=inputs,
     enabled_precisions=enabled_precisions,
-    debug=debug,
     workspace_size=workspace_size,
     min_block_size=min_block_size,
     torch_executed_ops=torch_executed_ops,
@@ -88,7 +85,6 @@
     ir="torch_compile",
     inputs=inputs_bs8,
     enabled_precisions=enabled_precisions,
-    debug=debug,
     workspace_size=workspace_size,
     min_block_size=min_block_size,
     torch_executed_ops=torch_executed_ops,
diff --git a/examples/dynamo/torch_compile_transformers_example.py b/examples/dynamo/torch_compile_transformers_example.py
@@ -32,9 +32,6 @@
 # Enabled precision for TensorRT optimization
 enabled_precisions = {torch.float}
 
-# Whether to print verbose logs
-debug = True
-
 # Workspace size for TensorRT
 workspace_size = 20 << 30
 
@@ -52,7 +49,6 @@
 # Define backend compilation keyword arguments
 compilation_kwargs = {
     "enabled_precisions": enabled_precisions,
-    "debug": debug,
     "workspace_size": workspace_size,
     "min_block_size": min_block_size,
     "torch_executed_ops": torch_executed_ops,
diff --git a/examples/dynamo/vgg16_ptq.py b/examples/dynamo/vgg16_ptq.py
@@ -244,7 +244,6 @@ def calibrate_loop(model):
             inputs=[input_tensor],
             enabled_precisions=enabled_precisions,
             min_block_size=1,
-            debug=False,
         )
         # You can also use torch compile path to compile the model with Torch-TensorRT:
         # trt_model = torch.compile(model, backend="tensorrt")
diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py
@@ -236,7 +236,6 @@ def remove_timing_cache(path=TIMING_CACHE_PATH):
                 tuple(inputs),
                 use_python_runtime=True,
                 enabled_precisions={torch.float},
-                debug=False,
                 min_block_size=1,
                 immutable_weights=False,
                 cache_built_engines=cache_built_engines,
@@ -307,7 +306,6 @@ def test_dynamo_compile_with_custom_engine_cache(self):
                 tuple(inputs),
                 use_python_runtime=True,
                 enabled_precisions={torch.float},
-                debug=False,
                 min_block_size=1,
                 immutable_weights=False,
                 cache_built_engines=cache_built_engines,
@@ -361,7 +359,6 @@ def test_dynamo_compile_change_input_shape(self):
                 inputs=inputs,
                 use_python_runtime=False,
                 enabled_precisions={torch.float},
-                debug=False,
                 min_block_size=1,
                 immutable_weights=False,
                 cache_built_engines=True,
@@ -682,7 +679,6 @@ def remove_timing_cache(path=TIMING_CACHE_PATH):
             inputs,
             use_python_runtime=True,
             enabled_precisions={torch.float},
-            debug=False,
             min_block_size=1,
             immutable_weights=False,
             cache_built_engines=False,
@@ -735,7 +731,6 @@ def remove_timing_cache(path=TIMING_CACHE_PATH):
                 tuple(inputs),
                 use_python_runtime=True,
                 enabled_precisions={torch.float},
-                debug=False,
                 min_block_size=1,
                 cache_built_engines=cache_built_engines,
                 reuse_cached_engines=reuse_cached_engines,
@@ -913,7 +908,6 @@ def remove_timing_cache(path=timing_cache_path):
             inputs=[input_ids],
             use_python_runtime=True,
             enabled_precisions={torch.float32},
-            debug=False,
             min_block_size=1,
             immutable_weights=False,
             truncate_double=True,
@@ -967,7 +961,6 @@ def remove_timing_cache(path=timing_cache_path):
                 inputs=[input_ids],
                 use_python_runtime=True,
                 enabled_precisions={torch.float32},
-                debug=False,
                 min_block_size=1,
                 truncate_double=True,
                 device=DEVICE,
diff --git a/tests/py/dynamo/models/test_model_refit.py b/tests/py/dynamo/models/test_model_refit.py
diff --git a/tests/py/dynamo/models/test_models.py b/tests/py/dynamo/models/test_models.py
diff --git a/tests/py/dynamo/models/test_models_export.py b/tests/py/dynamo/models/test_models_export.py
diff --git a/tests/py/dynamo/models/test_weight_stripped_engine.py b/tests/py/dynamo/models/test_weight_stripped_engine.py
diff --git a/tests/py/dynamo/runtime/test_002_cudagraphs_py.py b/tests/py/dynamo/runtime/test_002_cudagraphs_py.py
diff --git a/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py b/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py

Original file line number	Diff line number	Diff line change
`@@ -163,7 +163,6 @@ def forward(self, X: torch.Tensor) -> torch.Tensor:`
`163`	`163`	`model_trt = torch_tensorrt.compile(`
`164`	`164`	`my_model,`
`165`	`165`	`inputs=trt_inputs,`
`166`		`- debug=True,`
`167`	`166`	`min_block_size=1,`
`168`	`167`	`)`
`169`	`168`	`print("Model compiled successfully!")`
Original file line number	Diff line number	Diff line change
`@@ -71,7 +71,6 @@`
`71`	`71`	`"cross runtime compiled model for windows can only be compiled in Linux system"`
`72`	`72`	`)`
`73`	`73`	`compile_spec = {`
`74`		`- "debug": True,`
`75`	`74`	`"min_block_size": 1,`
`76`	`75`	`}`
`77`	`76`	`torchtrt.cross_compile_for_windows(`
Original file line number	Diff line number	Diff line change
`@@ -276,7 +276,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:`
`276`	`276`	`# Node: torch.ops.torchtrt_ex.triton_circular_pad.default, with layer location: __/triton_circular_pad`
`277`	`277`	`# Note: Some of the above nodes may be supported, but were not included in a TRT graph by the partitioner`
`278`	`278`	`#`
`279`		-# Compiled with: CompilationSettings(enabled_precisions={<dtype.f32: 7>}, debug=False, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=<EngineCapability.STANDARD: 1>, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=True, hardware_compatible=False)
	`279`	+# Compiled with: CompilationSettings(enabled_precisions={<dtype.f32: 7>}, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=<EngineCapability.STANDARD: 1>, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=True, hardware_compatible=False)
`280`	`280`	`#`
`281`	`281`	`# Graph Structure:`
`282`	`282`	`#`
`@@ -581,7 +581,7 @@ def circular_padding_converter(`
`581`	`581`	`#`
`582`	`582`	`# The graph consists of 2 Total Operators, of which 2 operators are supported, 100.0% coverage`
`583`	`583`	`#`
`584`		-# Compiled with: CompilationSettings(enabled_precisions={<dtype.f32: 7>}, debug=True, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=<EngineCapability.STANDARD: 1>, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=False, hardware_compatible=False)
	`584`	+# Compiled with: CompilationSettings(enabled_precisions={<dtype.f32: 7>}, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, sparse_weights=False, refit=False, engine_capability=<EngineCapability.STANDARD: 1>, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=False, hardware_compatible=False)
`585`	`585`	`#`
`586`	`586`	`# Graph Structure:`
`587`	`587`	`#`