Commit 5af83a3

Integrate Automated QDQ placement tool - part 3
Signed-off-by: Will Guo <willg@nvidia.com>
1 parent: 8c6de51

File tree

10 files changed: +4004 -0 lines changed

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Pattern-Based Q/DQ Autotuning for ONNX Models.

This package provides automated optimization of Quantize/Dequantize (Q/DQ) node placement
in ONNX computation graphs to minimize TensorRT inference latency. It uses pattern-based
region analysis to efficiently explore and optimize Q/DQ insertion strategies.

**Core Components:**

Autotuner Classes:
- QDQAutotuner: Main autotuner with automatic hierarchical region discovery
- QDQAutotunerBase: Base class for custom region identification strategies

Region Management:
- Region: Hierarchical subgraph representation (nodes + children)
- RegionType: Enumeration (LEAF, COMPOSITE, ROOT)
- CombinedRegionSearch: Two-phase region discovery (partitioning + refinement)
- RegionPattern: Structural pattern analysis and matching for region grouping

Q/DQ Insertion Points:
- InsertionScheme: Collection of Q/DQ insertion points for a region pattern
- NodeInputInsertionPoint: Q/DQ insertion at specific node inputs
- ChildRegionInputInsertionPoint: Q/DQ insertion at child region input boundaries
- RegionOutputInsertionPoint: Q/DQ insertion at region output boundaries

Configuration & State:
- Config: Autotuning parameters (quant type, thresholds, verbosity)
- PatternCache: Top-performing schemes indexed by pattern (warm-start)
- PatternSchemes: Scheme collection and measurement results for a pattern

Benchmarking:
- Benchmark: Abstract base class for model benchmarking
- TensorRTPyBenchmark: Benchmark using TensorRT Python API (recommended)
- TrtExecBenchmark: Benchmark using trtexec command-line tool (legacy)

**See Also:**

- workflows.region_pattern_autotuning_workflow: Complete end-to-end optimization
- QDQAutotuner: Main autotuner class documentation
- RegionPattern: Pattern matching and signature computation
"""

# Core data structures
from .common import (
    AutotunerError,
    AutotunerNotInitializedError,
    Config,
    InsertionScheme,
    InvalidSchemeError,
    PatternCache,
    PatternSchemes,
    Region,
    RegionError,
    RegionType,
)
from .insertion_points import (
    ChildRegionInputInsertionPoint,
    NodeInputInsertionPoint,
    RegionOutputInsertionPoint,
    ResolvedInsertionPoint,
)
from .region_pattern import RegionPattern
from .region_search import CombinedRegionSearch

__all__ = [
    "AutotunerError",
    "AutotunerNotInitializedError",
    "ChildRegionInputInsertionPoint",
    "CombinedRegionSearch",
    "Config",
    "InsertionScheme",
    "InvalidSchemeError",
    "NodeInputInsertionPoint",
    "PatternCache",
    "PatternSchemes",
    "Region",
    "RegionError",
    "RegionOutputInsertionPoint",
    "RegionPattern",
    "RegionType",
    "ResolvedInsertionPoint",
]
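The public surface of the package is exactly the set of names re-exported above. As a quick sanity check, the following is a minimal sketch, assuming the ModelOpt package from this commit is installed, that verifies every name in __all__ resolves on the package; it relies only on the re-exports shown in this file and does not assume any constructor signatures.

# Minimal smoke test (sketch): confirm every public name listed in __all__
# above is importable from the package. Only the re-exports shown in this
# file are assumed; no constructor signatures are used.
import importlib

pkg = importlib.import_module("modelopt.onnx.quantization.autotune")
missing = [name for name in pkg.__all__ if not hasattr(pkg, name)]
assert not missing, f"missing exports: {missing}"
print(f"{len(pkg.__all__)} public symbols, e.g. {pkg.__all__[:3]}")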
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""ONNX Q/DQ Autotuning Command-Line Interface.

This module provides a command-line interface for automated Q/DQ (Quantize/Dequantize)
optimization of ONNX models. It uses pattern-based region analysis and TensorRT performance
measurement to find optimal Q/DQ insertion points that minimize inference latency.

**Usage Examples:**

    # Basic usage - automatic region discovery and optimization
    python -m modelopt.onnx.quantization.autotune --model model.onnx

    # INT8 vs FP8 quantization
    python -m modelopt.onnx.quantization.autotune --model model.onnx --quant_type fp8

    # Warm-start from pattern cache (transfer learning)
    python -m modelopt.onnx.quantization.autotune \\
        --model model.onnx \\
        --pattern_cache ./output/pattern_cache.yaml

    # Import patterns from pre-quantized baseline model
    python -m modelopt.onnx.quantization.autotune \\
        --model model.onnx \\
        --qdq_baseline quantized_baseline.onnx

    # Full example with all optimization options
    python -m modelopt.onnx.quantization.autotune \\
        --model model.onnx \\
        --schemes_per_region 50 \\
        --pattern_cache pattern_cache.yaml \\
        --qdq_baseline baseline.onnx \\
        --output ./results \\
        --quant_type int8 \\
        --verbose

    # Use custom TensorRT plugins for model-specific operations
    python -m modelopt.onnx.quantization.autotune \\
        --model model.onnx \\
        --plugin_libraries /path/to/plugin1.so /path/to/plugin2.so
"""

import sys

from modelopt.onnx.quantization.autotune.cli import get_autotune_parser, run_autotune


def main():
    """Command-line entry point for ONNX Q/DQ autotuning.

    Parses command-line arguments and executes the autotuning workflow.

    Returns:
        Exit code from run_autotune (0 for success, non-zero for errors)
    """
    parser = get_autotune_parser()
    args = parser.parse_args()

    # Run autotuning
    return run_autotune(args)


if __name__ == "__main__":
    sys.exit(main())
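Because the entry point above only wires get_autotune_parser and run_autotune together, the same workflow can presumably also be driven programmatically rather than through python -m. The sketch below is a hypothetical usage built solely from those two functions and from flags that appear in the module docstring; model.onnx and ./results are placeholder paths.

# Hypothetical programmatic invocation (sketch): reuses the two functions the
# CLI module imports and only flags shown in the usage examples above.
import sys

from modelopt.onnx.quantization.autotune.cli import get_autotune_parser, run_autotune

parser = get_autotune_parser()
args = parser.parse_args(
    [
        "--model", "model.onnx",    # placeholder input model path
        "--quant_type", "int8",     # quantization type, as in the docstring examples
        "--output", "./results",    # placeholder output directory
    ]
)
sys.exit(run_autotune(args))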
