Commit 5af83a3

Integrate Automated QDQ placement tool - part 3
Signed-off-by: Will Guo <willg@nvidia.com>
1 parent: 8c6de51

File tree

10 files changed: +4004 -0 lines changed

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Pattern-Based Q/DQ Autotuning for ONNX Models.

This package provides automated optimization of Quantize/Dequantize (Q/DQ) node placement
in ONNX computation graphs to minimize TensorRT inference latency. It uses pattern-based
region analysis to efficiently explore and optimize Q/DQ insertion strategies.

**Core Components:**

Autotuner Classes:
- QDQAutotuner: Main autotuner with automatic hierarchical region discovery
- QDQAutotunerBase: Base class for custom region identification strategies

Region Management:
- Region: Hierarchical subgraph representation (nodes + children)
- RegionType: Enumeration (LEAF, COMPOSITE, ROOT)
- CombinedRegionSearch: Two-phase region discovery (partitioning + refinement)
- RegionPattern: Structural pattern analysis and matching for region grouping

Q/DQ Insertion Points:
- InsertionScheme: Collection of Q/DQ insertion points for a region pattern
- NodeInputInsertionPoint: Q/DQ insertion at specific node inputs
- ChildRegionInputInsertionPoint: Q/DQ insertion at child region input boundaries
- RegionOutputInsertionPoint: Q/DQ insertion at region output boundaries

Configuration & State:
- Config: Autotuning parameters (quant type, thresholds, verbosity)
- PatternCache: Top-performing schemes indexed by pattern (warm-start)
- PatternSchemes: Scheme collection and measurement results for a pattern

Benchmarking:
- Benchmark: Abstract base class for model benchmarking
- TensorRTPyBenchmark: Benchmark using TensorRT Python API (recommended)
- TrtExecBenchmark: Benchmark using trtexec command-line tool (legacy)

**See Also:**

- workflows.region_pattern_autotuning_workflow: Complete end-to-end optimization
- QDQAutotuner: Main autotuner class documentation
- RegionPattern: Pattern matching and signature computation
"""

# Core data structures
from .common import (
    AutotunerError,
    AutotunerNotInitializedError,
    Config,
    InsertionScheme,
    InvalidSchemeError,
    PatternCache,
    PatternSchemes,
    Region,
    RegionError,
    RegionType,
)
from .insertion_points import (
    ChildRegionInputInsertionPoint,
    NodeInputInsertionPoint,
    RegionOutputInsertionPoint,
    ResolvedInsertionPoint,
)
from .region_pattern import RegionPattern
from .region_search import CombinedRegionSearch

__all__ = [
    "AutotunerError",
    "AutotunerNotInitializedError",
    "ChildRegionInputInsertionPoint",
    "CombinedRegionSearch",
    "Config",
    "InsertionScheme",
    "InvalidSchemeError",
    "NodeInputInsertionPoint",
    "PatternCache",
    "PatternSchemes",
    "Region",
    "RegionError",
    "RegionOutputInsertionPoint",
    "RegionPattern",
    "RegionType",
    "ResolvedInsertionPoint",
]
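The public surface of the package is exactly the set of names re-exported above. As a quick sanity check, the following is a minimal sketch, assuming the ModelOpt package from this commit is installed, that verifies every name in __all__ resolves on the package; it relies only on the re-exports shown in this file and does not assume any constructor signatures.

# Minimal smoke test (sketch): confirm every public name listed in __all__
# above is importable from the package. Only the re-exports shown in this
# file are assumed; no constructor signatures are used.
import importlib

pkg = importlib.import_module("modelopt.onnx.quantization.autotune")
missing = [name for name in pkg.__all__ if not hasattr(pkg, name)]
assert not missing, f"missing exports: {missing}"
print(f"{len(pkg.__all__)} public symbols, e.g. {pkg.__all__[:3]}")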
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""ONNX Q/DQ Autotuning Command-Line Interface.

This module provides a command-line interface for automated Q/DQ (Quantize/Dequantize)
optimization of ONNX models. It uses pattern-based region analysis and TensorRT performance
measurement to find optimal Q/DQ insertion points that minimize inference latency.

**Usage Examples:**

    # Basic usage - automatic region discovery and optimization
    python -m modelopt.onnx.quantization.autotune --model model.onnx

    # INT8 vs FP8 quantization
    python -m modelopt.onnx.quantization.autotune --model model.onnx --quant_type fp8

    # Warm-start from pattern cache (transfer learning)
    python -m modelopt.onnx.quantization.autotune \\
        --model model.onnx \\
        --pattern_cache ./output/pattern_cache.yaml

    # Import patterns from pre-quantized baseline model
    python -m modelopt.onnx.quantization.autotune \\
        --model model.onnx \\
        --qdq_baseline quantized_baseline.onnx

    # Full example with all optimization options
    python -m modelopt.onnx.quantization.autotune \\
        --model model.onnx \\
        --schemes_per_region 50 \\
        --pattern_cache pattern_cache.yaml \\
        --qdq_baseline baseline.onnx \\
        --output ./results \\
        --quant_type int8 \\
        --verbose

    # Use custom TensorRT plugins for model-specific operations
    python -m modelopt.onnx.quantization.autotune \\
        --model model.onnx \\
        --plugin_libraries /path/to/plugin1.so /path/to/plugin2.so
"""

import sys

from modelopt.onnx.quantization.autotune.cli import get_autotune_parser, run_autotune


def main():
    """Command-line entry point for ONNX Q/DQ autotuning.

    Parses command-line arguments and executes the autotuning workflow.

    Returns:
        Exit code from run_autotune (0 for success, non-zero for errors)
    """
    parser = get_autotune_parser()
    args = parser.parse_args()

    # Run autotuning
    return run_autotune(args)


if __name__ == "__main__":
    sys.exit(main())
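Because the entry point above only wires get_autotune_parser and run_autotune together, the same workflow can presumably also be driven programmatically rather than through python -m. The sketch below is a hypothetical usage built solely from those two functions and from flags that appear in the module docstring; model.onnx and ./results are placeholder paths.

# Hypothetical programmatic invocation (sketch): reuses the two functions the
# CLI module imports and only flags shown in the usage examples above.
import sys

from modelopt.onnx.quantization.autotune.cli import get_autotune_parser, run_autotune

parser = get_autotune_parser()
args = parser.parse_args(
    [
        "--model", "model.onnx",    # placeholder input model path
        "--quant_type", "int8",     # quantization type, as in the docstring examples
        "--output", "./results",    # placeholder output directory
    ]
)
sys.exit(run_autotune(args))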
