Skip to content

Commit 202b3e2

Browse files
committed
Integrate Automated QDQ placement tool - part 3
Signed-off-by: Will Guo <[email protected]>
1 parent 8c6de51 commit 202b3e2

File tree

10 files changed

+3865
-0
lines changed

10 files changed

+3865
-0
lines changed
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Pattern-Based Q/DQ Autotuning for ONNX Models.
17+
18+
This package provides automated optimization of Quantize/Dequantize (Q/DQ) node placement
19+
in ONNX computation graphs to minimize TensorRT inference latency. It uses pattern-based
20+
region analysis to efficiently explore and optimize Q/DQ insertion strategies.
21+
22+
**Core Components:**
23+
24+
Autotuner Classes:
25+
- QDQAutotuner: Main autotuner with automatic hierarchical region discovery
26+
- QDQAutotunerBase: Base class for custom region identification strategies
27+
28+
Region Management:
29+
- Region: Hierarchical subgraph representation (nodes + children)
30+
- RegionType: Enumeration (LEAF, COMPOSITE, ROOT)
31+
- CombinedRegionSearch: Two-phase region discovery (partitioning + refinement)
32+
- RegionPattern: Structural pattern analysis and matching for region grouping
33+
34+
Q/DQ Insertion Points:
35+
- InsertionScheme: Collection of Q/DQ insertion points for a region pattern
36+
- NodeInputInsertionPoint: Q/DQ insertion at specific node inputs
37+
- ChildRegionInputInsertionPoint: Q/DQ insertion at child region input boundaries
38+
- RegionOutputInsertionPoint: Q/DQ insertion at region output boundaries
39+
40+
Configuration & State:
41+
- Config: Autotuning parameters (quant type, thresholds, verbosity)
42+
- PatternCache: Top-performing schemes indexed by pattern (warm-start)
43+
- PatternSchemes: Scheme collection and measurement results for a pattern
44+
45+
Benchmarking:
46+
- Benchmark: Abstract base class for model benchmarking
47+
- TensorRTPyBenchmark: Benchmark using TensorRT Python API (recommended)
48+
- TrtExecBenchmark: Benchmark using trtexec command-line tool (legacy)
49+
"""
50+
51+
from .common import (
52+
AutotunerError,
53+
AutotunerNotInitializedError,
54+
Config,
55+
InsertionScheme,
56+
InvalidSchemeError,
57+
PatternCache,
58+
PatternSchemes,
59+
Region,
60+
RegionError,
61+
RegionType,
62+
)
63+
from .insertion_points import (
64+
ChildRegionInputInsertionPoint,
65+
NodeInputInsertionPoint,
66+
RegionOutputInsertionPoint,
67+
ResolvedInsertionPoint,
68+
)
69+
from .region_pattern import RegionPattern
70+
from .region_search import CombinedRegionSearch
71+
72+
__all__ = [
73+
"AutotunerError",
74+
"AutotunerNotInitializedError",
75+
"ChildRegionInputInsertionPoint",
76+
"CombinedRegionSearch",
77+
"Config",
78+
"InsertionScheme",
79+
"InvalidSchemeError",
80+
"NodeInputInsertionPoint",
81+
"PatternCache",
82+
"PatternSchemes",
83+
"Region",
84+
"RegionError",
85+
"RegionOutputInsertionPoint",
86+
"RegionPattern",
87+
"RegionType",
88+
"ResolvedInsertionPoint",
89+
]
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#!/usr/bin/env python3
2+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
r"""ONNX Q/DQ Autotuning Command-Line Interface.
18+
19+
This module provides a command-line interface for automated Q/DQ (Quantize/Dequantize)
20+
optimization of ONNX models. It uses pattern-based region analysis and TensorRT performance
21+
measurement to find optimal Q/DQ insertion points that minimize inference latency.
22+
23+
**Usage Examples:**
24+
25+
# Basic usage - automatic region discovery and optimization
26+
python -m modelopt.onnx.quantization.autotune --model model.onnx
27+
28+
# Select the quantization type (int8 or fp8)
29+
python -m modelopt.onnx.quantization.autotune --model model.onnx --quant_type fp8
30+
31+
# Warm-start from pattern cache (transfer learning)
32+
python -m modelopt.onnx.quantization.autotune \
33+
--model model.onnx \
34+
--pattern_cache ./output/pattern_cache.yaml
35+
36+
# Import patterns from pre-quantized baseline model
37+
python -m modelopt.onnx.quantization.autotune \
38+
--model model.onnx \
39+
--qdq_baseline quantized_baseline.onnx
40+
41+
# Full example with all optimization options
42+
python -m modelopt.onnx.quantization.autotune \
43+
--model model.onnx \
44+
--schemes_per_region 50 \
45+
--pattern_cache pattern_cache.yaml \
46+
--qdq_baseline baseline.onnx \
47+
--output ./results \
48+
--quant_type int8 \
49+
--verbose
50+
51+
# Use custom TensorRT plugins for model-specific operations
52+
python -m modelopt.onnx.quantization.autotune \
53+
--model model.onnx \
54+
--plugin_libraries /path/to/plugin1.so /path/to/plugin2.so
55+
"""
56+
57+
import sys
58+
59+
from modelopt.onnx.quantization.autotune.cli import run_autotune
60+
61+
if __name__ == "__main__":
62+
sys.exit(run_autotune())

0 commit comments

Comments
 (0)