From c2dcbe17fa7171bccefc27ed1e3add86c0f8211f Mon Sep 17 00:00:00 2001 From: irenab Date: Sun, 8 Dec 2024 10:54:52 +0200 Subject: [PATCH] add resource utilization calculator --- .../core/common/graph/base_graph.py | 6 +- .../core/common/graph/base_node.py | 16 +- .../mixed_precision_search_facade.py | 6 - .../mixed_precision_search_manager.py | 164 +---- .../resource_utilization.py | 95 +-- .../resource_utilization_calculator.py | 612 ++++++++++++++++++ .../resource_utilization_data.py | 221 +------ .../ru_aggregation_methods.py | 105 --- .../ru_functions_mapping.py | 33 - .../resource_utilization_tools/ru_methods.py | 583 +++++------------ .../search_methods/linear_programming.py | 28 +- .../mixed_precision/sensitivity_evaluation.py | 8 +- .../solution_refinement_procedure.py | 4 +- .../common/quantization/bit_width_config.py | 18 +- model_compression_toolkit/core/runner.py | 74 +-- .../requires_mixed_precision_test.py | 2 +- .../test_lp_search_bitwidth.py | 6 - tests_pytest/core/__init__.py | 14 + tests_pytest/core/common/__init__.py | 14 + .../core/common/mixed_precision/__init__.py | 14 + .../resource_utilization_tools/__init__.py | 14 + .../test_resource_utilization_calculator.py | 51 ++ 22 files changed, 1030 insertions(+), 1058 deletions(-) create mode 100644 model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_calculator.py delete mode 100644 model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_aggregation_methods.py delete mode 100644 model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_functions_mapping.py create mode 100644 tests_pytest/core/__init__.py create mode 100644 tests_pytest/core/common/__init__.py create mode 100644 tests_pytest/core/common/mixed_precision/__init__.py create mode 100644 tests_pytest/core/common/mixed_precision/resource_utilization_tools/__init__.py create mode 100644 tests_pytest/core/common/mixed_precision/resource_utilization_tools/test_resource_utilization_calculator.py diff --git a/model_compression_toolkit/core/common/graph/base_graph.py b/model_compression_toolkit/core/common/graph/base_graph.py index 432a81f39..3cae870bf 100644 --- a/model_compression_toolkit/core/common/graph/base_graph.py +++ b/model_compression_toolkit/core/common/graph/base_graph.py @@ -545,9 +545,7 @@ def get_weights_configurable_nodes(self, def is_configurable(n): kernel_attr = fw_info.get_kernel_op_attributes(n.type)[0] - return (n.is_weights_quantization_enabled(kernel_attr) and - not n.is_all_weights_candidates_equal(kernel_attr) and - (not n.reuse or include_reused_nodes)) + return n.is_configurable_weight(kernel_attr) and (not n.reuse or include_reused_nodes) return [n for n in potential_conf_nodes if is_configurable(n)] @@ -576,7 +574,7 @@ def get_activation_configurable_nodes(self) -> List[BaseNode]: Returns: A list of nodes that their activation can be configured (namely, has one or more activation qc candidate). 
""" - return [n for n in list(self) if n.is_activation_quantization_enabled() and not n.is_all_activation_candidates_equal()] + return [n for n in list(self) if n.has_configurable_activation()] def get_sorted_activation_configurable_nodes(self) -> List[BaseNode]: """ diff --git a/model_compression_toolkit/core/common/graph/base_node.py b/model_compression_toolkit/core/common/graph/base_node.py index 67c4f2f57..1c39ccae3 100644 --- a/model_compression_toolkit/core/common/graph/base_node.py +++ b/model_compression_toolkit/core/common/graph/base_node.py @@ -150,6 +150,14 @@ def is_weights_quantization_enabled(self, attr_name: str) -> bool: return False + def is_configurable_weight(self, attr_name: str) -> bool: + """ Checks whether the specific weight has a configurable quantization. """ + return self.is_weights_quantization_enabled(attr_name) and not self.is_all_weights_candidates_equal(attr_name) + + def has_configurable_activation(self): + """ Checks whether the activation has a configurable quantization. """ + return self.is_activation_quantization_enabled() and not self.is_all_activation_candidates_equal() + def __repr__(self): """ @@ -420,11 +428,15 @@ def get_total_output_params(self) -> float: Returns: Output size. """ - output_shapes = self.output_shape if isinstance(self.output_shape, List) else [self.output_shape] + # multiple output shapes are not necessarily lists, e.g. tf nms uses custom named tuple. + if self.output_shape and isinstance(self.output_shape[0], (tuple, list)): + output_shapes = list(self.output_shape) + else: + output_shapes = self.output_shape if isinstance(self.output_shape, list) else [self.output_shape] # remove batch size (first element) from output shape output_shapes = [s[1:] for s in output_shapes] - + # for scalar shape (None,) prod returns 1 return sum([np.prod([x for x in output_shape if x is not None]) for output_shape in output_shapes]) def find_min_candidates_indices(self) -> List[int]: diff --git a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py index 7f31563a4..2c6dbd638 100644 --- a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py +++ b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_facade.py @@ -22,7 +22,6 @@ from model_compression_toolkit.core.common import Graph from model_compression_toolkit.core.common.hessian import HessianInfoService from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import ResourceUtilization, RUTarget -from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_functions_mapping import ru_functions_mapping from model_compression_toolkit.core.common.framework_implementation import FrameworkImplementation from model_compression_toolkit.core.common.mixed_precision.mixed_precision_search_manager import MixedPrecisionSearchManager from model_compression_toolkit.core.common.mixed_precision.search_methods.linear_programming import \ @@ -105,16 +104,11 @@ def search_bit_width(graph_to_search_cfg: Graph, disable_activation_for_metric=disable_activation_for_metric, hessian_info_service=hessian_info_service) - # Each pair of (resource utilization method, resource utilization aggregation) should match to a specific - # provided target resource utilization - ru_functions = ru_functions_mapping - # Instantiate a manager object search_manager = 
MixedPrecisionSearchManager(graph, fw_info, fw_impl, se, - ru_functions, target_resource_utilization, original_graph=graph_to_search_cfg) diff --git a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py index 7fbb0807b..f81d776f7 100644 --- a/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py +++ b/model_compression_toolkit/core/common/mixed_precision/mixed_precision_search_manager.py @@ -13,23 +13,24 @@ # limitations under the License. # ============================================================================== -from typing import Callable, Tuple -from typing import Dict, List +from typing import Callable, Dict, List + import numpy as np from model_compression_toolkit.core.common import BaseNode -from model_compression_toolkit.logger import Logger from model_compression_toolkit.core.common.framework_implementation import FrameworkImplementation +from model_compression_toolkit.core.common.framework_info import FrameworkInfo from model_compression_toolkit.core.common.graph.base_graph import Graph from model_compression_toolkit.core.common.graph.virtual_activation_weights_node import VirtualActivationWeightsNode, \ VirtualSplitWeightsNode, VirtualSplitActivationNode -from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import RUTarget, ResourceUtilization -from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_functions_mapping import RuFunctions -from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_aggregation_methods import MpRuAggregation -from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_methods import MpRuMetric, calc_graph_cuts -from model_compression_toolkit.core.common.graph.memory_graph.compute_graph_max_cut import Cut -from model_compression_toolkit.core.common.framework_info import FrameworkInfo +from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import \ + RUTarget, ResourceUtilization +from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization_calculator import \ + ResourceUtilizationCalculator, TargetInclusionCriterion, BitwidthMode +from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_methods import \ + MixPrecisionRUHelper from model_compression_toolkit.core.common.mixed_precision.sensitivity_evaluation import SensitivityEvaluation +from model_compression_toolkit.logger import Logger class MixedPrecisionSearchManager: @@ -42,7 +43,6 @@ def __init__(self, fw_info: FrameworkInfo, fw_impl: FrameworkImplementation, sensitivity_evaluator: SensitivityEvaluation, - ru_functions: Dict[RUTarget, RuFunctions], target_resource_utilization: ResourceUtilization, original_graph: Graph = None): """ @@ -53,8 +53,6 @@ def __init__(self, fw_impl: FrameworkImplementation object with specific framework methods implementation. sensitivity_evaluator: A SensitivityEvaluation which provides a function that evaluates the sensitivity of a bit-width configuration for the MP model. - ru_functions: A dictionary with pairs of (MpRuMethod, MpRuAggregationMethod) mapping a RUTarget to - a couple of resource utilization metric function and resource utilization aggregation function. 
target_resource_utilization: Target Resource Utilization to bound our feasible solution space s.t the configuration does not violate it. original_graph: In case we have a search over a virtual graph (if we have BOPS utilization target), then this argument will contain the original graph (for config reconstruction purposes). @@ -69,29 +67,17 @@ def __init__(self, self.compute_metric_fn = self.get_sensitivity_metric() self._cuts = None - ru_types = [ru_target for ru_target, ru_value in - target_resource_utilization.get_resource_utilization_dict().items() if ru_value < np.inf] - self.compute_ru_functions = {ru_target: ru_fn for ru_target, ru_fn in ru_functions.items() if ru_target in ru_types} + self.ru_metrics = target_resource_utilization.get_restricted_metrics() + self.ru_helper = MixPrecisionRUHelper(graph, fw_info, fw_impl) self.target_resource_utilization = target_resource_utilization self.min_ru_config = self.graph.get_min_candidates_config(fw_info) self.max_ru_config = self.graph.get_max_candidates_config(fw_info) - self.min_ru = self.compute_min_ru() + self.min_ru = self.ru_helper.compute_utilization(self.ru_metrics, self.min_ru_config) self.non_conf_ru_dict = self._non_configurable_nodes_ru() self.config_reconstruction_helper = ConfigReconstructionHelper(virtual_graph=self.graph, original_graph=self.original_graph) - @property - def cuts(self) -> List[Cut]: - """ - Calculates graph cuts. Written as property, so it will only be calculated once and - only if cuts are needed. - - """ - if self._cuts is None: - self._cuts = calc_graph_cuts(self.original_graph) - return self._cuts - def get_search_space(self) -> Dict[int, List[int]]: """ The search space is a mapping from a node's index to a list of integers (possible bitwidths candidates indeces @@ -122,40 +108,6 @@ def get_sensitivity_metric(self) -> Callable: return self.sensitivity_evaluator.compute_metric - def _calc_ru_fn(self, ru_target, ru_fn, mp_cfg) -> np.ndarray: - """ - Computes a resource utilization for a certain mixed precision configuration. - The method computes a resource utilization vector for specific target resource utilization. - - Returns: resource utilization value. - - """ - # ru_fn is a pair of resource utilization computation method and - # resource utilization aggregation method (in this method we only need the first one) - if ru_target is RUTarget.ACTIVATION: - return ru_fn.metric_fn(mp_cfg, self.graph, self.fw_info, self.fw_impl, self.cuts) - else: - return ru_fn.metric_fn(mp_cfg, self.graph, self.fw_info, self.fw_impl) - - def compute_min_ru(self) -> Dict[RUTarget, np.ndarray]: - """ - Computes a resource utilization vector with the values matching to the minimal mp configuration - (i.e., each node is configured with the quantization candidate that would give the minimal size of the - node's resource utilization). - The method computes the minimal resource utilization vector for each target resource utilization. - - Returns: A dictionary mapping each target resource utilization to its respective minimal - resource utilization values. 
- - """ - min_ru = {} - for ru_target, ru_fn in self.compute_ru_functions.items(): - # ru_fns is a pair of resource utilization computation method and - # resource utilization aggregation method (in this method we only need the first one) - min_ru[ru_target] = self._calc_ru_fn(ru_target, ru_fn, self.min_ru_config) - - return min_ru - def compute_resource_utilization_matrix(self, target: RUTarget) -> np.ndarray: """ Computes and builds a resource utilization matrix, to be used for the mixed-precision search problem formalization. @@ -184,7 +136,8 @@ def compute_resource_utilization_matrix(self, target: RUTarget) -> np.ndarray: # always be 0 for all entries in the results vector. candidate_rus = np.zeros(shape=self.min_ru[target].shape) else: - candidate_rus = self.compute_candidate_relative_ru(c, candidate_idx, target) + candidate_rus = self.compute_node_ru_for_candidate(c, candidate_idx, target) - self.min_ru[target] + ru_matrix.append(np.asarray(candidate_rus)) # We need to transpose the calculated ru matrix to allow later multiplication with @@ -195,40 +148,6 @@ def compute_resource_utilization_matrix(self, target: RUTarget) -> np.ndarray: np_ru_matrix = np.array(ru_matrix) return np.moveaxis(np_ru_matrix, source=0, destination=len(np_ru_matrix.shape) - 1) - def compute_candidate_relative_ru(self, - conf_node_idx: int, - candidate_idx: int, - target: RUTarget) -> np.ndarray: - """ - Computes a resource utilization vector for a given candidates of a given configurable node, - i.e., the matching resource utilization vector which is obtained by computing the given target's - resource utilization function on a minimal configuration in which the given - layer's candidates is changed to the new given one. - The result is normalized by subtracting the target's minimal resource utilization vector. - - Args: - conf_node_idx: The index of a node in a sorted configurable nodes list. - candidate_idx: The index of a node's quantization configuration candidate. - target: The target for which the resource utilization is calculated (a RUTarget value). - - Returns: Normalized node's resource utilization vector - - """ - return self.compute_node_ru_for_candidate(conf_node_idx, candidate_idx, target) - \ - self.get_min_target_resource_utilization(target) - - def get_min_target_resource_utilization(self, target: RUTarget) -> np.ndarray: - """ - Returns the minimal resource utilization vector (pre-calculated on initialization) of a specific target. - - Args: - target: The target for which the resource utilization is calculated (a RUTarget value). - - Returns: Minimal resource utilization vector. - - """ - return self.min_ru[target] - def compute_node_ru_for_candidate(self, conf_node_idx: int, candidate_idx: int, target: RUTarget) -> np.ndarray: """ Computes a resource utilization vector after replacing the given node's configuration candidate in the minimal @@ -243,7 +162,8 @@ def compute_node_ru_for_candidate(self, conf_node_idx: int, candidate_idx: int, """ cfg = self.replace_config_in_index(self.min_ru_config, conf_node_idx, candidate_idx) - return self._calc_ru_fn(target, self.compute_ru_functions[target], cfg) + # TODO compute for all targets at once. Currently the way up to add_set_of_ru_constraints is per target. 
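+        # a single-target set is passed; the result vector for that target is then extracted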
+ return self.ru_helper.compute_utilization({target}, cfg)[target] @staticmethod def replace_config_in_index(mp_cfg: List[int], idx: int, value: int) -> List[int]: @@ -270,21 +190,10 @@ def _non_configurable_nodes_ru(self) -> Dict[RUTarget, np.ndarray]: Returns: A mapping between a RUTarget and its non-configurable nodes' resource utilization vector. """ - - non_conf_ru_dict = {} - for target, ru_fns in self.compute_ru_functions.items(): - # Call for the ru method of the given target - empty quantization configuration list is passed since we - # compute for non-configurable nodes - if target == RUTarget.BOPS: - ru_vector = None - elif target == RUTarget.ACTIVATION: - ru_vector = ru_fns.metric_fn([], self.graph, self.fw_info, self.fw_impl, self.cuts) - else: - ru_vector = ru_fns.metric_fn([], self.graph, self.fw_info, self.fw_impl) - - non_conf_ru_dict[target] = ru_vector - - return non_conf_ru_dict + ru_metrics = self.ru_metrics - {RUTarget.BOPS} + ru = self.ru_helper.compute_utilization(ru_targets=ru_metrics, mp_cfg=None) + ru[RUTarget.BOPS] = None + return ru def compute_resource_utilization_for_config(self, config: List[int]) -> ResourceUtilization: """ @@ -297,29 +206,14 @@ def compute_resource_utilization_for_config(self, config: List[int]) -> Resource with the given config. """ - - ru_dict = {} - for ru_target, ru_fns in self.compute_ru_functions.items(): - # Passing False to ru methods and aggregations to indicates that the computations - # are not for constraints setting - if ru_target == RUTarget.BOPS: - configurable_nodes_ru_vector = ru_fns.metric_fn(config, self.original_graph, self.fw_info, self.fw_impl, False) - elif ru_target == RUTarget.ACTIVATION: - configurable_nodes_ru_vector = ru_fns.metric_fn(config, self.graph, self.fw_info, self.fw_impl, self.cuts) - else: - configurable_nodes_ru_vector = ru_fns.metric_fn(config, self.original_graph, self.fw_info, self.fw_impl) - non_configurable_nodes_ru_vector = self.non_conf_ru_dict.get(ru_target) - if non_configurable_nodes_ru_vector is None or len(non_configurable_nodes_ru_vector) == 0: - ru_ru = self.compute_ru_functions[ru_target].aggregate_fn(configurable_nodes_ru_vector, False) - else: - ru_ru = self.compute_ru_functions[ru_target].aggregate_fn( - np.concatenate([configurable_nodes_ru_vector, non_configurable_nodes_ru_vector]), False) - - ru_dict[ru_target] = ru_ru[0] - - config_ru = ResourceUtilization() - config_ru.set_resource_utilization_by_target(ru_dict) - return config_ru + act_qcs, w_qcs = self.ru_helper.get_configurable_qcs(config) + # TODO on graph or on orig graph??? + ru_calc = ResourceUtilizationCalculator(self.graph, self.fw_impl, self.fw_info) + ru = ru_calc.compute_resource_utilization(target_criterion=TargetInclusionCriterion.AnyQuantized, + bitwidth_mode=BitwidthMode.MpCustom, + act_qcs=act_qcs, + w_qcs=w_qcs) + return ru def finalize_distance_metric(self, layer_to_metrics_mapping: Dict[int, Dict[int, float]]): """ diff --git a/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization.py b/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization.py index 934d24f01..80fa900d5 100644 --- a/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization.py +++ b/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization.py @@ -12,29 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== +from dataclasses import dataclass from enum import Enum -from typing import Dict, Any +from typing import Dict, Any, Set import numpy as np class RUTarget(Enum): """ - Targets for which we define Resource Utilization metrics for mixed-precision search. - For each target that we care to consider in a mixed-precision search, there should be defined a set of - resource utilization computation function, resource utilization aggregation function, - and resource utilization target (within a ResourceUtilization object). - - Whenever adding a resource utilization metric to ResourceUtilization class we should add a matching target to this enum. - - WEIGHTS - Weights memory ResourceUtilization metric. - - ACTIVATION - Activation memory ResourceUtilization metric. - - TOTAL - Total memory ResourceUtilization metric. - - BOPS - Total Bit-Operations ResourceUtilization Metric. + Resource Utilization targets for mixed-precision search. + WEIGHTS - Weights memory. + ACTIVATION - Activation memory. + TOTAL - Total memory. + BOPS - Total Bit-Operations. """ WEIGHTS = 'weights' @@ -43,34 +35,21 @@ class RUTarget(Enum): BOPS = 'bops' +@dataclass class ResourceUtilization: """ Class to represent measurements of performance. - """ - - def __init__(self, - weights_memory: float = np.inf, - activation_memory: float = np.inf, - total_memory: float = np.inf, - bops: float = np.inf): - """ - - Args: - weights_memory: Memory of a model's weights in bytes. Note that this includes only coefficients that should be quantized (for example, the kernel of Conv2D in Keras will be affected by this value, while the bias will not). - activation_memory: Memory of a model's activation in bytes, according to the given activation resource utilization metric. - total_memory: The sum of model's activation and weights memory in bytes, according to the given total resource utilization metric. - bops: The total bit-operations in the model. - """ - self.weights_memory = weights_memory - self.activation_memory = activation_memory - self.total_memory = total_memory - self.bops = bops - def __repr__(self): - return f"Weights_memory: {self.weights_memory}, " \ - f"Activation_memory: {self.activation_memory}, " \ - f"Total_memory: {self.total_memory}, " \ - f"BOPS: {self.bops}" + weights_memory: Memory of a model's weights in bytes. + activation_memory: Memory of a model's activation in bytes. + total_memory: The sum of model's activation and weights memory in bytes. + bops: The total bit-operations in the model. + """ + # TODO the user facade actually computes size, not memory. Do we want to change fields names? + weights_memory: float = np.inf + activation_memory: float = np.inf + total_memory: float = np.inf + bops: float = np.inf def weight_restricted(self): return self.weights_memory < np.inf @@ -93,34 +72,24 @@ def get_resource_utilization_dict(self) -> Dict[RUTarget, float]: RUTarget.TOTAL: self.total_memory, RUTarget.BOPS: self.bops} - def set_resource_utilization_by_target(self, ru_mapping: Dict[RUTarget, float]): + def is_satisfied_by(self, ru: 'ResourceUtilization') -> bool: """ - Setting a ResourceUtilization object values for each ResourceUtilization target in the given dictionary. + Checks whether another ResourceUtilization object satisfies the constraints defined by the current object. Args: - ru_mapping: A mapping from a RUTarget to a matching resource utilization value. 
-
-        """
-        self.weights_memory = ru_mapping.get(RUTarget.WEIGHTS, np.inf)
-        self.activation_memory = ru_mapping.get(RUTarget.ACTIVATION, np.inf)
-        self.total_memory = ru_mapping.get(RUTarget.TOTAL, np.inf)
-        self.bops = ru_mapping.get(RUTarget.BOPS, np.inf)
+            ru: A ResourceUtilization object to check against the current object.

-    def holds_constraints(self, ru: Any) -> bool:
+        Returns:
+            Whether all constraints are satisfied.
         """
-        Checks whether the given ResourceUtilization object holds a set of ResourceUtilization constraints defined by
-        the current ResourceUtilization object.
+        return bool(ru.weights_memory <= self.weights_memory and
+                    ru.activation_memory <= self.activation_memory and
+                    ru.total_memory <= self.total_memory and
+                    ru.bops <= self.bops)

-        Args:
-            ru: A ResourceUtilization object to check if it holds the constraints.
-
-        Returns: True if all the given resource utilization values are not greater than the referenced resource utilization values.
-
-        """
-        if not isinstance(ru, ResourceUtilization):
-            return False
+    def get_restricted_metrics(self) -> Set[RUTarget]:
+        """ Retrieve the set of targets with a finite constraint. """
+        d = self.get_resource_utilization_dict()
+        return {k for k, v in d.items() if v < np.inf}

-        return ru.weights_memory <= self.weights_memory and \
-               ru.activation_memory <= self.activation_memory and \
-               ru.total_memory <= self.total_memory and \
-               ru.bops <= self.bops
+    def is_any_restricted(self) -> bool:
+        """ Checks whether any target is constrained. """
+        return bool(self.get_restricted_metrics())
diff --git a/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_calculator.py b/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_calculator.py
new file mode 100644
index 000000000..d8459976f
--- /dev/null
+++ b/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_calculator.py
@@ -0,0 +1,612 @@
+# Copyright 2024 Sony Semiconductor Israel, Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================== +from collections import defaultdict +from copy import deepcopy +from enum import Enum, auto +from functools import lru_cache +from typing import Dict, Any, NamedTuple, Callable, Optional, Tuple, List, Iterable, Union, Literal + +from model_compression_toolkit.constants import FLOAT_BITWIDTH +from model_compression_toolkit.core import FrameworkInfo +from model_compression_toolkit.core.common import Graph, BaseNode +from model_compression_toolkit.core.common.framework_implementation import FrameworkImplementation +from model_compression_toolkit.core.common.graph.edge import EDGE_SINK_INDEX +from model_compression_toolkit.core.common.graph.memory_graph.compute_graph_max_cut import compute_graph_max_cut +from model_compression_toolkit.core.common.graph.memory_graph.cut import Cut +from model_compression_toolkit.core.common.graph.memory_graph.memory_graph import MemoryGraph +from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import \ + RUTarget, ResourceUtilization +from model_compression_toolkit.core.common.quantization.node_quantization_config import NodeWeightsQuantizationConfig, \ + NodeActivationQuantizationConfig + + +class BitwidthMode(Enum): + """ + Bit-width configuration for resource utilization computation. + + Size: tensors sizes. + Float: float. + MpMax: maximal bit-width mixed-precision configuration. + MpMin: minimal bit-width mixed-precision configuration. + MpCustom: explicitly provided bit-width configuration. + SpDefault: single-precision configuration (for non-configurable quantization). + """ + Size = auto() + Float = auto() + MpMax = auto() + MpMin = auto() + MpCustom = auto() + SpDefault = auto() + + +class TargetInclusionCriterion(Enum): + """ + Target nodes / parameters to include for resource utilization computation. + + QConfigurable: configurable for Mixed Precision targets (multiple quantization candidates). + QNonConfigurable: non-configurable targets (single quantization candidate). + AnyQuantized: any quantized targets (configurable and non-configurable). + Any: all targets (quantized + float). + """ + QConfigurable = auto() + QNonConfigurable = auto() + AnyQuantized = auto() + Any = auto() + + +class Utilization(NamedTuple): + """ + Utility container for a single resource utilization result. + Supports sum, max, min over an iterable of Utilization objects. + + Args: + size: parameters or activation tensor(s) size. + bytes: memory utilization. + """ + size: int + bytes: Optional[float] + + def by_bit_mode(self, bitwidth_mode: BitwidthMode) -> Union[int, float]: + """ Retrieve value corresponding to the bit-width mode. """ + if bitwidth_mode == BitwidthMode.Size: + return self.size + return self.bytes + + @staticmethod + def zero_utilization(bitwidth_mode: BitwidthMode) -> 'Utilization': + """ Construct zero utilization object. """ + return Utilization(0, bytes=None if bitwidth_mode == BitwidthMode.Size else 0) + + def __add__(self, other: 'Utilization') -> 'Utilization': + if [self.bytes, other.bytes].count(None) == 1: + raise ValueError('bytes field must be set either by both or by none of the objects.') + bytes_ = None if self.bytes is None else (self.bytes + other.bytes) + return Utilization(self.size + other.size, bytes_) + + def __radd__(self, other: Union['Utilization', Literal[0]]): + # Needed for sum (with default start_value=0). 
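+        # Illustration (hypothetical values): sum([Utilization(10, 5.), Utilization(20, 10.)])
+        # starts from 0, hits __radd__ once, and yields Utilization(size=30, bytes=15.0).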
+ if other == 0: + return self + return self + other + + def __gt__(self, other: 'Utilization'): + # Needed for max / min. Compare by bytes, if not defined then by size. + if [self.bytes, other.bytes].count(None) == 1: + raise ValueError('bytes field must be set either by both or by none of the objects.') + if self.bytes is not None: + return self.bytes > other.bytes + return self.size > other.size + + +class AggregationMethod(Enum): + SUM = sum + MAX = lambda seq: max(seq) if (seq := list(seq)) else 0 # walrus op for empty generator + + def __call__(self, *args, **kwarg): + return self.value(*args, **kwarg) + + +# default aggregation methods +# TODO This is used by mp to use the same aggregation. Except that for total it must do its own thing (add indicators +# to weights before summation). So maybe just get rid of it altogether? If it ever becomes configurable we can add it. +ru_target_aggregation_fn = { + RUTarget.WEIGHTS: AggregationMethod.SUM, + RUTarget.ACTIVATION: AggregationMethod.MAX, + RUTarget.TOTAL: AggregationMethod.SUM, + RUTarget.BOPS: AggregationMethod.SUM +} + + +_bitwidth_mode_fn = { + BitwidthMode.MpMax: max, + BitwidthMode.MpMin: min +} + + +class ResourceUtilizationCalculator: + """ Resource utilization calculator. """ + + def __init__(self, graph: Graph, fw_impl: FrameworkImplementation, fw_info: FrameworkInfo): + self.graph = graph + self.fw_impl = fw_impl + self.fw_info = fw_info + + # Currently we go over the full graph even if utilization won't be requested for all nodes. + # We could fill the cache on the fly only for requested nodes, but it's probably negligible. + self._act_tensors_size = {} + self._params_cnt = {} + for n in graph.nodes: + self._act_tensors_size[n] = n.get_total_output_params() + self._params_cnt[n] = {k: v.size for k, v in n.weights.items()} + self._cuts = None + + def compute_resource_utilization(self, + target_criterion: TargetInclusionCriterion, + bitwidth_mode: BitwidthMode, + act_qcs: Optional[Dict[BaseNode, NodeActivationQuantizationConfig]] = None, + w_qcs: Optional[Dict[BaseNode, NodeWeightsQuantizationConfig]] = None, + metrics: Iterable[RUTarget] = None) -> ResourceUtilization: + """ + Compute total resource utilization. + + Args: + target_criterion: criterion to include targets for computation (applies to weights, activation). + bitwidth_mode: bit-width mode for computation. + act_qcs: activation quantization candidates for custom bit-width mode. + w_qcs: weights quantization candidates for custom bit-width mode. + metrics: metrics to include for computation. If None, all metrics are calculated. + + Returns: + Resource utilization object. 
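+
+        Example (an illustrative sketch, assuming graph, fw_impl and fw_info were already prepared):
+            ru_calc = ResourceUtilizationCalculator(graph, fw_impl, fw_info)
+            ru = ru_calc.compute_resource_utilization(TargetInclusionCriterion.AnyQuantized, BitwidthMode.MpMax)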
+ """ + metrics = metrics or set(RUTarget) + + w_total, a_total = None, None + if {RUTarget.WEIGHTS, RUTarget.TOTAL}.intersection(set(metrics)): + w_total, *_ = self.compute_weights_utilization(target_criterion, bitwidth_mode, w_qcs) + elif w_qcs is not None: + raise ValueError('Weight configuration passed but no relevant metric requested.') + + if {RUTarget.ACTIVATION, RUTarget.TOTAL}.intersection(set(metrics)): + a_total, *_ = self.compute_activations_utilization(target_criterion, bitwidth_mode, act_qcs) + elif act_qcs is not None: + raise ValueError('Activation configuration passed but no relevant metric requested.') + + ru = ResourceUtilization() + if RUTarget.WEIGHTS in metrics: + ru.weights_memory = w_total + if RUTarget.ACTIVATION in metrics: + ru.activation_memory = a_total + if RUTarget.TOTAL in metrics: + ru.total_memory = w_total + a_total + if RUTarget.BOPS in metrics: + ru.bops, _ = self.compute_bops(target_criterion=target_criterion, + bitwidth_mode=bitwidth_mode, act_qcs=act_qcs, w_qcs=w_qcs) + + assert ru.get_restricted_metrics() == set(metrics), 'Mismatch between the number of requested and computed metrics' + return ru + + def compute_weights_utilization(self, + target_criterion: TargetInclusionCriterion, + bitwidth_mode: BitwidthMode, + w_qcs: Optional[Dict[BaseNode, NodeWeightsQuantizationConfig]] = None) \ + -> Tuple[float, Dict[BaseNode, Utilization], Dict[BaseNode, Dict[str, Utilization]]]: + """ + Compute graph's weights resource utilization. + + Args: + target_criterion: criterion to include targets for computation. + bitwidth_mode: bit-width mode for computation. + w_qcs: weights quantization config per node for custom bit mode. Must contain all configurable weights. + + Returns: + - Total weights utilization. + - Per node total utilization. Dict keys are nodes in a topological order. + - Detailed per node per weight utilization. Dict keys are nodes in a topological order. + """ + nodes = self._get_target_weight_nodes(target_criterion, include_reused=False) + + util_per_node: Dict[BaseNode, Utilization] = {} + util_per_node_per_weight = {} + + for n in self._topo_sort(nodes): + w_qc = w_qcs.get(n) if w_qcs else None + node_weights_util, per_weight_util = self.compute_node_weights_utilization(n, target_criterion, + bitwidth_mode, w_qc) + util_per_node[n] = node_weights_util + util_per_node_per_weight[n] = per_weight_util + + aggregate_fn = ru_target_aggregation_fn[RUTarget.WEIGHTS] + total_util = aggregate_fn(u.by_bit_mode(bitwidth_mode) for u in util_per_node.values()) + return total_util, util_per_node, util_per_node_per_weight + + def compute_node_weights_utilization(self, + n: BaseNode, + target_criterion: TargetInclusionCriterion, + bitwidth_mode: BitwidthMode, + qc: NodeWeightsQuantizationConfig)\ + -> Tuple[Utilization, Dict[str, Utilization]]: + """ + Compute resource utilization for weights of a node. + + Args: + n: node. + target_criterion: criterion to include weights for computation. + bitwidth_mode: bit-width mode for the computation. + qc: weight quantization config for custom bit mode computation. Must contain all configurable weights. + + Returns: + - Total utilization + - Detailed per weight utilization. 
+ """ + weight_attrs = self._get_target_weight_attrs(n, target_criterion) + if not weight_attrs: + return Utilization.zero_utilization(bitwidth_mode, ), {} + + attr_util = {} + for attr in weight_attrs: + size = self._params_cnt[n][attr] + bytes_ = None + if bitwidth_mode != BitwidthMode.Size: + nbits = self._get_weight_nbits(n, attr, bitwidth_mode, qc) + bytes_ = size * nbits / 8 + attr_util[attr] = Utilization(size, bytes_) + + total_weights = sum(attr_util.values()) + return total_weights, attr_util + + def compute_activations_utilization(self, + target_criterion: TargetInclusionCriterion, + bitwidth_mode: BitwidthMode, + act_qcs: Optional[Dict[BaseNode, NodeActivationQuantizationConfig]] = None): + return self.compute_cut_activation_utilization(target_criterion, bitwidth_mode, act_qcs) + + def compute_cut_activation_utilization(self, + target_criterion: TargetInclusionCriterion, + bitwidth_mode: BitwidthMode, + act_qcs: Optional[Dict[BaseNode, NodeActivationQuantizationConfig]]) \ + -> Tuple[float, Dict[Cut, Utilization], Dict[Cut, Dict[BaseNode, Utilization]]]: + """ + Calculate graph activation cuts utilization. + + Args: + target_criterion: criterion to include weights for computation. + bitwidth_mode: bit-width mode for the computation. + act_qcs: custom configuration for BitwidthMode.MpCustom. Must contain all configurable nodes. + + Returns: + - Total utilization + - Total utilization per cut. + - Detailed utilization per cut per node. + """ + if target_criterion != TargetInclusionCriterion.AnyQuantized: + raise NotImplementedError('Computing MaxCut activation utilization is currently only supported for quantized targets.') + + if self._cuts is None: + memory_graph = MemoryGraph(deepcopy(self.graph)) + _, _, cuts = compute_graph_max_cut(memory_graph) + if cuts is None: + raise RuntimeError("Failed to calculate activation memory cuts for graph.") # pragma: no cover + cuts = [cut for cut in cuts if cut.mem_elements.elements] + self._cuts = cuts + + util_per_cut: Dict[Cut, Utilization] = {} # type: ignore + util_per_cut_per_node = defaultdict(dict) + for cut in self._cuts: + target_nodes = self._get_cut_target_nodes(cut, target_criterion) + if not target_nodes: + continue + for n in target_nodes: + qc = act_qcs.get(n) if act_qcs else None + util_per_cut_per_node[cut][n] = self.compute_node_activation_tensor_utilization(n, target_criterion, + bitwidth_mode, qc) + util_per_cut[cut] = sum(util_per_cut_per_node[cut].values()) # type: ignore + + aggregate_fn = ru_target_aggregation_fn[RUTarget.ACTIVATION] + total_util = aggregate_fn(u.by_bit_mode(bitwidth_mode) for u in util_per_cut.values()) + return total_util, util_per_cut, util_per_cut_per_node + + def compute_activation_tensors_utilization(self, + target_criterion: TargetInclusionCriterion, + bitwidth_mode: BitwidthMode, + act_qcs: Optional[Dict[BaseNode, NodeActivationQuantizationConfig]] = None, + include_reused=False) \ + -> Tuple[float, Dict[BaseNode, Utilization]]: + """ + Compute resource utilization for graph's activations tensors. + + Args: + target_criterion: criterion to include weights for computation. + bitwidth_mode: bit-width mode for the computation. + act_qcs: custom configuration for BitwidthMode.MpCustom. Must contain all configurable nodes. + include_reused: whether to consider reused nodes. + Returns: + Total activation utilization and a dict containing utilization per node. 
+ + """ + nodes = self._get_target_activation_nodes(target_criterion, include_reused=include_reused) + util_per_node: Dict[BaseNode, Utilization] = {} + for n in self._topo_sort(nodes): + qc = act_qcs.get(n) if act_qcs else None + util = self.compute_node_activation_tensor_utilization(n, None, bitwidth_mode, qc) + util_per_node[n] = util + + aggregate_fn = ru_target_aggregation_fn[RUTarget.ACTIVATION] + total_util = aggregate_fn(u.by_bit_mode(bitwidth_mode) for u in util_per_node.values()) + return total_util, util_per_node + + def compute_node_activation_tensor_utilization(self, + n: BaseNode, + target_criterion: Optional[TargetInclusionCriterion], + bitwidth_mode: BitwidthMode, + qc: Optional[NodeActivationQuantizationConfig]) -> Utilization: + """ + Compute activation resource utilization for a node. + + Args: + n: node. + target_criterion: criterion to include nodes for computation. If None, will skip the check. + bitwidth_mode: bit-width mode for the computation. + qc: activation quantization config for custom bit mode. Must be passed for a configurable activation. + + Returns: + Node's activation utilization. + """ + if target_criterion: + nodes = self._get_target_activation_nodes(target_criterion=target_criterion, include_reused=True, nodes=[n]) + if not nodes: + return Utilization.zero_utilization(bitwidth_mode) + + size = self._act_tensors_size[n] + bytes_ = None + if bitwidth_mode != BitwidthMode.Size: + nbits = self._get_activation_nbits(n, bitwidth_mode, qc) + bytes_ = size * nbits / 8 + return Utilization(size, bytes_) + + def compute_bops(self, + target_criterion: TargetInclusionCriterion, + bitwidth_mode: BitwidthMode, + act_qcs: Optional[Dict[BaseNode, NodeActivationQuantizationConfig]] = None, + w_qcs: Optional[Dict[BaseNode, NodeWeightsQuantizationConfig]] = None) \ + -> Tuple[int, Dict[BaseNode, int]]: + """ + Compute bit operations based on nodes with kernel. + + Args: + target_criterion: criterion to include nodes for computation. + bitwidth_mode: bit-width mode for computation. + act_qcs: activation quantization candidates for custom bit-width mode. + w_qcs: weights quantization candidates for custom bit-width mode. + + Returns: + - Total BOPS count. + - Detailed BOPS count per node. + """ + # currently we compute bops for all nodes with quantized weights, regardless of whether the input + # activation is quantized. + if target_criterion != TargetInclusionCriterion.AnyQuantized: + raise NotImplementedError('BOPS computation is currently only supported for quantized targets.') + + nodes = [n for n in self.graph.nodes if n.has_kernel_weight_to_quantize(self.fw_info)] + nodes_bops = {} + for n in nodes: + w_qc = w_qcs.get(n) if w_qcs else None + nodes_bops[n] = self.compute_node_bops(n, bitwidth_mode, act_qcs=act_qcs, w_qc=w_qc) + + aggregate_fn = ru_target_aggregation_fn[RUTarget.BOPS] + return aggregate_fn(nodes_bops.values()), nodes_bops + + def compute_node_bops(self, + n: BaseNode, + bitwidth_mode: BitwidthMode, + act_qcs: Optional[Dict[BaseNode, NodeActivationQuantizationConfig]] = None, + w_qc: Optional[NodeWeightsQuantizationConfig] = None) -> int: + """ + Compute Bit Operations of a node. + + Args: + n: node. + bitwidth_mode: bit-width mode for the computation. + act_qcs: nodes activation quantization configuration for custom bit mode. Must contain all configurable nodes. + w_qc: weights quantization config for the node for custom bit mode. Must be passed for configurable weights. + + Returns: + BOPS count. 
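+
+        Example: a node performing 1e6 MAC operations with an 8-bit input activation and an 8-bit kernel
+        yields 8 * 8 * 1e6 = 64e6 BOPS.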
+ """ + node_mac = self.fw_impl.get_node_mac_operations(n, self.fw_info) + if node_mac == 0 or bitwidth_mode == BitwidthMode.Size: + return node_mac + + incoming_edges = self.graph.incoming_edges(n, sort_by_attr=EDGE_SINK_INDEX) + # TODO temporary adding this for const_representation test in torch which has Linear with const input + if not incoming_edges: + return 0 + assert len(incoming_edges) == 1, \ + f'Unexpected number of inputs {len(incoming_edges)} for BOPS calculation. Expected 1.' + input_act_node = incoming_edges[0].source_node + act_qc = act_qcs.get(input_act_node) if act_qcs else None + a_nbits = self._get_activation_nbits(input_act_node, bitwidth_mode, act_qc) + + kernel_attrs = self.fw_info.get_kernel_op_attributes(n.type) + if len(kernel_attrs) > 1: + raise NotImplementedError('Multiple kernel attributes are not supported for BOPS computation.') + kernel_attr = kernel_attrs[0] + w_nbits = self._get_weight_nbits(n, kernel_attr, bitwidth_mode, w_qc) + + node_bops = a_nbits * w_nbits * node_mac + return node_bops + + @lru_cache + def _get_cut_target_nodes(self, cut: Cut, target_criterion: TargetInclusionCriterion) -> List[BaseNode]: + """ + Retrieve target nodes from a cut filtered by a criterion. + + Args: + cut: a graph cut. + target_criterion: criterion to include nodes for computation. + + Returns: + A list of target nodes from a cut. + """ + cut_nodes = [self.graph.find_node_by_name(e.node_name)[0] for e in cut.mem_elements.elements] + return self._get_target_activation_nodes(target_criterion, include_reused=True, nodes=cut_nodes) + + def _get_target_weight_nodes(self, + target_criterion: TargetInclusionCriterion, + include_reused: bool) -> List[BaseNode]: + """ + Collect nodes to include in weights utilization computation. + + Args: + target_criterion: criterion to include weights for computation. + include_reused: whether to include reused nodes. + + Returns: + Target nodes. + """ + if target_criterion == TargetInclusionCriterion.QConfigurable: + nodes = self.graph.get_weights_configurable_nodes(self.fw_info, include_reused_nodes=include_reused) + elif target_criterion == TargetInclusionCriterion.AnyQuantized: + nodes = [n for n in self.graph if n.has_any_weight_attr_to_quantize()] + elif target_criterion == TargetInclusionCriterion.QNonConfigurable: + # TODO this is wrong. Need to look at specific weights and not the whole node + quantized = [n for n in self.graph if n.has_any_weight_attr_to_quantize()] + configurable = self.graph.get_weights_configurable_nodes(self.fw_info, include_reused_nodes=include_reused) + nodes = [n for n in quantized if n not in configurable] + elif target_criterion == TargetInclusionCriterion.Any: + nodes = self.graph.nodes + else: + raise ValueError(f'Unknown {target_criterion}.') + + if not include_reused: + nodes = [n for n in nodes if not n.reuse] + return nodes + + def _get_target_weight_attrs(self, n: BaseNode, target_criterion: TargetInclusionCriterion) -> List[str]: + """ + Filter node's weight attributes per criterion. + + Args: + n: node. + target_criterion: selection criterion. + + Returns: + A list of selected weight attributes names. 
+ """ + weight_attrs = n.get_node_weights_attributes() + if target_criterion == TargetInclusionCriterion.QConfigurable: + weight_attrs = [attr for attr in weight_attrs if n.is_configurable_weight(attr)] + elif target_criterion == TargetInclusionCriterion.AnyQuantized: + weight_attrs = [attr for attr in weight_attrs if n.is_weights_quantization_enabled(attr)] + elif target_criterion == TargetInclusionCriterion.QNonConfigurable: + quantized = [attr for attr in weight_attrs if n.is_weights_quantization_enabled(attr)] + configurable = [attr for attr in weight_attrs if n.is_configurable_weight(attr)] + weight_attrs = [attr for attr in quantized if attr not in configurable] + elif target_criterion != TargetInclusionCriterion.Any: + raise ValueError(f'Unknown {target_criterion}') + return weight_attrs + + def _topo_sort(self, nodes): + """ Sort nodes in a topological order (based on graph's nodes). """ + graph_topo_nodes = self.graph.get_topo_sorted_nodes() + topo_nodes = [n for n in graph_topo_nodes if n in nodes] + if len(topo_nodes) != len(nodes): + missing_nodes = [n for n in nodes if n not in topo_nodes] + raise ValueError(f'Could not topo-sort, nodes {missing_nodes} do not match the graph nodes.') + return topo_nodes + + def _get_target_activation_nodes(self, + target_criterion: TargetInclusionCriterion, + include_reused: bool, + nodes: Optional[List[BaseNode]] = None) -> List[BaseNode]: + """ + Collect nodes to include in activation utilization computation. + + Args: + target_criterion: inclusion for computation criteria. + include_reused: whether to include reused nodes. + nodes: nodes to filter target nodes from. By default, uses the graph nodes. + + Returns: + Target nodes. + """ + nodes = nodes or self.graph.nodes + if target_criterion == TargetInclusionCriterion.QConfigurable: + nodes = [n for n in nodes if n.has_configurable_activation()] + elif target_criterion == TargetInclusionCriterion.AnyQuantized: + nodes = [n for n in nodes if n.is_activation_quantization_enabled()] + elif target_criterion == TargetInclusionCriterion.QNonConfigurable: + nodes = [n for n in nodes if n.is_activation_quantization_enabled() and not n.has_configurable_activation()] + elif target_criterion != TargetInclusionCriterion.Any: + raise ValueError(f'Unknown {target_criterion}.') + if not include_reused: + nodes = [n for n in nodes if not n.reuse] + return nodes + + @staticmethod + def _get_activation_nbits(n: BaseNode, mode: BitwidthMode, qc: Optional[NodeActivationQuantizationConfig]) -> int: + """ + Get activation bit-width for a node with accordance to bit-width mode. + + Args: + n: node. + mode: bit-width mode for computation. + qc: quantization candidate for BitwidthMode.MpCustom mode. Can be skipped if the node has exactly one candidate. + + Returns: + Activation bit-width. 
+ """ + if mode == BitwidthMode.Float: + return FLOAT_BITWIDTH + + if mode in _bitwidth_mode_fn: + candidates_nbits = [c.activation_quantization_cfg.activation_n_bits for c in n.candidates_quantization_cfg] + return _bitwidth_mode_fn[mode](candidates_nbits) + + if mode == BitwidthMode.MpCustom and qc: + return qc.activation_n_bits + + if mode in [BitwidthMode.MpCustom, BitwidthMode.SpDefault]: + qcs = n.get_unique_activation_candidates() + if len(qcs) != 1: + raise ValueError(f'Could not retrieve the default activation quantization candidate for node {n.name} ' + f'as it has {len(qcs)}!=1 unique candidates .') + return qcs[0].activation_quantization_cfg.activation_n_bits + + raise ValueError(f'Unknown mode {mode}') + + @staticmethod + def _get_weight_nbits(n, attr: str, bitwidth_mode: BitwidthMode, + w_qc: Optional[NodeWeightsQuantizationConfig]) -> int: + if bitwidth_mode == BitwidthMode.Float or not n.is_weights_quantization_enabled(attr): + return FLOAT_BITWIDTH + + if bitwidth_mode == BitwidthMode.MpCustom and w_qc and w_qc.has_attribute_config(attr): + return w_qc.get_attr_config(attr).weights_n_bits + + node_qcs = n.get_unique_weights_candidates(attr) + w_qcs = [qc.weights_quantization_cfg.get_attr_config(attr) for qc in node_qcs] + if bitwidth_mode in _bitwidth_mode_fn: + return _bitwidth_mode_fn[bitwidth_mode]([qc.weights_n_bits for qc in w_qcs]) + + if bitwidth_mode in [BitwidthMode.MpCustom, BitwidthMode.SpDefault]: + # if configuration was not passed and the weight has only one candidate, use it + if len(w_qcs) != 1: + raise ValueError(f'Could not retrieve the quantization candidate for attr {attr} of node {n.name} ' + f'as it {len(w_qcs)}!=1 unique candidates.') + return w_qcs[0].weights_n_bits + + raise ValueError(f'Unknown mode {bitwidth_mode.name}') diff --git a/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_data.py b/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_data.py index a647a2cc5..52d3b8683 100644 --- a/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_data.py +++ b/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/resource_utilization_data.py @@ -13,21 +13,17 @@ # limitations under the License. 
# ============================================================================== import copy -from collections import defaultdict +from typing import Callable, Any -import numpy as np -from typing import Callable, Any, Dict, Tuple - -from model_compression_toolkit.logger import Logger -from model_compression_toolkit.constants import FLOAT_BITWIDTH, BITS_TO_BYTES from model_compression_toolkit.core import FrameworkInfo, ResourceUtilization, CoreConfig, QuantizationErrorMethod from model_compression_toolkit.core.common import Graph from model_compression_toolkit.core.common.framework_implementation import FrameworkImplementation -from model_compression_toolkit.core.common.graph.edge import EDGE_SINK_INDEX +from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import \ + RUTarget +from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization_calculator import \ + ResourceUtilizationCalculator, BitwidthMode, TargetInclusionCriterion from model_compression_toolkit.core.graph_prep_runner import graph_preparation_runner from model_compression_toolkit.target_platform_capabilities.target_platform import TargetPlatformCapabilities -from model_compression_toolkit.target_platform_capabilities.schema.mct_current_schema import QuantizationConfigOptions -from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_methods import calc_graph_cuts def compute_resource_utilization_data(in_model: Any, @@ -37,7 +33,7 @@ def compute_resource_utilization_data(in_model: Any, fw_info: FrameworkInfo, fw_impl: FrameworkImplementation, transformed_graph: Graph = None, - mixed_precision_enable: bool = True) -> ResourceUtilization: + mixed_precision_enabled: bool = True) -> ResourceUtilization: """ Compute Resource Utilization information that can be relevant for defining target ResourceUtilization for mixed precision search. Calculates maximal activation tensor size, the sum of the model's weight parameters and the total memory combining both weights @@ -53,7 +49,7 @@ def compute_resource_utilization_data(in_model: Any, fw_impl: FrameworkImplementation object with a specific framework methods implementation. transformed_graph: An internal graph representation of the input model. Defaults to None. If no graph is provided, a graph will be constructed using the specified model. - mixed_precision_enable: Indicates if mixed precision is enabled, defaults to True. + mixed_precision_enabled: Indicates if mixed precision is enabled, defaults to True. If disabled, computes resource utilization using base quantization configurations across all layers. 
@@ -72,174 +68,15 @@ def compute_resource_utilization_data(in_model: Any, fw_impl, tpc, bit_width_config=core_config.bit_width_config, - mixed_precision_enable=mixed_precision_enable) - - # Compute parameters sum - weights_memory_bytes, weights_params = compute_nodes_weights_params(graph=transformed_graph, fw_info=fw_info) - total_weights_params = 0 if len(weights_params) == 0 else sum(weights_params) - - # Compute max activation tensor - activation_output_sizes_bytes, activation_output_sizes = compute_activation_output_maxcut_sizes(graph=transformed_graph) - max_activation_tensor_size = 0 if len(activation_output_sizes) == 0 else max(activation_output_sizes) - - # Compute total memory utilization - parameters sum + max activation tensor - total_size = total_weights_params + max_activation_tensor_size - - # Compute BOPS utilization - total count of bit-operations for all configurable layers with kernel - bops_count = compute_total_bops(graph=transformed_graph, fw_info=fw_info, fw_impl=fw_impl) - bops_count = np.inf if len(bops_count) == 0 else sum(bops_count) - - return ResourceUtilization(weights_memory=total_weights_params, - activation_memory=max_activation_tensor_size, - total_memory=total_size, - bops=bops_count) - - -def compute_nodes_weights_params(graph: Graph, fw_info: FrameworkInfo) -> Tuple[np.ndarray, np.ndarray]: - """ - Calculates the memory usage in bytes and the number of weight parameters for each node within a graph. - Memory calculations are based on the maximum bit-width used for quantization per node. - - Args: - graph: A finalized Graph object, representing the model structure. - fw_info: FrameworkInfo object containing details about the specific framework's - quantization attributes for different layers' weights. - - Returns: - A tuple containing two arrays: - - The first array represents the memory in bytes for each node's weights when quantized at the maximal bit-width. - - The second array represents the total number of weight parameters for each node. - """ - weights_params = [] - weights_memory_bytes = [] - for n in graph.nodes: - # TODO: when enabling multiple attribute quantization by default (currently, - # only kernel quantization is enabled) we should include other attributes memory in the sum of all - # weights memory. - # When implementing this, we should just go over all attributes in the node instead of counting only kernels. - kernel_attr = fw_info.get_kernel_op_attributes(n.type)[0] - if kernel_attr is not None and not n.reuse: - kernel_candidates = n.get_all_weights_attr_candidates(kernel_attr) - - if len(kernel_candidates) > 0 and any([c.enable_weights_quantization for c in kernel_candidates]): - max_weight_bits = max([kc.weights_n_bits for kc in kernel_candidates]) - node_num_weights_params = 0 - for attr in fw_info.get_kernel_op_attributes(n.type): - if attr is not None: - node_num_weights_params += n.get_weights_by_keys(attr).flatten().shape[0] - - weights_params.append(node_num_weights_params) - - # multiply num params by num bits and divide by BITS_TO_BYTES to convert from bits to bytes - weights_memory_bytes.append(node_num_weights_params * max_weight_bits / BITS_TO_BYTES) - - return np.array(weights_memory_bytes), np.array(weights_params) - - -def compute_activation_output_maxcut_sizes(graph: Graph) -> Tuple[np.ndarray, np.ndarray]: - """ - Computes an array of the respective output tensor maxcut size and an array of the output tensor - cut size in bytes for each cut. 
- - Args: - graph: A finalized Graph object, representing the model structure. - - Returns: - A tuple containing two arrays: - - The first is an array of the size of each activation max-cut size in bytes, calculated - using the maximal bit-width for quantization. - - The second array an array of the size of each activation max-cut activation size in number of parameters. - - """ - cuts = calc_graph_cuts(graph) - - # map nodes to cuts. - node_to_cat_mapping = defaultdict(list) - for i, cut in enumerate(cuts): - mem_element_names = [m.node_name for m in cut.mem_elements.elements] - for m_name in mem_element_names: - if len(graph.find_node_by_name(m_name)) > 0: - node_to_cat_mapping[m_name].append(i) - else: - Logger.critical(f"Missing node: {m_name}") # pragma: no cover + mixed_precision_enable=mixed_precision_enabled, + running_gptq=False) - activation_outputs = np.zeros(len(cuts)) - activation_outputs_bytes = np.zeros(len(cuts)) - for n in graph.nodes: - # Go over all nodes that have activation quantization enabled. - if n.has_activation_quantization_enabled_candidate(): - # Fetch maximum bits required for activations quantization. - max_activation_bits = max([qc.activation_quantization_cfg.activation_n_bits for qc in n.candidates_quantization_cfg]) - node_output_size = n.get_total_output_params() - for cut_index in node_to_cat_mapping[n.name]: - activation_outputs[cut_index] += node_output_size - # Calculate activation size in bytes and append to list - activation_outputs_bytes[cut_index] += node_output_size * max_activation_bits / BITS_TO_BYTES - - return activation_outputs_bytes, activation_outputs - - -# TODO maxcut: add test for this function and remove no cover -def compute_activation_output_sizes(graph: Graph) -> Tuple[np.ndarray, np.ndarray]: # pragma: no cover - """ - Computes an array of the respective output tensor size and an array of the output tensor size in bytes for - each node. - - Args: - graph: A finalized Graph object, representing the model structure. - - Returns: - A tuple containing two arrays: - - The first array represents the size of each node's activation output tensor size in bytes, - calculated using the maximal bit-width for quantization. - - The second array represents the size of each node's activation output tensor size. - - """ - activation_outputs = [] - activation_outputs_bytes = [] - for n in graph.nodes: - # Go over all nodes that have configurable activation. - if n.has_activation_quantization_enabled_candidate(): - # Fetch maximum bits required for quantizing activations - max_activation_bits = max([qc.activation_quantization_cfg.activation_n_bits for qc in n.candidates_quantization_cfg]) - node_output_size = n.get_total_output_params() - activation_outputs.append(node_output_size) - # Calculate activation size in bytes and append to list - activation_outputs_bytes.append(node_output_size * max_activation_bits / BITS_TO_BYTES) - - return np.array(activation_outputs_bytes), np.array(activation_outputs) - - -def compute_total_bops(graph: Graph, fw_info: FrameworkInfo, fw_impl: FrameworkImplementation) -> np.ndarray: - """ - Computes a vector with the respective Bit-operations count for each configurable node that includes MAC operations. - The computation assumes that the graph is a representation of a float model, thus, BOPs computation uses 32-bit. - - Args: - graph: Finalized Graph object. - fw_info: FrameworkInfo object about the specific framework - (e.g., attributes of different layers' weights to quantize). 
- fw_impl: FrameworkImplementation object with a specific framework methods implementation. - - Returns: A vector of nodes' Bit-operations count. - - """ - - bops = [] - - # Go over all configurable nodes that have kernels. - for n in graph.get_topo_sorted_nodes(): - if n.has_kernel_weight_to_quantize(fw_info): - # If node doesn't have weights then its MAC count is 0, and we shouldn't consider it in the BOPS count. - incoming_edges = graph.incoming_edges(n, sort_by_attr=EDGE_SINK_INDEX) - assert len(incoming_edges) == 1, f"Can't compute BOPS metric for node {n.name} with multiple inputs." - - node_mac = fw_impl.get_node_mac_operations(n, fw_info) - - node_bops = (FLOAT_BITWIDTH ** 2) * node_mac - bops.append(node_bops) - - return np.array(bops) + ru_calculator = ResourceUtilizationCalculator(transformed_graph, fw_impl, fw_info) + ru = ru_calculator.compute_resource_utilization(TargetInclusionCriterion.AnyQuantized, + BitwidthMode.Size, + metrics=set(RUTarget) - {RUTarget.BOPS}) + ru.bops, _ = ru_calculator.compute_bops(TargetInclusionCriterion.AnyQuantized, BitwidthMode.Float) + return ru def requires_mixed_precision(in_model: Any, @@ -268,7 +105,6 @@ def requires_mixed_precision(in_model: Any, Returns: A boolean indicating if mixed precision is needed. """ - is_mixed_precision = False core_config = _create_core_config_for_ru(core_config) transformed_graph = graph_preparation_runner(in_model, @@ -278,25 +114,14 @@ def requires_mixed_precision(in_model: Any, fw_impl, tpc, bit_width_config=core_config.bit_width_config, - mixed_precision_enable=False) - # Compute max weights memory in bytes - weights_memory_by_layer_bytes, _ = compute_nodes_weights_params(transformed_graph, fw_info) - total_weights_memory_bytes = 0 if len(weights_memory_by_layer_bytes) == 0 else sum(weights_memory_by_layer_bytes) - - # Compute max activation tensor in bytes - activation_memory_estimation_bytes, _ = compute_activation_output_maxcut_sizes(transformed_graph) - max_activation_memory_estimation_bytes = 0 if len(activation_memory_estimation_bytes) == 0 \ - else max(activation_memory_estimation_bytes) - - # Compute BOPS utilization - total count of bit-operations for all configurable layers with kernel - bops_count = compute_total_bops(graph=transformed_graph, fw_info=fw_info, fw_impl=fw_impl) - bops_count = np.inf if len(bops_count) == 0 else sum(bops_count) - - is_mixed_precision |= target_resource_utilization.weights_memory < total_weights_memory_bytes - is_mixed_precision |= target_resource_utilization.activation_memory < max_activation_memory_estimation_bytes - is_mixed_precision |= target_resource_utilization.total_memory < total_weights_memory_bytes + max_activation_memory_estimation_bytes - is_mixed_precision |= target_resource_utilization.bops < bops_count - return is_mixed_precision + mixed_precision_enable=False, + running_gptq=False) + + ru_calculator = ResourceUtilizationCalculator(transformed_graph, fw_impl, fw_info) + max_ru = ru_calculator.compute_resource_utilization(TargetInclusionCriterion.AnyQuantized, + BitwidthMode.MpMax, + metrics=target_resource_utilization.get_restricted_metrics()) + return not target_resource_utilization.is_satisfied_by(max_ru) def _create_core_config_for_ru(core_config: CoreConfig) -> CoreConfig: diff --git a/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_aggregation_methods.py b/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_aggregation_methods.py deleted file mode 100644 index 123ae4404..000000000 
--- a/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_aggregation_methods.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright 2022 Sony Semiconductor Israel, Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -import copy -from enum import Enum -from functools import partial -from typing import List, Any -import numpy as np - -from pulp import lpSum - - -def sum_ru_values(ru_vector: np.ndarray, set_constraints: bool = True) -> List[Any]: - """ - Aggregates resource utilization vector to a single resource utilization measure by summing all values. - - Args: - ru_vector: A vector with nodes' resource utilization values. - set_constraints: A flag for utilizing the method for resource utilization computation of a - given config not for LP formalization purposes. - - Returns: A list with an lpSum object for lp problem definition with the vector's sum. - - """ - if set_constraints: - return [lpSum(ru_vector)] - return [0] if len(ru_vector) == 0 else [sum(ru_vector)] - - - -def max_ru_values(ru_vector: np.ndarray, set_constraints: bool = True) -> List[float]: - """ - Aggregates resource utilization vector to allow max constraint in the linear programming problem formalization. - In order to do so, we need to define a separate constraint on each value in the resource utilization vector, - to be bounded by the target resource utilization. - - Args: - ru_vector: A vector with nodes' resource utilization values. - set_constraints: A flag for utilizing the method for resource utilization computation of a - given config not for LP formalization purposes. - - Returns: A list with the vector's values, to be used to define max constraint - in the linear programming problem formalization. - - """ - if set_constraints: - return [ru for ru in ru_vector] - return [0] if len(ru_vector) == 0 else [max(ru_vector)] - - - -def total_ru(ru_tensor: np.ndarray, set_constraints: bool = True) -> List[float]: - """ - Aggregates resource utilization vector to allow weights and activation total utilization constraint in the linear programming - problem formalization. In order to do so, we need to define a separate constraint on each activation memory utilization value in - the resource utilization vector, combined with the sum weights memory utilization. - Note that the given ru_tensor should contain weights and activation utilization values in each entry. - - Args: - ru_tensor: A tensor with nodes' resource utilization values for weights and activation. - set_constraints: A flag for utilizing the method for resource utilization computation of a - given config not for LP formalization purposes. - - Returns: A list with lpSum objects, to be used to define total constraint - in the linear programming problem formalization. 
- - """ - if set_constraints: - weights_ru = lpSum([ru[0] for ru in ru_tensor]) - return [weights_ru + activation_ru for _, activation_ru in ru_tensor] - else: - weights_ru = sum([ru[0] for ru in ru_tensor]) - activation_ru = max([ru[1] for ru in ru_tensor]) - return [weights_ru + activation_ru] - - -class MpRuAggregation(Enum): - """ - Defines resource utilization aggregation functions that can be used to compute final resource utilization metric. - The enum values can be used to call a function on a set of arguments. - - SUM - applies the sum_ru_values function - - MAX - applies the max_ru_values function - - TOTAL - applies the total_ru function - - """ - SUM = partial(sum_ru_values) - MAX = partial(max_ru_values) - TOTAL = partial(total_ru) - - def __call__(self, *args): - return self.value(*args) diff --git a/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_functions_mapping.py b/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_functions_mapping.py deleted file mode 100644 index 86c4a3f86..000000000 --- a/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_functions_mapping.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2022 Sony Semiconductor Israel, Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -from typing import NamedTuple - -from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import RUTarget -from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_aggregation_methods import MpRuAggregation -from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_methods import MpRuMetric - - -# When adding a RUTarget that we want to consider in our mp search, -# a matching pair of resource_utilization_tools computation function and a resource_utilization_tools -# aggregation function should be added to this dictionary -class RuFunctions(NamedTuple): - metric_fn: MpRuMetric - aggregate_fn: MpRuAggregation - - -ru_functions_mapping = {RUTarget.WEIGHTS: RuFunctions(MpRuMetric.WEIGHTS_SIZE, MpRuAggregation.SUM), - RUTarget.ACTIVATION: RuFunctions(MpRuMetric.ACTIVATION_MAXCUT_SIZE, MpRuAggregation.MAX), - RUTarget.TOTAL: RuFunctions(MpRuMetric.TOTAL_WEIGHTS_ACTIVATION_SIZE, MpRuAggregation.TOTAL), - RUTarget.BOPS: RuFunctions(MpRuMetric.BOPS_COUNT, MpRuAggregation.SUM)} diff --git a/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_methods.py b/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_methods.py index b75bf1232..b73051e8a 100644 --- a/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_methods.py +++ b/model_compression_toolkit/core/common/mixed_precision/resource_utilization_tools/ru_methods.py @@ -12,389 +12,186 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -from enum import Enum -from functools import partial -from typing import List, Optional -from copy import deepcopy +from typing import List, Set, Dict, Optional, Tuple import numpy as np from model_compression_toolkit.core import FrameworkInfo from model_compression_toolkit.core.common import Graph, BaseNode -from model_compression_toolkit.constants import BITS_TO_BYTES, FLOAT_BITWIDTH from model_compression_toolkit.core.common.framework_implementation import FrameworkImplementation -from model_compression_toolkit.core.common.graph.edge import EDGE_SINK_INDEX +from model_compression_toolkit.core.common.graph.memory_graph.cut import Cut from model_compression_toolkit.core.common.graph.virtual_activation_weights_node import VirtualActivationWeightsNode, \ VirtualSplitWeightsNode, VirtualSplitActivationNode -from model_compression_toolkit.core.common.graph.memory_graph.memory_graph import MemoryGraph -from model_compression_toolkit.core.common.graph.memory_graph.compute_graph_max_cut import compute_graph_max_cut, Cut -from model_compression_toolkit.logger import Logger +from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import \ + RUTarget +from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization_calculator import \ + ResourceUtilizationCalculator, BitwidthMode, TargetInclusionCriterion +from model_compression_toolkit.core.common.quantization.node_quantization_config import NodeWeightsQuantizationConfig, \ + NodeActivationQuantizationConfig + + +# TODO take into account Virtual nodes. Are candidates defined with respect to virtual or original nodes? 
+# Can we use the virtual graph only for bops and the original graph for everything else?
+
+class MixPrecisionRUHelper:
+    """ Helper class for resource utilization computations during mixed-precision search. """
+
+    def __init__(self, graph: Graph, fw_info: FrameworkInfo, fw_impl: FrameworkImplementation):
+        self.graph = graph
+        self.fw_info = fw_info
+        self.fw_impl = fw_impl
+        self.ru_calculator = ResourceUtilizationCalculator(graph, fw_impl, fw_info)
+
+    def compute_utilization(self, ru_targets: Set[RUTarget], mp_cfg: Optional[List[int]]) -> Dict[RUTarget, np.ndarray]:
+        """
+        Compute utilization of the requested targets for a specific configuration.
+
+        Args:
+            ru_targets: resource utilization targets to compute.
+            mp_cfg: a list of candidate indices for configurable layers.
+
+        Returns:
+            Dict of the computed utilization per target.
+        """
+
+        ru = {}
+
+        act_qcs, w_qcs = self.get_configurable_qcs(mp_cfg) if mp_cfg else (None, None)
+        w_util = None
+        if RUTarget.WEIGHTS in ru_targets:
+            w_util = self._weights_utilization(w_qcs)
+            ru[RUTarget.WEIGHTS] = np.array(list(w_util.values()))
+
+        # TODO make mp agnostic to activation method
+        if RUTarget.ACTIVATION in ru_targets:
+            act_util = self._activation_maxcut_utilization(act_qcs)
+            ru[RUTarget.ACTIVATION] = np.array(list(act_util.values()))
+
+        # TODO use maxcut
+        if RUTarget.TOTAL in ru_targets:
+            act_tensors_util = self._activation_tensor_utilization(act_qcs)
+            w_util = w_util or self._weights_utilization(w_qcs)
+            total = {n: (w_util.get(n, 0), act_tensors_util.get(n, 0))
+                     for n in self.graph.nodes if n in act_tensors_util or n in w_util}
+            ru[RUTarget.TOTAL] = np.array(list(total.values()))
+
+        if RUTarget.BOPS in ru_targets:
+            ru[RUTarget.BOPS] = self._bops_utilization(mp_cfg)
+
+        return ru
+
+    def get_configurable_qcs(self, mp_cfg) \
+            -> Tuple[Dict[BaseNode, NodeActivationQuantizationConfig], Dict[BaseNode, NodeWeightsQuantizationConfig]]:
+        """
+        Retrieve quantization candidate objects for weights and activations from the configuration list.
+
+        Args:
+            mp_cfg: a list of candidate indices for configurable layers.
+
+        Returns:
+            A mapping from nodes to their weights quantization config, and a mapping from nodes to their
+            activation quantization config.
+        """
+        mp_nodes = self.graph.get_configurable_sorted_nodes(self.fw_info)
+        node_qcs = {n: n.candidates_quantization_cfg[mp_cfg[i]] for i, n in enumerate(mp_nodes)}
+        act_qcs = {n: node_qcs[n].activation_quantization_cfg
+                   for n in self.graph.get_activation_configurable_nodes()}
+        w_qcs = {n: node_qcs[n].weights_quantization_cfg
+                 for n in self.graph.get_weights_configurable_nodes(self.fw_info)}
+        return act_qcs, w_qcs
+
+    def _weights_utilization(self, w_qcs: Optional[Dict[BaseNode, NodeWeightsQuantizationConfig]]) -> Dict[BaseNode, float]:
+        """
+        Compute weights utilization for configurable weights if a configuration is passed,
+        or for non-configurable nodes otherwise.
+
+        Args:
+            w_qcs: weights quantization configuration per node, or None.
+
+        Returns:
+            Weight utilization per node.
+ """ + if w_qcs: + target_criterion = TargetInclusionCriterion.QConfigurable + bitwidth_mode = BitwidthMode.MpCustom + else: + target_criterion = TargetInclusionCriterion.QNonConfigurable + bitwidth_mode = BitwidthMode.SpDefault + + _, nodes_util, _ = self.ru_calculator.compute_weights_utilization(target_criterion=target_criterion, + bitwidth_mode=bitwidth_mode, + w_qcs=w_qcs) + nodes_util = {n: u.bytes for n, u in nodes_util.items()} + return nodes_util + + def _activation_maxcut_utilization(self, act_qcs: Optional[Dict[BaseNode, NodeActivationQuantizationConfig]]) \ + -> Optional[Dict[Cut, float]]: + """ + Compute activation utilization using MaxCut for all quantized nodes if configuration is passed. + + Args: + act_qcs: nodes activation configuration or None. + + Returns: + Activation utilization per cut, or empty dict if no configuration was passed. + """ + if act_qcs: + _, cuts_util, _ = self.ru_calculator.compute_cut_activation_utilization(TargetInclusionCriterion.AnyQuantized, + bitwidth_mode=BitwidthMode.MpCustom, + act_qcs=act_qcs) + cuts_util = {c: u.bytes for c, u in cuts_util.items()} + return cuts_util - -def weights_size_utilization(mp_cfg: List[int], - graph: Graph, - fw_info: FrameworkInfo, - fw_impl: FrameworkImplementation) -> np.ndarray: - """ - Computes a resource utilization vector with the respective weights' memory size for the given weight configurable node, - according to the given mixed-precision configuration. - If an empty configuration is given, then computes resource utilization vector for non-configurable nodes. - - Args: - mp_cfg: A mixed-precision configuration (list of candidates index for each configurable node) - graph: Graph object. - fw_info: FrameworkInfo object about the specific framework (e.g., attributes of different layers' weights to quantize). - fw_impl: FrameworkImplementation object with specific framework methods implementation (not used in this method). - - Returns: A vector of node's weights memory sizes. - Note that the vector is not necessarily of the same length as the given config. - - """ - weights_memory = [] - mp_nodes = graph.get_configurable_sorted_nodes_names(fw_info) - weights_mp_nodes = [n.name for n in graph.get_sorted_weights_configurable_nodes(fw_info)] - - if len(mp_cfg) == 0: - # Computing non-configurable nodes resource utilization - # TODO: when enabling multiple attribute quantization by default (currently, - # only kernel quantization is enabled) we should include other attributes memory in the sum of all - # weights memory (when quantized to their default 8-bit, non-configurable). - # When implementing this, we should just go over all attributes in the node instead of counting only kernels. - for n in graph.nodes: - kernel_attr = fw_info.get_kernel_op_attributes(n.type)[0] - if kernel_attr is None: - continue - non_configurable_node = n.name not in weights_mp_nodes \ - and not n.reuse \ - and n.is_all_weights_candidates_equal(kernel_attr) - - if non_configurable_node: - node_nbits = (n.candidates_quantization_cfg[0].weights_quantization_cfg - .get_attr_config(kernel_attr).weights_n_bits) - node_weights_memory_in_bytes = _compute_node_weights_memory(n, node_nbits, fw_info) - weights_memory.append(node_weights_memory_in_bytes) - else: - # Go over configurable all nodes that should be taken into consideration when computing the weights - # resource utilization. 
- for n in graph.get_sorted_weights_configurable_nodes(fw_info): - # Only nodes with kernel op can be considered configurable - kernel_attr = fw_info.get_kernel_op_attributes(n.type)[0] - node_idx = mp_nodes.index(n.name) - node_qc = n.candidates_quantization_cfg[mp_cfg[node_idx]] - node_nbits = node_qc.weights_quantization_cfg.get_attr_config(kernel_attr).weights_n_bits - - node_weights_memory_in_bytes = _compute_node_weights_memory(n, node_nbits, fw_info) - - weights_memory.append(node_weights_memory_in_bytes) - - return np.array(weights_memory) - - -def calc_graph_cuts(graph: Graph) -> List[Cut]: - """ - Calculate graph activation cuts. - Args: - graph: A graph object to calculate activation cuts on. - - Returns: - A list of activation cuts. - - """ - memory_graph = MemoryGraph(deepcopy(graph)) - _, _, cuts = compute_graph_max_cut(memory_graph) - - if cuts is None: - Logger.critical("Failed to calculate activation memory cuts for graph.") # pragma: no cover - # filter empty cuts and cuts that contain only nodes with activation quantization disabled. - filtered_cuts = [] - for cut in cuts: - cut_has_no_act_quant_nodes = any( - [graph.find_node_by_name(e.node_name)[0].has_activation_quantization_enabled_candidate() - for e in cut.mem_elements.elements]) - if len(cut.mem_elements.elements) > 0 and cut_has_no_act_quant_nodes: - filtered_cuts.append(cut) - return filtered_cuts - - -def activation_maxcut_size_utilization(mp_cfg: List[int], - graph: Graph, - fw_info: FrameworkInfo, - fw_impl: FrameworkImplementation, - cuts: Optional[List[Cut]] = None) -> np.ndarray: - """ - Computes a resource utilization vector with the respective output memory max-cut size for activation - nodes, according to the given mixed-precision configuration. - - Args: - mp_cfg: A mixed-precision configuration (list of candidates index for each configurable node) - graph: Graph object. - fw_info: FrameworkInfo object about the specific framework (e.g., attributes of different layers' weights to quantize) - (not used in this method). - fw_impl: FrameworkImplementation object with specific framework methods implementation(not used in this method). - cuts: a list of graph cuts (optional. if not provided calculated locally). - TODO maxcut: refactor - need to remove the cuts so all metric functions signatures are the same. - - Returns: A vector of node's cut memory sizes. - Note that the vector is not necessarily of the same length as the given config. - - """ - if len(mp_cfg) == 0: # Computing non-configurable nodes resource utilization for max-cut is included in the calculation of the # configurable nodes. - return np.array([]) - - activation_cut_memory = [] - mp_nodes = graph.get_configurable_sorted_nodes_names(fw_info) - # Go over all nodes that should be taken into consideration when computing the weights memory utilization. 
-    nodes_act_nbits = {}
-    for n in graph.get_sorted_activation_configurable_nodes():
-        node_idx = mp_nodes.index(n.name)
-        node_qc = n.candidates_quantization_cfg[mp_cfg[node_idx]]
-        node_nbits = node_qc.activation_quantization_cfg.activation_n_bits
-        nodes_act_nbits[n.name] = node_nbits
-
-    if cuts is None:
-        cuts = calc_graph_cuts(graph)
-
-    for i, cut in enumerate(cuts):
-        mem_elements = [m.node_name for m in cut.mem_elements.elements]
-        mem = 0
-        for op_name in mem_elements:
-            n = graph.find_node_by_name(op_name)[0]
-            if n.is_activation_quantization_enabled():
-                base_nbits = n.candidates_quantization_cfg[0].activation_quantization_cfg.activation_n_bits
-                mem += _compute_node_activation_memory(n, nodes_act_nbits.get(op_name, base_nbits))
-
-        activation_cut_memory.append(mem)
-
-    return np.array(activation_cut_memory)
-
-
-# TODO maxcut: add test for this function and remove no cover
-def activation_output_size_utilization(mp_cfg: List[int],
-                                       graph: Graph,
-                                       fw_info: FrameworkInfo,
-                                       fw_impl: FrameworkImplementation) -> np.ndarray:  # pragma: no cover
-    """
-    Computes a resource utilization vector with the respective output memory size for each activation configurable node,
-    according to the given mixed-precision configuration.
-    If an empty configuration is given, then computes resource utilization vector for non-configurable nodes.
-
-    Args:
-        mp_cfg: A mixed-precision configuration (list of candidates index for each configurable node)
-        graph: Graph object.
-        fw_info: FrameworkInfo object about the specific framework (e.g., attributes of different layers' weights to quantize)
-        (not used in this method).
-        fw_impl: FrameworkImplementation object with specific framework methods implementation(not used in this method).
-
-    Returns: A vector of node's activation memory sizes.
-    Note that the vector is not necessarily of the same length as the given config.
-
-    """
-    activation_memory = []
-    mp_nodes = graph.get_configurable_sorted_nodes_names(fw_info)
-    activation_mp_nodes = [n.name for n in graph.get_sorted_activation_configurable_nodes()]
-
-    if len(mp_cfg) == 0:
-        # Computing non-configurable nodes resource utilization
-        for n in graph.nodes:
-            non_configurable_node = n.name not in activation_mp_nodes \
-                                    and n.has_activation_quantization_enabled_candidate() \
-                                    and n.is_all_activation_candidates_equal()
-
-            if non_configurable_node:
-                node_nbits = n.candidates_quantization_cfg[0].activation_quantization_cfg.activation_n_bits
-                node_activation_memory_in_bytes = _compute_node_activation_memory(n, node_nbits)
-                activation_memory.append(node_activation_memory_in_bytes)
-    else:
-        # Go over all nodes that should be taken into consideration when computing the weights memory utilization.
-        for n in graph.get_sorted_activation_configurable_nodes():
-            node_idx = mp_nodes.index(n.name)
-            node_qc = n.candidates_quantization_cfg[mp_cfg[node_idx]]
-            node_nbits = node_qc.activation_quantization_cfg.activation_n_bits
-
-            node_activation_memory_in_bytes = _compute_node_activation_memory(n, node_nbits)
-
-            activation_memory.append(node_activation_memory_in_bytes)
+            return {}

-    return np.array(activation_memory)
+    def _activation_tensor_utilization(self, act_qcs: Optional[Dict[BaseNode, NodeActivationQuantizationConfig]]):
+        """
+        Compute activation tensor utilization for configurable nodes if a configuration is passed,
+        or for non-configurable nodes otherwise.

+        Args:
+            act_qcs: activation quantization configuration per node, or None.
-def total_weights_activation_utilization(mp_cfg: List[int], - graph: Graph, - fw_info: FrameworkInfo, - fw_impl: FrameworkImplementation) -> np.ndarray: - """ - Computes resource utilization tensor with the respective weights size and output memory size for each activation configurable node, - according to the given mixed-precision configuration. - If an empty configuration is given, then computes resource utilization vector for non-configurable nodes. - - Args: - mp_cfg: A mixed-precision configuration (list of candidates index for each configurable node) - graph: Graph object. - fw_info: FrameworkInfo object about the specific framework (e.g., attributes of different layers' weights to quantize) - (not used in this method). - fw_impl: FrameworkImplementation object with specific framework methods implementation(not used in this method). - - Returns: A 2D tensor of nodes' weights memory sizes and activation output memory size. - Note that the vector is not necessarily of the same length as the given config. + Returns: + Activation utilization per node. + """ + if act_qcs: + target_criterion = TargetInclusionCriterion.QConfigurable + bitwidth_mode = BitwidthMode.MpCustom + else: + target_criterion = TargetInclusionCriterion.QNonConfigurable + bitwidth_mode = BitwidthMode.SpDefault - """ - weights_activation_memory = [] - weights_mp_nodes = [n.name for n in graph.get_sorted_weights_configurable_nodes(fw_info)] - activation_mp_nodes = [n.name for n in graph.get_sorted_activation_configurable_nodes()] - - if len(mp_cfg) == 0: - # Computing non-configurable nodes utilization - for n in graph.nodes: - - non_configurable = False - node_weights_memory_in_bytes, node_activation_memory_in_bytes = 0, 0 - - # Non-configurable Weights - # TODO: currently considering only kernel attributes in weights memory utilization. - # When enabling multi-attribute quantization we need to modify this method to count all attributes. - kernel_attr = fw_info.get_kernel_op_attributes(n.type)[0] - if kernel_attr is not None: - is_non_configurable_weights = n.name not in weights_mp_nodes and \ - n.is_all_weights_candidates_equal(kernel_attr) and \ - not n.reuse - - if is_non_configurable_weights: - node_nbits = (n.candidates_quantization_cfg[0].weights_quantization_cfg - .get_attr_config(kernel_attr).weights_n_bits) - node_weights_memory_in_bytes = _compute_node_weights_memory(n, node_nbits, fw_info) - non_configurable = True - - # Non-configurable Activation - is_non_configurable_activation = n.name not in activation_mp_nodes and \ - n.has_activation_quantization_enabled_candidate() and \ - n.is_all_activation_candidates_equal() - - if is_non_configurable_activation: - node_nbits = n.candidates_quantization_cfg[0].activation_quantization_cfg.activation_n_bits - node_activation_memory_in_bytes = _compute_node_activation_memory(n, node_nbits) - non_configurable = True - - if non_configurable: - weights_activation_memory.append( - np.array([node_weights_memory_in_bytes, node_activation_memory_in_bytes])) - else: - # Go over all nodes that should be taken into consideration when computing the weights or - # activation memory utilization (all configurable nodes). - for node_idx, n in enumerate(graph.get_configurable_sorted_nodes(fw_info)): - # TODO: currently considering only kernel attributes in weights memory utilization. When enabling multi-attribute - # quantization we need to modify this method to count all attributes. 
- - node_qc = n.candidates_quantization_cfg[mp_cfg[node_idx]] - - # Compute node's weights memory (if no weights to quantize then set to 0) - node_weights_memory_in_bytes = 0 - kernel_attr = fw_info.get_kernel_op_attributes(n.type)[0] - if kernel_attr is not None: - if n.is_weights_quantization_enabled(kernel_attr) and not n.is_all_weights_candidates_equal(kernel_attr): - node_weights_nbits = node_qc.weights_quantization_cfg.get_attr_config(kernel_attr).weights_n_bits - node_weights_memory_in_bytes = _compute_node_weights_memory(n, node_weights_nbits, fw_info) - - # Compute node's activation memory (if node's activation are not being quantized then set to 0) - node_activation_nbits = node_qc.activation_quantization_cfg.activation_n_bits - node_activation_memory_in_bytes = 0 - if n.is_activation_quantization_enabled() and not n.is_all_activation_candidates_equal(): - node_activation_memory_in_bytes = _compute_node_activation_memory(n, node_activation_nbits) - - weights_activation_memory.append(np.array([node_weights_memory_in_bytes, node_activation_memory_in_bytes])) - - return np.array(weights_activation_memory) - - -def bops_utilization(mp_cfg: List[int], - graph: Graph, - fw_info: FrameworkInfo, - fw_impl: FrameworkImplementation, - set_constraints: bool = True) -> np.ndarray: - """ - Computes a resource utilization vector with the respective bit-operations (BOPS) count for each configurable node, - according to the given mixed-precision configuration of a virtual graph with composed nodes. + _, nodes_util = self.ru_calculator.compute_activation_tensors_utilization(target_criterion=target_criterion, + bitwidth_mode=bitwidth_mode, + act_qcs=act_qcs) + return {n: u.bytes for n, u in nodes_util.items()} - Args: - mp_cfg: A mixed-precision configuration (list of candidates index for each configurable node) - graph: Graph object. - fw_info: FrameworkInfo object about the specific framework (e.g., attributes of different layers' weights to quantize). - fw_impl: FrameworkImplementation object with specific framework methods implementation. - set_constraints: A flag for utilizing the method for resource utilization computation of a - given config not for LP formalization purposes. + def _bops_utilization(self, mp_cfg: List[int]): + """ + Computes a resource utilization vector with the respective bit-operations (BOPS) count for each configurable node, + according to the given mixed-precision configuration of a virtual graph with composed nodes. - Returns: A vector of node's BOPS count. - Note that the vector is not necessarily of the same length as the given config. + Args: + mp_cfg: A mixed-precision configuration (list of candidates index for each configurable node) - """ + Returns: A vector of node's BOPS count. + Note that the vector is not necessarily of the same length as the given config. - if not set_constraints: - return _bops_utilization(mp_cfg, - graph, - fw_info, - fw_impl) + """ + # TODO keeping old implementation for now - # BOPs utilization method considers non-configurable nodes, therefore, it doesn't need separate implementation - # for non-configurable nodes for setting a constraint (no need for separate implementation for len(mp_cfg) = 0). + # BOPs utilization method considers non-configurable nodes, therefore, it doesn't need separate implementation + # for non-configurable nodes for setting a constraint (no need for separate implementation for len(mp_cfg) = 0). 
- virtual_bops_nodes = [n for n in graph.get_topo_sorted_nodes() if isinstance(n, VirtualActivationWeightsNode)] + virtual_bops_nodes = [n for n in self.graph.get_topo_sorted_nodes() if isinstance(n, VirtualActivationWeightsNode)] - mp_nodes = graph.get_configurable_sorted_nodes_names(fw_info) - bops = [n.get_bops_count(fw_impl, fw_info, candidate_idx=_get_node_cfg_idx(n, mp_cfg, mp_nodes)) for n in virtual_bops_nodes] + mp_nodes = self.graph.get_configurable_sorted_nodes_names(self.fw_info) - return np.array(bops) + bops = [n.get_bops_count(self.fw_impl, self.fw_info, candidate_idx=_get_node_cfg_idx(n, mp_cfg, mp_nodes)) + for n in virtual_bops_nodes] - -def _bops_utilization(mp_cfg: List[int], - graph: Graph, - fw_info: FrameworkInfo, - fw_impl: FrameworkImplementation) -> np.ndarray: - """ - Computes a resource utilization vector with the respective bit-operations (BOPS) count for each configurable node, - according to the given mixed-precision configuration of an original graph. - - Args: - mp_cfg: A mixed-precision configuration (list of candidates index for each configurable node) - graph: Graph object. - fw_info: FrameworkInfo object about the specific framework (e.g., attributes of different layers' weights to quantize). - fw_impl: FrameworkImplementation object with specific framework methods implementation. - - Returns: A vector of node's BOPS count. - - """ - - mp_nodes = graph.get_configurable_sorted_nodes_names(fw_info) - - # Go over all nodes that should be taken into consideration when computing the BOPS utilization. - bops = [] - for n in graph.get_topo_sorted_nodes(): - if n.has_kernel_weight_to_quantize(fw_info) and not n.has_positional_weights: - # If node doesn't have weights then its MAC count is 0, and we shouldn't consider it in the BOPS count. - incoming_edges = graph.incoming_edges(n, sort_by_attr=EDGE_SINK_INDEX) - if len(incoming_edges) != 1: - Logger.critical(f"Unable to compute BOPS metric for node {n.name} due to multiple inputs.") # pragma: no cover - input_activation_node = incoming_edges[0].source_node - if len(graph.out_edges(input_activation_node)) > 1: - # In the case where the activation node has multiple outgoing edges - # we don't consider this edge in the BOPS utilization calculation - continue - - input_activation_node_cfg = input_activation_node.candidates_quantization_cfg[_get_node_cfg_idx(input_activation_node, mp_cfg, mp_nodes)] - - node_mac = fw_impl.get_node_mac_operations(n, fw_info) - - node_qc = n.candidates_quantization_cfg[_get_node_cfg_idx(n, mp_cfg, mp_nodes)] - kenrel_node_qc = node_qc.weights_quantization_cfg.get_attr_config(fw_info.get_kernel_op_attributes(n.type)[0]) - node_weights_nbits = kenrel_node_qc.weights_n_bits if \ - kenrel_node_qc.enable_weights_quantization else FLOAT_BITWIDTH - input_activation_nbits = input_activation_node_cfg.activation_quantization_cfg.activation_n_bits if \ - input_activation_node_cfg.activation_quantization_cfg.enable_activation_quantization else FLOAT_BITWIDTH - - node_bops = node_weights_nbits * input_activation_nbits * node_mac - bops.append(node_bops) - - return np.array(bops) + return np.array(bops) def _get_node_cfg_idx(node: BaseNode, mp_cfg: List[int], sorted_configurable_nodes_names: List[str]) -> int: @@ -458,71 +255,3 @@ def _get_origin_activation_node(n: BaseNode) -> BaseNode: return n.origin_node return n - - -def _compute_node_weights_memory(n: BaseNode, node_nbits: int, fw_info: FrameworkInfo) -> float: - """ - Computes the weights' memory of the given node. 
- - Args: - n: A node to compute its weights' memory. - node_nbits: A bit-width in which the node's weights should be quantized. - fw_info: FrameworkInfo object about the specific framework. - - Returns: The total memory of the node's weights when quantized to the given bit-width. - - """ - - origin_node = _get_origin_weights_node(n) - - node_num_weights_params = 0 - for attr in fw_info.get_kernel_op_attributes(origin_node.type): - if attr is not None: - node_num_weights_params += origin_node.get_weights_by_keys(attr).flatten().shape[0] - - return node_num_weights_params * node_nbits / BITS_TO_BYTES - - -def _compute_node_activation_memory(n: BaseNode, node_nbits: int) -> float: - """ - Computes the activation tensor memory of the given node. - - Args: - n: A node to compute its activation tensor memory. - node_nbits: A bit-width in which the node's weights should be quantized. - - Returns: The total memory of the node's activation tensor when quantized to the given bit-width. - - """ - - origin_node = _get_origin_activation_node(n) - node_output_size = origin_node.get_total_output_params() - - return node_output_size * node_nbits / BITS_TO_BYTES - - -class MpRuMetric(Enum): - """ - Defines resource utilization computation functions that can be used to compute bops_utilization for a given target - for a given mp config. The enum values can be used to call a function on a set of arguments. - - WEIGHTS_SIZE - applies the weights_size_utilization function - - ACTIVATION_MAXCUT_SIZE - applies the activation_maxcut_size_utilization function. - - ACTIVATION_OUTPUT_SIZE - applies the activation_output_size_utilization function - - TOTAL_WEIGHTS_ACTIVATION_SIZE - applies the total_weights_activation_utilization function - - BOPS_COUNT - applies the bops_utilization function - - """ - - WEIGHTS_SIZE = partial(weights_size_utilization) - ACTIVATION_MAXCUT_SIZE = partial(activation_maxcut_size_utilization) - ACTIVATION_OUTPUT_SIZE = partial(activation_output_size_utilization) - TOTAL_WEIGHTS_ACTIVATION_SIZE = partial(total_weights_activation_utilization) - BOPS_COUNT = partial(bops_utilization) - - def __call__(self, *args): - return self.value(*args) diff --git a/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py b/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py index 1576c48ad..a85d65378 100644 --- a/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py +++ b/model_compression_toolkit/core/common/mixed_precision/search_methods/linear_programming.py @@ -14,10 +14,13 @@ # ============================================================================== import numpy as np +import pulp from pulp import * from tqdm import tqdm from typing import Dict, List, Tuple, Callable +from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization_calculator import \ + ru_target_aggregation_fn, AggregationMethod from model_compression_toolkit.logger import Logger from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import ResourceUtilization, RUTarget from model_compression_toolkit.core.common.mixed_precision.mixed_precision_search_manager import MixedPrecisionSearchManager @@ -218,13 +221,11 @@ def _add_set_of_ru_constraints(search_manager: MixedPrecisionSearchManager, np.sum(indicated_ru_matrix[i], axis=0) + # sum of metric values over all configurations in a row search_manager.min_ru[target][i] for i 
in range(indicated_ru_matrix.shape[0])])

-    # search_manager.compute_ru_functions contains a pair of ru_metric and ru_aggregation for each ru target
-    # get aggregated ru, considering both configurable and non-configurable nodes
-    if non_conf_ru_vector is None or len(non_conf_ru_vector) == 0:
-        aggr_ru = search_manager.compute_ru_functions[target].aggregate_fn(ru_sum_vector)
-    else:
-        aggr_ru = search_manager.compute_ru_functions[target].aggregate_fn(np.concatenate([ru_sum_vector, non_conf_ru_vector]))
+    ru_vec = ru_sum_vector
+    if non_conf_ru_vector is not None and non_conf_ru_vector.size:
+        ru_vec = np.concatenate([ru_vec, non_conf_ru_vector])
+    aggr_ru = _aggregate_for_lp(ru_vec, target)

     for v in aggr_ru:
         if isinstance(v, float):
             if v > target_resource_utilization_value:
@@ -235,6 +236,21 @@ def _add_set_of_ru_constraints(search_manager: MixedPrecisionSearchManager,
             lp_problem += v <= target_resource_utilization_value


+def _aggregate_for_lp(ru_vec, target) -> list:
+    """ Aggregate a resource utilization vector into LP constraint expressions, per the target's aggregation method. """
+    if target == RUTarget.TOTAL:
+        w = pulp.lpSum(v[0] for v in ru_vec)
+        return [w + v[1] for v in ru_vec]
+
+    if ru_target_aggregation_fn[target] == AggregationMethod.SUM:
+        return [pulp.lpSum(ru_vec)]
+
+    if ru_target_aggregation_fn[target] == AggregationMethod.MAX:
+        return list(ru_vec)
+
+    raise NotImplementedError(f'Cannot define lp constraints with unsupported aggregation function '
+                              f'{ru_target_aggregation_fn[target]}')  # pragma: no cover
+
+
 def _build_layer_to_metrics_mapping(search_manager: MixedPrecisionSearchManager,
                                     target_resource_utilization: ResourceUtilization,
                                     eps: float = EPS) -> Dict[int, Dict[int, float]]:
diff --git a/model_compression_toolkit/core/common/mixed_precision/sensitivity_evaluation.py b/model_compression_toolkit/core/common/mixed_precision/sensitivity_evaluation.py
index 4020d1350..99a50068c 100644
--- a/model_compression_toolkit/core/common/mixed_precision/sensitivity_evaluation.py
+++ b/model_compression_toolkit/core/common/mixed_precision/sensitivity_evaluation.py
@@ -113,11 +113,9 @@ def __init__(self,
         # in the new built MP model.
         self.baseline_model, self.model_mp, self.conf_node2layers = self._build_models()

-        # Build images batches for inference comparison
-        self.images_batches = self._get_images_batches(quant_config.num_of_images)
-
-        # Casting images tensors to the framework tensor type.
-        self.images_batches = [self.fw_impl.to_tensor(img) for img in self.images_batches]
+        # Build images batches for inference comparison and cast to framework tensor type
+        images_batches = self._get_images_batches(quant_config.num_of_images)
+        self.images_batches = [self.fw_impl.to_tensor(img) for img in images_batches]

         # Initiating baseline_tensors_list since it is not initiated in SensitivityEvaluationManager init.
self.baseline_tensors_list = self._init_baseline_tensors_list() diff --git a/model_compression_toolkit/core/common/mixed_precision/solution_refinement_procedure.py b/model_compression_toolkit/core/common/mixed_precision/solution_refinement_procedure.py index a9a1f9d6e..8b3c35597 100644 --- a/model_compression_toolkit/core/common/mixed_precision/solution_refinement_procedure.py +++ b/model_compression_toolkit/core/common/mixed_precision/solution_refinement_procedure.py @@ -80,8 +80,8 @@ def greedy_solution_refinement_procedure(mp_solution: List[int], updated_ru.append(node_updated_ru) # filter out new configs that don't hold the resource utilization restrictions - node_filtered_ru = [(node_idx, ru) for node_idx, ru in zip(valid_candidates, updated_ru) if - target_resource_utilization.holds_constraints(ru)] + node_filtered_ru = [(node_idx, ru) for node_idx, ru in zip(valid_candidates, updated_ru) + if target_resource_utilization.is_satisfied_by(ru)] if len(node_filtered_ru) > 0: sorted_by_ru = sorted(node_filtered_ru, key=lambda node_ru: (node_ru[1].total_memory, diff --git a/model_compression_toolkit/core/common/quantization/bit_width_config.py b/model_compression_toolkit/core/common/quantization/bit_width_config.py index e057f0c54..887d828e1 100644 --- a/model_compression_toolkit/core/common/quantization/bit_width_config.py +++ b/model_compression_toolkit/core/common/quantization/bit_width_config.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +from dataclasses import dataclass, field from typing import List, Union, Dict from model_compression_toolkit.core.common import Graph @@ -19,6 +20,7 @@ from model_compression_toolkit.logger import Logger +@dataclass class ManualBitWidthSelection: """ Class to encapsulate the manual bit width selection configuration for a specific filter. @@ -27,13 +29,11 @@ class ManualBitWidthSelection: filter (BaseNodeMatcher): The filter used to select nodes for bit width manipulation. bit_width (int): The bit width to be applied to the selected nodes. """ - def __init__(self, - filter: BaseNodeMatcher, - bit_width: int): - self.filter = filter - self.bit_width = bit_width + filter: BaseNodeMatcher + bit_width: int +@dataclass class BitWidthConfig: """ Class to manage manual bit-width configurations. @@ -41,13 +41,7 @@ class BitWidthConfig: Attributes: manual_activation_bit_width_selection_list (List[ManualBitWidthSelection]): A list of ManualBitWidthSelection objects defining manual bit-width configurations. """ - def __init__(self, - manual_activation_bit_width_selection_list: List[ManualBitWidthSelection] = None): - self.manual_activation_bit_width_selection_list = [] if manual_activation_bit_width_selection_list is None else manual_activation_bit_width_selection_list - - def __repr__(self): - # Used for debugging, thus no cover. 
- return str(self.__dict__) # pragma: no cover + manual_activation_bit_width_selection_list: List[ManualBitWidthSelection] = field(default_factory=list) def set_manual_activation_bit_width(self, filters: Union[List[BaseNodeMatcher], BaseNodeMatcher], diff --git a/model_compression_toolkit/core/runner.py b/model_compression_toolkit/core/runner.py index 1948f28c2..83ad5e363 100644 --- a/model_compression_toolkit/core/runner.py +++ b/model_compression_toolkit/core/runner.py @@ -12,44 +12,38 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -from collections import namedtuple import copy - -from typing import Callable, Tuple, Any, List, Dict - import numpy as np +from typing import Callable, Any, List from model_compression_toolkit.core.common import FrameworkInfo +from model_compression_toolkit.core.common.framework_implementation import FrameworkImplementation from model_compression_toolkit.core.common.fusion.graph_fuser import GraphFuser - +from model_compression_toolkit.core.common.graph.base_graph import Graph from model_compression_toolkit.core.common.graph.memory_graph.compute_graph_max_cut import compute_graph_max_cut, \ SchedulerInfo from model_compression_toolkit.core.common.graph.memory_graph.memory_graph import MemoryGraph from model_compression_toolkit.core.common.hessian.hessian_info_service import HessianInfoService +from model_compression_toolkit.core.common.mixed_precision.bit_width_setter import set_bit_widths from model_compression_toolkit.core.common.mixed_precision.mixed_precision_candidates_filter import \ filter_candidates_for_mixed_precision +from model_compression_toolkit.core.common.mixed_precision.mixed_precision_search_facade import search_bit_width +from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import \ + ResourceUtilization +from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization_calculator import \ + ResourceUtilizationCalculator, TargetInclusionCriterion, BitwidthMode from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization_data import \ requires_mixed_precision -from model_compression_toolkit.core.graph_prep_runner import graph_preparation_runner -from model_compression_toolkit.core.quantization_prep_runner import quantization_preparation_runner -from model_compression_toolkit.logger import Logger -from model_compression_toolkit.core.common.framework_implementation import FrameworkImplementation -from model_compression_toolkit.core.common.graph.base_graph import Graph -from model_compression_toolkit.core.common.mixed_precision.bit_width_setter import set_bit_widths -from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import ResourceUtilization, RUTarget -from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_aggregation_methods import MpRuAggregation -from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_functions_mapping import ru_functions_mapping -from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_methods import MpRuMetric -from model_compression_toolkit.core.common.mixed_precision.mixed_precision_search_facade import search_bit_width from model_compression_toolkit.core.common.network_editors.edit_network 
import edit_network_graph
 from model_compression_toolkit.core.common.quantization.core_config import CoreConfig
-from model_compression_toolkit.target_platform_capabilities.target_platform.targetplatform2framework import TargetPlatformCapabilities
-from model_compression_toolkit.core.common.visualization.final_config_visualizer import \
-    WeightsFinalBitwidthConfigVisualizer, \
-    ActivationFinalBitwidthConfigVisualizer
 from model_compression_toolkit.core.common.visualization.tensorboard_writer import TensorboardWriter, \
     finalize_bitwidth_in_tb
+from model_compression_toolkit.core.graph_prep_runner import graph_preparation_runner
+from model_compression_toolkit.core.quantization_prep_runner import quantization_preparation_runner
+from model_compression_toolkit.logger import Logger
+from model_compression_toolkit.target_platform_capabilities.target_platform.targetplatform2framework import \
+    TargetPlatformCapabilities


 def core_runner(in_model: Any,
@@ -88,7 +82,7 @@ def core_runner(in_model: Any,
     """

     # Warn if representative dataset has batch-size == 1
-    batch_data = iter(representative_data_gen()).__next__()
+    batch_data = next(iter(representative_data_gen()))
     if isinstance(batch_data, list):
         batch_data = batch_data[0]
     if batch_data.shape[0] == 1:
@@ -96,7 +90,7 @@ def core_runner(in_model: Any,
             ' consider increasing the batch size')

     # Checking whether to run mixed precision quantization
-    if target_resource_utilization is not None:
+    if target_resource_utilization is not None and target_resource_utilization.is_any_restricted():
         if core_config.mixed_precision_config is None:
             Logger.critical("Provided an initialized target_resource_utilization, that means that mixed precision quantization is "
                             "enabled, but the provided MixedPrecisionQuantizationConfig is None.")
@@ -177,7 +171,6 @@ def core_runner(in_model: Any,

     _set_final_resource_utilization(graph=tg,
                                     final_bit_widths_config=bit_widths_config,
-                                    ru_functions_dict=ru_functions_mapping,
                                     fw_info=fw_info,
                                     fw_impl=fw_impl)

@@ -215,7 +208,6 @@

 def _set_final_resource_utilization(graph: Graph,
                                     final_bit_widths_config: List[int],
-                                    ru_functions_dict: Dict[RUTarget, Tuple[MpRuMetric, MpRuAggregation]],
                                     fw_info: FrameworkInfo,
                                     fw_impl: FrameworkImplementation):
     """
@@ -225,39 +217,15 @@ def _set_final_resource_utilization(graph: Graph,

     Args:
         graph: Graph to compute the resource utilization for.
         final_bit_widths_config: The final bit-width configuration to quantize the model accordingly.
-        ru_functions_dict: A mapping between a RUTarget and a pair of resource utilization method and resource utilization aggregation functions.
         fw_info: A FrameworkInfo object.
         fw_impl: FrameworkImplementation object with specific framework methods implementation.
""" - - final_ru_dict = {} - for ru_target, ru_funcs in ru_functions_dict.items(): - ru_method, ru_aggr = ru_funcs - if ru_target == RUTarget.BOPS: - final_ru_dict[ru_target] = \ - ru_aggr(ru_method(final_bit_widths_config, graph, fw_info, fw_impl, False), False)[0] - else: - non_conf_ru = ru_method([], graph, fw_info, fw_impl) - conf_ru = ru_method(final_bit_widths_config, graph, fw_info, fw_impl) - if len(final_bit_widths_config) > 0 and len(non_conf_ru) > 0: - final_ru_dict[ru_target] = ru_aggr(np.concatenate([conf_ru, non_conf_ru]), False)[0] - elif len(final_bit_widths_config) > 0 and len(non_conf_ru) == 0: - final_ru_dict[ru_target] = ru_aggr(conf_ru, False)[0] - elif len(final_bit_widths_config) == 0 and len(non_conf_ru) > 0: - # final_bit_widths_config == 0 ==> no configurable nodes, - # thus, ru can be computed from non_conf_ru alone - final_ru_dict[ru_target] = ru_aggr(non_conf_ru, False)[0] - else: - # No relevant nodes have been quantized with affect on the given target - since we only consider - # in the model's final size the quantized layers size, this means that the final size for this target - # is zero. - Logger.warning(f"No relevant quantized layers for the ru target {ru_target} were found, the recorded " - f"final ru for this target would be 0.") - final_ru_dict[ru_target] = 0 - - final_ru = ResourceUtilization() - final_ru.set_resource_utilization_by_target(final_ru_dict) + w_qcs = {n: n.final_weights_quantization_cfg for n in graph.nodes} + a_qcs = {n: n.final_activation_quantization_cfg for n in graph.nodes} + ru_calculator = ResourceUtilizationCalculator(graph, fw_impl, fw_info) + final_ru = ru_calculator.compute_resource_utilization(TargetInclusionCriterion.AnyQuantized, BitwidthMode.MpCustom, + act_qcs=a_qcs, w_qcs=w_qcs) print(final_ru) graph.user_info.final_resource_utilization = final_ru graph.user_info.mixed_precision_cfg = final_bit_widths_config diff --git a/tests/keras_tests/feature_networks_tests/feature_networks/mixed_precision/requires_mixed_precision_test.py b/tests/keras_tests/feature_networks_tests/feature_networks/mixed_precision/requires_mixed_precision_test.py index a82ddd149..7a8dd0b08 100644 --- a/tests/keras_tests/feature_networks_tests/feature_networks/mixed_precision/requires_mixed_precision_test.py +++ b/tests/keras_tests/feature_networks_tests/feature_networks/mixed_precision/requires_mixed_precision_test.py @@ -78,7 +78,7 @@ def get_max_resources_for_model(self, model): fw_info=DEFAULT_KERAS_INFO, fw_impl=KerasImplementation(), transformed_graph=None, - mixed_precision_enable=False) + mixed_precision_enabled=False) def get_quantization_config(self): return mct.core.QuantizationConfig(mct.core.QuantizationErrorMethod.MSE, diff --git a/tests/keras_tests/non_parallel_tests/test_lp_search_bitwidth.py b/tests/keras_tests/non_parallel_tests/test_lp_search_bitwidth.py index 85c41581c..1e118d36f 100644 --- a/tests/keras_tests/non_parallel_tests/test_lp_search_bitwidth.py +++ b/tests/keras_tests/non_parallel_tests/test_lp_search_bitwidth.py @@ -24,8 +24,6 @@ MixedPrecisionQuantizationConfig from model_compression_toolkit.core.common.mixed_precision.mixed_precision_search_facade import search_bit_width, \ BitWidthSearchMethod -from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.ru_functions_mapping import \ - RuFunctions from model_compression_toolkit.core.common.mixed_precision.search_methods.linear_programming import \ mp_integer_programming_search from model_compression_toolkit.core.common.model_collector import 
ModelCollector @@ -68,10 +66,6 @@ def __init__(self, layer_to_ru_mapping): RUTarget.TOTAL: [[2], [2], [2]], RUTarget.BOPS: [[1], [1], [1]]} # minimal resource utilization in the tests layer_to_ru_mapping - self.compute_ru_functions = {RUTarget.WEIGHTS: RuFunctions(None, lambda v: [lpSum(v)]), - RUTarget.ACTIVATION: RuFunctions(None, lambda v: [i for i in v]), - RUTarget.TOTAL: RuFunctions(None, lambda v: [lpSum(v[0]) + i for i in v[1]]), - RUTarget.BOPS: RuFunctions(None, lambda v: [lpSum(v)])} self.max_ru_config = [0] self.config_reconstruction_helper = MockReconstructionHelper() self.non_conf_ru_dict = None diff --git a/tests_pytest/core/__init__.py b/tests_pytest/core/__init__.py new file mode 100644 index 000000000..e11a7cc60 --- /dev/null +++ b/tests_pytest/core/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2024 Sony Semiconductor Israel, Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== diff --git a/tests_pytest/core/common/__init__.py b/tests_pytest/core/common/__init__.py new file mode 100644 index 000000000..e11a7cc60 --- /dev/null +++ b/tests_pytest/core/common/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2024 Sony Semiconductor Israel, Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== diff --git a/tests_pytest/core/common/mixed_precision/__init__.py b/tests_pytest/core/common/mixed_precision/__init__.py new file mode 100644 index 000000000..e11a7cc60 --- /dev/null +++ b/tests_pytest/core/common/mixed_precision/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2024 Sony Semiconductor Israel, Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
diff --git a/tests_pytest/core/common/mixed_precision/resource_utilization_tools/__init__.py b/tests_pytest/core/common/mixed_precision/resource_utilization_tools/__init__.py
new file mode 100644
index 000000000..e11a7cc60
--- /dev/null
+++ b/tests_pytest/core/common/mixed_precision/resource_utilization_tools/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2024 Sony Semiconductor Israel, Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
diff --git a/tests_pytest/core/common/mixed_precision/resource_utilization_tools/test_resource_utilization_calculator.py b/tests_pytest/core/common/mixed_precision/resource_utilization_tools/test_resource_utilization_calculator.py
new file mode 100644
index 000000000..a052dd12f
--- /dev/null
+++ b/tests_pytest/core/common/mixed_precision/resource_utilization_tools/test_resource_utilization_calculator.py
@@ -0,0 +1,51 @@
+# Copyright 2024 Sony Semiconductor Israel, Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from unittest.mock import Mock
+
+import numpy as np
+
+from model_compression_toolkit.core.common import Graph, BaseNode
+
+
+class TestUtilization:
+    pass
+
+def generate_node():
+    # Build a node with three random weight tensors. The layer class is mocked,
+    # since these tests do not depend on a concrete framework layer.
+    rng = np.random.default_rng(seed=42)
+    return BaseNode(name='node',
+                    framework_attr={},
+                    input_shape=(2, 3),
+                    output_shape=(4, 5, 6),
+                    weights={'weight1': rng.random((2, 5, 7)),
+                             'weight2': rng.random((11,)),
+                             'weight3': rng.random((10, 3))},
+                    layer_class=Mock(),
+                    reuse=False,
+                    reuse_group=None,
+                    inputs_as_list=False)
+
+def generate_graph():
+    # Minimal single-node graph (no edges are needed for weights utilization).
+    node = generate_node()
+    return Graph('g',
+                 nodes=[node],
+                 input_nodes=[node],
+                 output_nodes=[],
+                 edge_list=[])
+
+class TestRUCalculator:
+    def test_compute_node_weights(self):
+        # The patch is truncated here; as a minimal sanity check, the generated node
+        # carries 2*5*7 + 11 + 10*3 = 111 weight parameters in total.
+        node = generate_node()
+        assert sum(w.size for w in node.weights.values()) == 111
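For orientation, below is a minimal usage sketch of the new ResourceUtilizationCalculator API (illustrative only, not part of the patch). It mirrors the calls the patch adds in compute_resource_utilization_data and _set_final_resource_utilization; `graph`, `fw_impl` and `fw_info` stand for an already-prepared quantization graph and the framework objects, and are assumed to exist in the calling scope.

    from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization import RUTarget
    from model_compression_toolkit.core.common.mixed_precision.resource_utilization_tools.resource_utilization_calculator import \
        ResourceUtilizationCalculator, TargetInclusionCriterion, BitwidthMode

    ru_calculator = ResourceUtilizationCalculator(graph, fw_impl, fw_info)

    # Memory targets over all quantized nodes (the same call the patch adds to
    # compute_resource_utilization_data); BOPS is excluded and computed separately.
    ru = ru_calculator.compute_resource_utilization(TargetInclusionCriterion.AnyQuantized,
                                                    BitwidthMode.Size,
                                                    metrics=set(RUTarget) - {RUTarget.BOPS})

    # BOPS of the float model. Per the removed compute_total_bops, a layer with a kernel
    # contributes weight_bits * activation_bits * MACs, i.e. 32 * 32 * MACs in float mode.
    ru.bops, _ = ru_calculator.compute_bops(TargetInclusionCriterion.AnyQuantized, BitwidthMode.Float)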