small fixes

sony · Oct 6, 2024 · 4b84a90 · 4b84a90
1 parent 19cad97
commit 4b84a90
Show file tree

Hide file tree

Showing 8 changed files with 88 additions and 90 deletions.
diff --git a/model_compression_toolkit/core/common/hessian/hessian_info_service.py b/model_compression_toolkit/core/common/hessian/hessian_info_service.py
@@ -279,8 +279,7 @@ def calc_image_hash(image):
     def fetch_hessian(self,
                       hessian_scores_request: HessianScoresRequest,
                       required_size: int,
-                      batch_size: int = 1,
-                      per_sample_hash: bool = False) -> List[List[np.ndarray]]:
+                      batch_size: int = 1) -> List[List[np.ndarray]]:
         """
         Fetches the computed approximations of the Hessian-based scores for the given
         request and required size.
@@ -289,14 +288,14 @@ def fetch_hessian(self,
             hessian_scores_request: Configuration for which to fetch the approximation.
             required_size: Number of approximations required.
             batch_size: The Hessian computation batch size.
-            per_sample_hash: Whether to compute hessian per sample hash.
 
         Returns:
             List[List[np.ndarray]]: For each target node, returns a list of computed approximations.
             The outer list is per image (thus, has the length as required_size).
             The inner list length dependent on the granularity (1 for per-tensor, 
             OC for per-output-channel when the requested node has OC output-channels, etc.)
         """
+
         if len(hessian_scores_request.target_nodes) == 0:
             return []
 
@@ -312,32 +311,6 @@ def fetch_hessian(self,
             for node in hessian_scores_request.target_nodes
         ]
 
-        if per_sample_hash:
-            if required_size is not None:
-                raise ValueError('required_size cannot be specified with per_sample_hash')
-
-            def gen_single(orig_gen):
-                # convert original generator into generator that yields sample by sample
-                for batch in orig_gen:
-                    for i in range(batch[0].shape[0]):
-                        yield [inp[i] for inp in batch]
-
-            def gen_new_batch():
-                # convert sample by sample generator into the required batch
-                samples = []
-                for sample in gen_single(self.representative_dataset_gen()):
-                    samples.append(sample)
-                    if len(samples) == batch_size:
-                        yield [np.stack(d, axis=0) for d in zip(*samples)]
-                        samples = []
-                if samples:
-                    yield [np.stack(d, axis=0) for d in zip(*samples)]
-
-            return self._compute_trackable_per_sample_hessian(hessian_scores_request, gen_new_batch())
-
-        if required_size is None:
-            raise ValueError('required_size must be specified if per_sample_hash is False')
-
         # Ensure the saved info has the required number of approximations
         self._populate_saved_info_to_size(hessian_scores_request, required_size, batch_size)
 

diff --git a/model_compression_toolkit/core/pytorch/hessian/activation_hessian_scores_calculator_pytorch.py b/model_compression_toolkit/core/pytorch/hessian/activation_hessian_scores_calculator_pytorch.py
@@ -93,12 +93,25 @@ def forward_pass(self):
         output = self.concat_tensors(output_tensors)
         return output, target_activation_tensors
 
-    def _generate_random_vector(self, shape, distribution: HessianEstimationDistribution, device):
+    def _generate_random_vectors_batch(self, shape, distribution: HessianEstimationDistribution, device) -> torch.Tensor:
+        """
+        Generate a batch of random vectors for Hutchinson estimation
+
+        Args:
+            shape: target shape
+            distribution: distribution to sample from
+            device: target device
+
+        Returns:
+            Random tensor
+        """
         if distribution == HessianEstimationDistribution.GAUSSIAN:
             return torch.randn(shape, device=device)
 
         if distribution == HessianEstimationDistribution.RADEMACHER:
-            return torch.where(torch.randint(0, 2, shape, device=device).to(torch.bool), 1, -1).to(device)
+            v = torch.randint(high=2, size=shape, device=device)
+            v[v == 0] = -1
+            return v
 
         raise ValueError(f'Unknown distribution {distribution}')
 
@@ -116,7 +129,7 @@ def compute(self) -> List[np.ndarray]:
         elif self.hessian_request.granularity == HessianScoresGranularity.PER_OUTPUT_CHANNEL:
             hessian_scores = self._compute_per_channel(output, target_activation_tensors)
         else:
-            raise NotImplementedError(f'{HessianScoresGranularity.PER_ELEMENT} is not supported')
+            raise NotImplementedError(f'{self.hessian_request.granularity} is not supported')
 
         # Convert results to list of numpy arrays
         hessian_results = [torch_tensor_to_numpy(h) for h in hessian_scores]
@@ -129,7 +142,7 @@ def _compute_per_tensor(self, output, target_activation_tensors):
         prev_mean_results = None
         for j in tqdm(range(self.num_iterations_for_approximation), "Hessian random iterations"):  # Approximation iterations
             # Getting a random vector with normal distribution
-            v = self._generate_random_vector(output.shape, self.hessian_request.distribution, output.device)
+            v = self._generate_random_vectors_batch(output.shape, self.hessian_request.distribution, output.device)
             f_v = torch.sum(v * output)
             for i, ipt_tensor in enumerate(target_activation_tensors):  # Per Interest point activation tensor
                 # Computing the hessian-approximation scores by getting the gradient of (output * v)
@@ -170,19 +183,16 @@ def _compute_per_channel(self, output, target_activation_tensors):
                                       for _ in range(len(target_activation_tensors))]
 
         for j in tqdm(range(self.num_iterations_for_approximation), "Hessian random iterations"):  # Approximation iterations
-            # Getting a random vector with normal distribution
-            v = self._generate_random_vector(output.shape, self.hessian_request.distribution, output.device)
+            v = self._generate_random_vectors_batch(output.shape, self.hessian_request.distribution, output.device)
             f_v = torch.sum(v * output)
             for i, ipt_tensor in enumerate(target_activation_tensors):  # Per Interest point activation tensor
-                # Computing the hessian-approximation scores by getting the gradient of (output * v)
                 hess_v = autograd.grad(outputs=f_v,
                                        inputs=ipt_tensor,
                                        retain_graph=True)[0]
-
                 hessian_approx_scores = hess_v ** 2
                 rank = len(hess_v.shape)
                 if rank > 2:
-                    hessian_approx_scores = torch.mean(hess_v, dim=tuple(range(2, rank)))
+                    hessian_approx_scores = torch.mean(hessian_approx_scores, dim=tuple(range(2, rank)))
 
                 # Update node Hessian approximation mean over random iterations
                 ipts_hessian_approx_scores[i] = (j * ipts_hessian_approx_scores[i] + hessian_approx_scores) / (j + 1)

diff --git a/model_compression_toolkit/gptq/common/gptq_training.py b/model_compression_toolkit/gptq/common/gptq_training.py
@@ -144,8 +144,7 @@ def compute_hessian_based_weights(self) -> np.ndarray:
             return np.asarray([1 / num_nodes for _ in range(num_nodes)])
 
         # Fetch hessian approximations for each target node
-        # TODO this smells like a bug. In hessian calculation target nodes are topo sorted and results are returned
-        # in the same order. Maybe topo sort doesn't do anything and it works?
+        # TODO this smells like a potential bug. In hessian calculation target nodes are topo sorted and results are returned in the same order.
         # TODO also target nodes are replaced for reuse. Does this work correctly?
         approximations = self._fetch_hessian_approximations(HessianScoresGranularity.PER_TENSOR)
         compare_point_to_hessian_approx_scores = {node: score for node, score in zip(self.compare_points, approximations)}
@@ -182,25 +181,26 @@ def _compute_sample_layer_attention_scores(self, inputs_batch) -> Dict[str, Dict
         """
         Compute sample layer attention scores per image hash per layer.
 
+        Args:
+            inputs_batch: a list containing a batch of inputs.
+
         Returns:
-            A dictionary {img_hash: {layer: score}} where score is the
+            A dictionary with a structure {img_hash: {layer: score}}.
 
         """
         request = self._build_hessian_request(HessianScoresGranularity.PER_OUTPUT_CHANNEL)
         hessian_batch_size = self.gptq_config.hessian_weights_config.hessian_batch_size
 
         hessian_score_per_image_per_layer = {}
-        # TODO Is it really needed if we compute on the fly per batch? Also if hessian batch is larger its ignored.
-        # If hessian batch is smaller than inputs batch, split it to hessian batches.
+        # If hessian batch is smaller than inputs batch, split the inputs into hessian-sized batches. If hessian batch
+        # is larger than inputs batch, the larger size is currently ignored (TODO).
         for i in range(0, inputs_batch[0].shape[0], hessian_batch_size):
             inputs = [t[i: i+hessian_batch_size] for t in inputs_batch]
             hessian_score_per_image_per_layer.update(
                 self.hessian_service.compute_trackable_per_sample_hessian(request, inputs)
             )
-        # hessian_score_per_image_per_layer = self._fetch_hessian_approximations(HessianScoresGranularity.PER_OUTPUT_CHANNEL)
-        for layers_score in hessian_score_per_image_per_layer.values():
-            for k, t in layers_score.items():
-                layers_score[k] = t.max(axis=0)    # layer score is (channels,)
+        for img_hash, v in hessian_score_per_image_per_layer.items():
+            hessian_score_per_image_per_layer[img_hash] = {k: t.max(axis=0) for k, t in v.items()}
         return hessian_score_per_image_per_layer
 
     def _fetch_hessian_approximations(self, granularity: HessianScoresGranularity) -> Dict[BaseNode, List[List[float]]]:
@@ -215,8 +215,7 @@ def _fetch_hessian_approximations(self, granularity: HessianScoresGranularity) -
         node_approximations = self.hessian_service.fetch_hessian(
             hessian_scores_request=hessian_scores_request,
             required_size=self.gptq_config.hessian_weights_config.hessians_num_samples,
-            batch_size=self.gptq_config.hessian_weights_config.hessian_batch_size,
-            per_sample_hash=self.gptq_config.hessian_weights_config.per_sample
+            batch_size=self.gptq_config.hessian_weights_config.hessian_batch_size
         )
         return node_approximations
 

diff --git a/model_compression_toolkit/gptq/pytorch/gptq_loss.py b/model_compression_toolkit/gptq/pytorch/gptq_loss.py
@@ -79,7 +79,7 @@ def sample_layer_attention_loss(y_list: List[torch.Tensor],
         y_list: First list of tensors.
         x_list: Second list of tensors.
         fxp_w_list, flp_w_list, act_bn_mean, act_bn_std: unused (needed to comply with the interface).
-        loss_weights: A list of weights for each layer. Each weight is a vector of shape (batch,)
+        loss_weights: layer-sample weights tensor of shape (layers, batch)
 
     Returns:
         Sample Layer Attention loss (a scalar).
@@ -88,14 +88,12 @@ def sample_layer_attention_loss(y_list: List[torch.Tensor],
     layers_mean_w = []
 
     for i, (y, x, w) in enumerate(zip(y_list, x_list, loss_weights)):
-        # norm = (y - x).pow(2).sum(1)
-        norm = (y - x).pow(2).mean(1)
+        norm = (y - x).pow(2).sum(1)
         if len(norm.shape) > 1:
             norm = norm.flatten(1).mean(1)
         loss += torch.mean(w * norm)
         layers_mean_w.append(w.mean())
 
-    # loss = loss / len(x_list)
     loss = loss / torch.stack(layers_mean_w).max()
     return loss
 
diff --git a/model_compression_toolkit/gptq/pytorch/gptq_training.py b/model_compression_toolkit/gptq/pytorch/gptq_training.py
@@ -106,17 +106,16 @@ def _get_total_grad_steps():
                                                                   trainable_bias,
                                                                   trainable_threshold)
         hessian_cfg = self.gptq_config.hessian_weights_config
-        self.weights_for_average_loss = None    # for fixed layer weights
+        self.use_sample_layer_attention = hessian_cfg.per_sample
+        self.hessian_score_per_layer = None    # for fixed layer weights
         self.hessian_score_per_image_per_layer = None    # for sample-layer attention
-        if hessian_cfg.per_sample:
+        if self.use_sample_layer_attention:
             assert (hessian_cfg.norm_scores is False and hessian_cfg.log_norm is False and
                     hessian_cfg.scale_log_norm is False), hessian_cfg
-            # TODO if a representative dataset is fixed (same images in each epoch) we can precalculate.
-            # However if images differ between epochs, we have to calculate their hessians each time and pre-calculation
-            # will be a waste. Currently it is calculated on-demand during the training loop.
+            # Per sample hessian scores are calculated on-demand during the training loop
             self.hessian_score_per_image_per_layer = {}
         else:
-            self.weights_for_average_loss = to_torch_tensor(self.compute_hessian_based_weights())
+            self.hessian_score_per_layer = to_torch_tensor(self.compute_hessian_based_weights())
 
         self.reg_func = get_regularization(self.gptq_config, _get_total_grad_steps)
 
@@ -221,14 +220,16 @@ def train(self, representative_data_gen: Callable):
     def compute_gradients(self,
                           y_float: List[torch.Tensor],
                           input_tensors: List[torch.Tensor],
-                          weights_for_average_loss) -> Tuple[torch.Tensor, List[np.ndarray]]:
+                          distill_loss_weights: torch.Tensor,
+                          round_reg_weights: torch.Tensor) -> Tuple[torch.Tensor, List[np.ndarray]]:
         """
         Get outputs from both teacher and student networks. Compute the observed error,
         and use it to compute the gradients and applying them to the student weights.
         Args:
             y_float: A list of reference tensor from the floating point network.
             input_tensors: A list of Input tensors to pass through the networks.
-            weights_for_average_loss: Weights for loss. Either per layer, or per layer per sample.
+            distill_loss_weights: Weights for the distillation loss.
+            round_reg_weights: Weights for the rounding regularization loss.
         Returns:
             Loss and gradients.
         """
@@ -243,9 +244,8 @@ def compute_gradients(self,
                                            self.flp_weights_list,
                                            self.compare_points_mean,
                                            self.compare_points_std,
-                                           weights_for_average_loss)
-
-        reg_value = self.reg_func(self.fxp_model, self.gptq_config.regularization_factor)
+                                           distill_loss_weights)
+        reg_value = self.reg_func(self.fxp_model, self.gptq_config.regularization_factor, round_reg_weights)
 
         loss_value += reg_value
 
@@ -273,11 +273,11 @@ def micro_training_loop(self,
             for _ in epochs_pbar:
                 with tqdm(data_function(), position=1, leave=False) as data_pbar:
                     for data in data_pbar:
-                        weights = to_torch_tensor(self._get_samples_weights_for_loss(data))
+                        distill_weights, reg_weights = to_torch_tensor(self._get_loss_weights(data))
                         input_data = [d * self.input_scale for d in data]
                         input_tensor = to_torch_tensor(input_data)
                         y_float = self.float_model(input_tensor)  # running float model
-                        loss_value, grads = self.compute_gradients(y_float, input_tensor, weights)
+                        loss_value, grads = self.compute_gradients(y_float, input_tensor, distill_weights, reg_weights)
                         # Run one step of gradient descent by updating the value of the variables to minimize the loss.
                         for (optimizer, _) in self.optimizer_with_param:
                             optimizer.step()
@@ -289,13 +289,22 @@ def micro_training_loop(self,
                         self.loss_list.append(loss_value.item())
                         Logger.debug(f'last loss value: {self.loss_list[-1]}')
 
-    # TODO move to common after ctor refactor
-    def _get_samples_weights_for_loss(self, input_tensors: List[torch.Tensor]):
-        if self.weights_for_average_loss is not None:
-            assert self.hessian_score_per_image_per_layer is None
-            return self.weights_for_average_loss
+    def _get_loss_weights(self, input_tensors: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Fetches weights for the distillation and rounding regularization parts of the loss.
+
+        Args:
+            input_tensors: list containing a batch of inputs.
+
+        Returns:
+            A tuple of two tensors:
+            - weights for distillation loss
+            - weights for rounding regularization loss
+
+        """
+        if self.use_sample_layer_attention is False:
+            return self.hessian_score_per_layer, torch.ones_like(self.hessian_score_per_layer)
 
-        # assert self.hessian_score_per_image_per_layer
         if len(input_tensors) > 1:
             raise NotImplementedError('Sample-Layer attention is not currently supported for networks with multiple inputs')
 
@@ -309,7 +318,10 @@ def _get_samples_weights_for_loss(self, input_tensors: List[torch.Tensor]):
             img_scores_per_layer: Dict[BaseNode, np.ndarray] = self.hessian_score_per_image_per_layer[img_hash]
             img_scores = np.stack(list(img_scores_per_layer.values()), axis=0)
             scores.append(img_scores)
-        return np.stack(scores, axis=1)    # layers X images
+
+        layer_sample_weights = np.stack(scores, axis=1)    # layers X images
+        layer_weights = layer_sample_weights.mean(axis=1)
+        return layer_sample_weights, layer_weights
 
     def update_graph(self) -> Graph:
         """

diff --git a/model_compression_toolkit/gptq/pytorch/quantizer/regularization_factory.py b/model_compression_toolkit/gptq/pytorch/quantizer/regularization_factory.py
@@ -41,4 +41,4 @@ def get_regularization(gptq_config: GradientPTQConfig, get_total_grad_steps_fn:
         scheduler = LinearAnnealingScheduler(t_start=t_start, t_end=total_gradient_steps, initial_val=20, target_val=2)
         return SoftQuantizerRegularization(scheduler)
     else:
-        return lambda m, e_reg: 0
+        return lambda *args, **kwargs: 0