Skip to content

Commit

Permalink
small fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
irenaby committed Oct 6, 2024
1 parent 19cad97 commit 4b84a90
Show file tree
Hide file tree
Showing 8 changed files with 88 additions and 90 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -279,8 +279,7 @@ def calc_image_hash(image):
def fetch_hessian(self,
hessian_scores_request: HessianScoresRequest,
required_size: int,
batch_size: int = 1,
per_sample_hash: bool = False) -> List[List[np.ndarray]]:
batch_size: int = 1) -> List[List[np.ndarray]]:
"""
Fetches the computed approximations of the Hessian-based scores for the given
request and required size.
Expand All @@ -289,14 +288,14 @@ def fetch_hessian(self,
hessian_scores_request: Configuration for which to fetch the approximation.
required_size: Number of approximations required.
batch_size: The Hessian computation batch size.
per_sample_hash: Whether to compute hessian per sample hash.
Returns:
List[List[np.ndarray]]: For each target node, returns a list of computed approximations.
The outer list is per image (thus, its length equals required_size).
The inner list's length depends on the granularity (1 for per-tensor,
OC for per-output-channel when the requested node has OC output-channels, etc.)
"""

if len(hessian_scores_request.target_nodes) == 0:
return []

Expand All @@ -312,32 +311,6 @@ def fetch_hessian(self,
for node in hessian_scores_request.target_nodes
]

if per_sample_hash:
if required_size is not None:
raise ValueError('required_size cannot be specified with per_sample_hash')

def gen_single(orig_gen):
# convert original generator into generator that yields sample by sample
for batch in orig_gen:
for i in range(batch[0].shape[0]):
yield [inp[i] for inp in batch]

def gen_new_batch():
# convert sample by sample generator into the required batch
samples = []
for sample in gen_single(self.representative_dataset_gen()):
samples.append(sample)
if len(samples) == batch_size:
yield [np.stack(d, axis=0) for d in zip(*samples)]
samples = []
if samples:
yield [np.stack(d, axis=0) for d in zip(*samples)]

return self._compute_trackable_per_sample_hessian(hessian_scores_request, gen_new_batch())

if required_size is None:
raise ValueError('required_size must be specified if per_sample_hash is False')

# Ensure the saved info has the required number of approximations
self._populate_saved_info_to_size(hessian_scores_request, required_size, batch_size)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,25 @@ def forward_pass(self):
output = self.concat_tensors(output_tensors)
return output, target_activation_tensors

def _generate_random_vector(self, shape, distribution: HessianEstimationDistribution, device):
def _generate_random_vectors_batch(self, shape, distribution: HessianEstimationDistribution, device) -> torch.Tensor:
"""
Generate a batch of random vectors for Hutchinson estimation
Args:
shape: target shape
distribution: distribution to sample from
device: target device
Returns:
Random tensor
"""
if distribution == HessianEstimationDistribution.GAUSSIAN:
return torch.randn(shape, device=device)

if distribution == HessianEstimationDistribution.RADEMACHER:
return torch.where(torch.randint(0, 2, shape, device=device).to(torch.bool), 1, -1).to(device)
v = torch.randint(high=2, size=shape, device=device)
v[v == 0] = -1
return v

raise ValueError(f'Unknown distribution {distribution}')

Expand All @@ -116,7 +129,7 @@ def compute(self) -> List[np.ndarray]:
elif self.hessian_request.granularity == HessianScoresGranularity.PER_OUTPUT_CHANNEL:
hessian_scores = self._compute_per_channel(output, target_activation_tensors)
else:
raise NotImplementedError(f'{HessianScoresGranularity.PER_ELEMENT} is not supported')
raise NotImplementedError(f'{self.hessian_request.granularity} is not supported')

# Convert results to list of numpy arrays
hessian_results = [torch_tensor_to_numpy(h) for h in hessian_scores]
Expand All @@ -129,7 +142,7 @@ def _compute_per_tensor(self, output, target_activation_tensors):
prev_mean_results = None
for j in tqdm(range(self.num_iterations_for_approximation), "Hessian random iterations"): # Approximation iterations
# Getting a random vector sampled from the distribution specified in the Hessian request
v = self._generate_random_vector(output.shape, self.hessian_request.distribution, output.device)
v = self._generate_random_vectors_batch(output.shape, self.hessian_request.distribution, output.device)
f_v = torch.sum(v * output)
for i, ipt_tensor in enumerate(target_activation_tensors): # Per Interest point activation tensor
# Computing the hessian-approximation scores by getting the gradient of (output * v)
Expand Down Expand Up @@ -170,19 +183,16 @@ def _compute_per_channel(self, output, target_activation_tensors):
for _ in range(len(target_activation_tensors))]

for j in tqdm(range(self.num_iterations_for_approximation), "Hessian random iterations"): # Approximation iterations
# Getting a random vector with normal distribution
v = self._generate_random_vector(output.shape, self.hessian_request.distribution, output.device)
v = self._generate_random_vectors_batch(output.shape, self.hessian_request.distribution, output.device)
f_v = torch.sum(v * output)
for i, ipt_tensor in enumerate(target_activation_tensors): # Per Interest point activation tensor
# Computing the hessian-approximation scores by getting the gradient of (output * v)
hess_v = autograd.grad(outputs=f_v,
inputs=ipt_tensor,
retain_graph=True)[0]

hessian_approx_scores = hess_v ** 2
rank = len(hess_v.shape)
if rank > 2:
hessian_approx_scores = torch.mean(hess_v, dim=tuple(range(2, rank)))
hessian_approx_scores = torch.mean(hessian_approx_scores, dim=tuple(range(2, rank)))

# Update node Hessian approximation mean over random iterations
ipts_hessian_approx_scores[i] = (j * ipts_hessian_approx_scores[i] + hessian_approx_scores) / (j + 1)
Expand Down
21 changes: 10 additions & 11 deletions model_compression_toolkit/gptq/common/gptq_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,7 @@ def compute_hessian_based_weights(self) -> np.ndarray:
return np.asarray([1 / num_nodes for _ in range(num_nodes)])

# Fetch hessian approximations for each target node
# TODO this smells like a bug. In hessian calculation target nodes are topo sorted and results are returned
# in the same order. Maybe topo sort doesn't do anything and it works?
# TODO this smells like a potential bug. In hessian calculation target nodes are topo sorted and results are returned
# TODO also target nodes are replaced for reuse. Does this work correctly?
approximations = self._fetch_hessian_approximations(HessianScoresGranularity.PER_TENSOR)
compare_point_to_hessian_approx_scores = {node: score for node, score in zip(self.compare_points, approximations)}
Expand Down Expand Up @@ -182,25 +181,26 @@ def _compute_sample_layer_attention_scores(self, inputs_batch) -> Dict[str, Dict
"""
Compute sample layer attention scores per image hash per layer.
Args:
inputs_batch: a list containing a batch of inputs.
Returns:
A dictionary {img_hash: {layer: score}} where score is the
A dictionary with a structure {img_hash: {layer: score}}.
"""
request = self._build_hessian_request(HessianScoresGranularity.PER_OUTPUT_CHANNEL)
hessian_batch_size = self.gptq_config.hessian_weights_config.hessian_batch_size

hessian_score_per_image_per_layer = {}
# TODO Is it really needed if we compute on the fly per batch? Also if hessian batch is larger its ignored.
# If hessian batch is smaller than inputs batch, split it to hessian batches.
# If hessian batch is smaller than inputs batch, split it to hessian batches. If hessian batch is larger,
# it's currently ignored (TODO)
for i in range(0, inputs_batch[0].shape[0], hessian_batch_size):
inputs = [t[i: i+hessian_batch_size] for t in inputs_batch]
hessian_score_per_image_per_layer.update(
self.hessian_service.compute_trackable_per_sample_hessian(request, inputs)
)
# hessian_score_per_image_per_layer = self._fetch_hessian_approximations(HessianScoresGranularity.PER_OUTPUT_CHANNEL)
for layers_score in hessian_score_per_image_per_layer.values():
for k, t in layers_score.items():
layers_score[k] = t.max(axis=0) # layer score is (channels,)
for img_hash, v in hessian_score_per_image_per_layer.items():
hessian_score_per_image_per_layer[img_hash] = {k: t.max(axis=0) for k, t in v.items()}
return hessian_score_per_image_per_layer

def _fetch_hessian_approximations(self, granularity: HessianScoresGranularity) -> Dict[BaseNode, List[List[float]]]:
Expand All @@ -215,8 +215,7 @@ def _fetch_hessian_approximations(self, granularity: HessianScoresGranularity) -
node_approximations = self.hessian_service.fetch_hessian(
hessian_scores_request=hessian_scores_request,
required_size=self.gptq_config.hessian_weights_config.hessians_num_samples,
batch_size=self.gptq_config.hessian_weights_config.hessian_batch_size,
per_sample_hash=self.gptq_config.hessian_weights_config.per_sample
batch_size=self.gptq_config.hessian_weights_config.hessian_batch_size
)
return node_approximations

Expand Down
6 changes: 2 additions & 4 deletions model_compression_toolkit/gptq/pytorch/gptq_loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def sample_layer_attention_loss(y_list: List[torch.Tensor],
y_list: First list of tensors.
x_list: Second list of tensors.
fxp_w_list, flp_w_list, act_bn_mean, act_bn_std: unused (needed to comply with the interface).
loss_weights: A list of weights for each layer. Each weight is a vector of shape (batch,)
loss_weights: layer-sample weights tensor of shape (layers, batch)
Returns:
Sample Layer Attention loss (a scalar).
Expand All @@ -88,14 +88,12 @@ def sample_layer_attention_loss(y_list: List[torch.Tensor],
layers_mean_w = []

for i, (y, x, w) in enumerate(zip(y_list, x_list, loss_weights)):
# norm = (y - x).pow(2).sum(1)
norm = (y - x).pow(2).mean(1)
norm = (y - x).pow(2).sum(1)
if len(norm.shape) > 1:
norm = norm.flatten(1).mean(1)
loss += torch.mean(w * norm)
layers_mean_w.append(w.mean())

# loss = loss / len(x_list)
loss = loss / torch.stack(layers_mean_w).max()
return loss

52 changes: 32 additions & 20 deletions model_compression_toolkit/gptq/pytorch/gptq_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,17 +106,16 @@ def _get_total_grad_steps():
trainable_bias,
trainable_threshold)
hessian_cfg = self.gptq_config.hessian_weights_config
self.weights_for_average_loss = None # for fixed layer weights
self.use_sample_layer_attention = hessian_cfg.per_sample
self.hessian_score_per_layer = None # for fixed layer weights
self.hessian_score_per_image_per_layer = None # for sample-layer attention
if hessian_cfg.per_sample:
if self.use_sample_layer_attention:
assert (hessian_cfg.norm_scores is False and hessian_cfg.log_norm is False and
hessian_cfg.scale_log_norm is False), hessian_cfg
# TODO if a representative dataset is fixed (same images in each epoch) we can precalculate.
# However if images differ between epochs, we have to calculate their hessians each time and pre-calculation
# will be a waste. Currently it is calculated on-demand during the training loop.
# Per sample hessian scores are calculated on-demand during the training loop
self.hessian_score_per_image_per_layer = {}
else:
self.weights_for_average_loss = to_torch_tensor(self.compute_hessian_based_weights())
self.hessian_score_per_layer = to_torch_tensor(self.compute_hessian_based_weights())

self.reg_func = get_regularization(self.gptq_config, _get_total_grad_steps)

Expand Down Expand Up @@ -221,14 +220,16 @@ def train(self, representative_data_gen: Callable):
def compute_gradients(self,
y_float: List[torch.Tensor],
input_tensors: List[torch.Tensor],
weights_for_average_loss) -> Tuple[torch.Tensor, List[np.ndarray]]:
distill_loss_weights: torch.Tensor,
round_reg_weights: torch.Tensor) -> Tuple[torch.Tensor, List[np.ndarray]]:
"""
Get outputs from both teacher and student networks. Compute the observed error,
use it to compute the gradients, and apply them to the student weights.
Args:
y_float: A list of reference tensors from the floating point network.
input_tensors: A list of Input tensors to pass through the networks.
weights_for_average_loss: Weights for loss. Either per layer, or per layer per sample.
distill_loss_weights: Weights for the distillation loss.
round_reg_weights: Weight for the rounding regularization loss.
Returns:
Loss and gradients.
"""
Expand All @@ -243,9 +244,8 @@ def compute_gradients(self,
self.flp_weights_list,
self.compare_points_mean,
self.compare_points_std,
weights_for_average_loss)

reg_value = self.reg_func(self.fxp_model, self.gptq_config.regularization_factor)
distill_loss_weights)
reg_value = self.reg_func(self.fxp_model, self.gptq_config.regularization_factor, round_reg_weights)

loss_value += reg_value

Expand Down Expand Up @@ -273,11 +273,11 @@ def micro_training_loop(self,
for _ in epochs_pbar:
with tqdm(data_function(), position=1, leave=False) as data_pbar:
for data in data_pbar:
weights = to_torch_tensor(self._get_samples_weights_for_loss(data))
distill_weights, reg_weights = to_torch_tensor(self._get_loss_weights(data))
input_data = [d * self.input_scale for d in data]
input_tensor = to_torch_tensor(input_data)
y_float = self.float_model(input_tensor) # running float model
loss_value, grads = self.compute_gradients(y_float, input_tensor, weights)
loss_value, grads = self.compute_gradients(y_float, input_tensor, distill_weights, reg_weights)
# Run one step of gradient descent by updating the value of the variables to minimize the loss.
for (optimizer, _) in self.optimizer_with_param:
optimizer.step()
Expand All @@ -289,13 +289,22 @@ def micro_training_loop(self,
self.loss_list.append(loss_value.item())
Logger.debug(f'last loss value: {self.loss_list[-1]}')

# TODO move to common after ctor refactor
def _get_samples_weights_for_loss(self, input_tensors: List[torch.Tensor]):
if self.weights_for_average_loss is not None:
assert self.hessian_score_per_image_per_layer is None
return self.weights_for_average_loss
def _get_loss_weights(self, input_tensors: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Fetches weights for distillation and round regularization parts of loss.
Args:
input_tensors: list containing a batch of inputs.
Returns:
A tuple of two tensors:
- weights for distillation loss
- weights for rounding regularization loss
"""
if self.use_sample_layer_attention is False:
return self.hessian_score_per_layer, torch.ones_like(self.hessian_score_per_layer)

# assert self.hessian_score_per_image_per_layer
if len(input_tensors) > 1:
raise NotImplementedError('Sample-Layer attention is not currently supported for networks with multiple inputs')

Expand All @@ -309,7 +318,10 @@ def _get_samples_weights_for_loss(self, input_tensors: List[torch.Tensor]):
img_scores_per_layer: Dict[BaseNode, np.ndarray] = self.hessian_score_per_image_per_layer[img_hash]
img_scores = np.stack(list(img_scores_per_layer.values()), axis=0)
scores.append(img_scores)
return np.stack(scores, axis=1) # layers X images

layer_sample_weights = np.stack(scores, axis=1) # layers X images
layer_weights = layer_sample_weights.mean(axis=1)
return layer_sample_weights, layer_weights

def update_graph(self) -> Graph:
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,4 @@ def get_regularization(gptq_config: GradientPTQConfig, get_total_grad_steps_fn:
scheduler = LinearAnnealingScheduler(t_start=t_start, t_end=total_gradient_steps, initial_val=20, target_val=2)
return SoftQuantizerRegularization(scheduler)
else:
return lambda m, e_reg: 0
return lambda *args, **kwargs: 0
Loading

0 comments on commit 4b84a90

Please sign in to comment.