Skip to content

Commit f5597f1

Browse files
author
Haithem Turki
committed
more tweaks
1 parent 4f69408 commit f5597f1

File tree

4 files changed: +18 additions, −12 deletions

pipeline/streaming_switch_training.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,11 @@ def generate_chunk_with_cache(
234234
return output, denoised_timestep_from, denoised_timestep_to
235235

236236
def _recache_after_switch(self, output, current_start_frame, new_conditional_dict, local_start_frame=None, switch_recache_frames=None):
237+
for block_idx in range(self.num_transformer_blocks):
238+
cache = self.kv_cache1[block_idx]
239+
# update local end index pointer so that we rebuild the cache from the beginning
240+
cache["local_end_index"].fill_(cache["local_end_index"].item() - self.frame_seq_length * self.slice_last_frames)
241+
237242
# reset cross-attention cache
238243
for blk in self.crossattn_cache:
239244
blk["k"].zero_()
@@ -244,19 +249,19 @@ def _recache_after_switch(self, output, current_start_frame, new_conditional_dic
244249
return
245250

246251
if switch_recache_frames is not None:
247-
frames_to_recache = torch.cat([switch_recache_frames, output], dim=1)[:, -21:, ...]
252+
frames_to_recache = torch.cat([switch_recache_frames, output], dim=1)[:, -self.local_attn_size:, ...]
248253
num_recache_frames = frames_to_recache.shape[1]
249254
if DEBUG and (not dist.is_initialized() or dist.get_rank() == 0):
250255
print(f"[SeqTrain-DMDSwitch] Using external switch_recache_frames (previous_frames): {frames_to_recache.shape}")
251256
else:
252257
# Determine how to fetch frames based on whether local_start_frame is provided
253258
if local_start_frame is not None:
254259
# Chunk mode: output is the current chunk's output; use relative coordinates
255-
num_recache_frames = min(local_start_frame, 21)
260+
num_recache_frames = min(local_start_frame, self.local_attn_size)
256261
frames_to_recache = output[:, -num_recache_frames:]
257262
else:
258263
# Full sequence mode: output is the complete sequence; use absolute coordinates
259-
num_recache_frames = min(current_start_frame, 21)
264+
num_recache_frames = min(current_start_frame, self.local_attn_size)
260265
frames_to_recache = output[:, -num_recache_frames:]
261266

262267
batch_size, num_recache_frames, c, h, w = frames_to_recache.shape

pipeline/streaming_training.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ def __init__(self,
4848
self.global_sink = kwargs.get("global_sink", False)
4949

5050
slice_last_frames: int = int(kwargs.get("slice_last_frames", 21))
51+
self.slice_last_frames = slice_last_frames
5152
self.kv_cache_size = (self.local_attn_size + slice_last_frames) * self.frame_seq_length
5253
if DEBUG:
5354
print(f"[KV policy] local_attn_size={self.local_attn_size} slice_last_frames={slice_last_frames} -> kv_frames={self.kv_cache_size}")

pipeline/switch_causal_inference.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,17 +63,17 @@ def _recache_after_switch(self, output, current_start_frame, new_conditional_dic
6363
local_attn_size=self.local_attn_size
6464
)
6565

66-
context_timestep = torch.ones([batch_size, recompute_frames],
66+
context_timestep = torch.ones([batch_size, num_recache_frames],
6767
device=device, dtype=torch.int64) * self.args.context_noise
6868

6969
with torch.no_grad():
7070
self.generator(
71-
noisy_image_or_video=frames_to_recompute,
71+
noisy_image_or_video=frames_to_recache,
7272
conditional_dict=new_conditional_dict,
7373
timestep=context_timestep,
7474
kv_cache=self.kv_cache1,
7575
crossattn_cache=self.crossattn_cache,
76-
current_start=recompute_start_frame * self.frame_seq_length,
76+
current_start=recache_start_frame * self.frame_seq_length,
7777
block_mask=block_mask,
7878
)
7979

@@ -166,7 +166,7 @@ def inference(
166166
else:
167167
cond_in_use = cond_second if using_second else cond_first
168168

169-
noisy_input = noise[:, current_start_frame - num_input_frames : current_start_frame + current_num_frames - num_input_frames]
169+
noisy_input = noise[:, current_start_frame - (1 if initial_latent is not None else 0) : current_start_frame + current_num_frames - (1 if initial_latent is not None else 0)]
170170

171171
# Spatial denoising loop (same as parent but uses cond_in_use)
172172
for index, current_timestep in enumerate(self.denoising_step_list):

wan/modules/causal_model.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ def qkv_fn(x):
227227

228228
# Compute cache update parameters without modifying kv_cache directly
229229
cache_update_info = None
230-
is_recompute = block_mask is not None
230+
is_recompute = current_end <= kv_cache["global_end_index"].item() and current_start > 0
231231
if self.local_attn_size != -1 and (current_end > kv_cache["global_end_index"].item()) and (
232232
num_new_tokens + kv_cache["local_end_index"].item() > kv_cache_size):
233233
# Calculate the number of new tokens added in this step
@@ -257,8 +257,8 @@ def qkv_fn(x):
257257
temp_v[:, sink_tokens + num_evicted_tokens:sink_tokens + num_evicted_tokens + num_rolled_tokens].clone()
258258

259259
# Insert new key/value into the temporary cache
260-
# Protect sink_tokens only during recomputation; regular forward generation allows writing into the initial sink region
261-
write_start_index = max(local_start_index, sink_tokens) if (is_recompute and kv_cache.get("global_sink", False)) else local_start_index
260+
# Protect sink_tokens only during recaching; regular forward generation allows writing into the initial sink region
261+
write_start_index = max(local_start_index, sink_tokens) if ((block_mask is not None) and kv_cache.get("global_sink", False)) else local_start_index
262262
roped_offset = max(0, write_start_index - local_start_index)
263263
write_len = max(0, local_end_index - write_start_index)
264264
if write_len > 0:
@@ -291,8 +291,8 @@ def qkv_fn(x):
291291
# Construct full k, v for attention computation (without modifying the original cache)
292292
temp_k = kv_cache["k"].clone()
293293
temp_v = kv_cache["v"].clone()
294-
# Protect sink_tokens only during recomputation; regular forward generation allows writing into the initial sink region
295-
write_start_index = max(local_start_index, sink_tokens) if (is_recompute and kv_cache.get("global_sink", False)) else local_start_index
294+
# Protect sink_tokens only during recaching; regular forward generation allows writing into the initial sink region
295+
write_start_index = max(local_start_index, sink_tokens) if ((block_mask is not None) and kv_cache.get("global_sink", False)) else local_start_index
296296
roped_offset = max(0, write_start_index - local_start_index)
297297
write_len = max(0, local_end_index - write_start_index)
298298
if write_len > 0:

Comments (0)