18 changes: 14 additions & 4 deletions lmdeploy/pytorch/paging/scheduler.py
@@ -246,7 +246,11 @@ def _reorder_waiting():
     def _schedule_decoding(self, prealloc_size: int = 0):
         """Schedule decoding."""
 
-        running = self.running
+        def _reorder_running():
Collaborator:

I think we can just sort running in reverse order, so we don't need nested loops.

@Tsundoku958 (Contributor, Author) commented on Nov 17, 2025:
I noticed that in the original logic, when the running sequences are traversed, the eviction priority follows the same order as the block-allocation requests. Consider the following two scenarios:

  1. When the sequences are sorted in descending order of arrival time and some free blocks remain during scheduling, the latest requests are allocated space first. This may cause the earliest-arriving requests to fail to obtain GPU blocks.
  2. When the sequences are sorted in ascending order of arrival time and almost no free blocks remain during scheduling, the earliest-arriving requests are evicted first. This violates the First-Come-First-Served (FCFS) principle.

Therefore, I think a nested loop addresses both situations: the outer loop allocates in arrival order, and the inner loop preempts the most recently arrived running sequences when eviction cannot free enough blocks (see the sketch below).
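
A minimal, self-contained sketch of that nested-loop idea. The `Seq` fields and `schedule_fcfs` here are hypothetical illustrations, not the scheduler's actual `SchedulerSequence`/block-manager API:

```python
from collections import deque
from dataclasses import dataclass


@dataclass
class Seq:
    name: str
    arrive_time: float
    held: int  # blocks this sequence already holds on the GPU (hypothetical field)
    need: int  # additional blocks required for its next step (hypothetical field)


def schedule_fcfs(running, free_blocks):
    """Serve sequences in arrival order; preempt the newest on shortage."""
    queue = deque(sorted(running, key=lambda s: s.arrive_time))
    scheduled, preempted = [], []
    while queue:
        seq = queue.popleft()  # outer loop: earliest arrival first
        # Inner loop: reclaim blocks from the most recently arrived
        # running sequences until this one fits (or nothing is left).
        while free_blocks < seq.need and queue:
            victim = queue.pop()
            free_blocks += victim.held
            preempted.append(victim)
        if free_blocks < seq.need:
            preempted.append(seq)  # still no room: back to waiting
            continue
        free_blocks -= seq.need
        seq.held += seq.need
        scheduled.append(seq)
    return scheduled, preempted


running = [
    Seq('a', arrive_time=1.0, held=4, need=2),
    Seq('b', arrive_time=2.0, held=3, need=2),
    Seq('c', arrive_time=3.0, held=2, need=2),
]
scheduled, preempted = schedule_fcfs(running, free_blocks=5)
print([s.name for s in scheduled])  # ['a', 'b']
print([s.name for s in preempted])  # ['c']  (the newest yields, not the oldest)
```

Sorting ascending plus tail preemption keeps both guarantees at once: the earliest arrivals are served first, and when blocks run out it is the newest arrivals that give way.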

"""Reorder running."""
return sorted(self.running, key=lambda seq: seq.arrive_time)

running = _reorder_running()
assert len(running) != 0

eviction_helper = self.eviction_helper
@@ -270,9 +274,9 @@ def __evict_for_seq(seq: SchedulerSequence, num_required_blocks: int):
             return eviction_helper.evict_for_seq(seq, evictable, prealloc_size)
 
         # 1. running
-        for seq in running:
+        while len(running) > 0:
             # token + n
-
+            seq = running.pop(0)
             num_required_blocks = self.block_manager.num_required_blocks(seq, prealloc_size)
             if len(seq.logical_blocks) + num_required_blocks > self.block_manager.num_gpu_blocks:
                 # Reach max gpu cache size.
@@ -284,7 +288,13 @@ def __evict_for_seq(seq: SchedulerSequence, num_required_blocks: int):
                 seq.set_step(0)
                 continue
 
-            if not __evict_for_seq(seq, num_required_blocks):
+            while not __evict_for_seq(seq, num_required_blocks):
+                if len(running) == 0:
+                    break
+                seq_preempted = running.pop(-1)
+                self._set_message_status(seq_preempted, MessageStatus.WAITING)
+
+            if self.block_manager.get_num_free_gpu_blocks() < num_required_blocks:
                 self._set_message_status(seq, MessageStatus.WAITING)
                 continue
