Commit b93b511

Author: niushengxiao (committed)

feat: add stop string matching

1 parent fc72ffa · commit b93b511

5 files changed: +91 −17 lines

lightllm/server/api_openai.py

Lines changed: 20 additions & 3 deletions
@@ -428,7 +428,9 @@ async def process_single_prompt(prompt: Union[str, List[int]], prompt_index: int
             prompt, individual_sampling_params, multimodal_params, request=raw_request
         )
 
-        return await _collect_generation_results(generator, request, prompt_str, prompt_index)
+        return await _collect_generation_results(
+            generator, request, prompt_str, prompt_index, individual_sampling_params
+        )
 
     tasks = [asyncio.create_task(process_single_prompt(prompt, i)) for i, prompt in enumerate(prompts)]
 
@@ -487,7 +489,9 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
         return StreamingResponse(stream_results(), media_type="text/event-stream", background=background_tasks)
 
 
-async def _collect_generation_results(generator, request: CompletionRequest, prompt: str, prompt_index: int):
+async def _collect_generation_results(
+    generator, request: CompletionRequest, prompt: str, prompt_index: int, sampling_params: SamplingParams
+):
     final_output = []
     count_output_tokens = 0
     finish_reason = None
@@ -518,9 +522,22 @@ async def _collect_generation_results(generator, request: CompletionRequest, pro
             finish_reason = finish_status.get_finish_reason()
             prompt_tokens = metadata["prompt_tokens"]
 
+    # Strip a matched stop sequence from the end of the output
+    final_text = "".join(final_output)
+    if finish_reason == "stop" and hasattr(sampling_params, "stop_sequences") and sampling_params.stop_sequences:
+        stop_strings = sampling_params.stop_sequences.to_string()
+        valid_stop_strings = [s for s in stop_strings if s]
+        if valid_stop_strings:
+            text_len = len(final_text)
+            for stop_str in valid_stop_strings:
+                stop_len = len(stop_str)
+                if text_len >= stop_len and final_text.endswith(stop_str):
+                    final_text = final_text[:-stop_len]
+                    break
+
     return {
         "index": prompt_index,
-        "text": "".join(final_output),
+        "text": final_text,
         "finish_reason": finish_reason,
         "prompt_tokens": prompt_tokens,
         "completion_tokens": count_output_tokens,

lightllm/server/core/objs/sampling_params.py

Lines changed: 19 additions & 2 deletions
@@ -10,6 +10,7 @@
 
 # Maximum length limits, read from environment variables
 STOP_SEQUENCE_MAX_LENGTH = int(os.getenv("LIGHTLLM_STOP_SEQUENCE_MAX_LENGTH", 256))
+STOP_SEQUENCE_STR_MAX_LENGTH = int(os.getenv("LIGHTLLM_STOP_SEQUENCE_STR_MAX_LENGTH", 256))
 ALLOWED_TOKEN_IDS_MAX_LENGTH = int(os.getenv("LIGHTLLM_ALLOWED_TOKEN_IDS_MAX_LENGTH", 256))
 MAX_STOP_SEQUENCES = int(os.getenv("LIGHTLLM_MAX_STOP_SEQUENCES", 10))
 REGULAR_CONSTRAINT_MAX_LENGTH = int(os.getenv("LIGHTLLM_REGULAR_CONSTRAINT_MAX_LENGTH", 2048))
@@ -22,17 +23,27 @@ class StopSequence(ctypes.Structure):
     _fields_ = [
         ("sequence", ctypes.c_int * STOP_SEQUENCE_MAX_LENGTH),
         ("size", ctypes.c_int),
+        ("sequence_str", ctypes.c_char * STOP_SEQUENCE_STR_MAX_LENGTH),
+        ("sequence_str_len", ctypes.c_int),
     ]
 
-    def initialize(self, sequence: List[int]):
+    def initialize(self, sequence: List[int], sequence_str: str = ""):
         self.size = len(sequence)
         assert self.size <= STOP_SEQUENCE_MAX_LENGTH, "stop token length too long."
         assert all(isinstance(e, int) for e in sequence), "all must be int"
         self.sequence[: self.size] = sequence[:]
 
+        sequence_str_bytes = sequence_str.encode("utf-8")
+        assert len(sequence_str_bytes) < STOP_SEQUENCE_STR_MAX_LENGTH, "stop sequence string too long."
+        self.sequence_str = sequence_str_bytes
+        self.sequence_str_len = len(sequence_str_bytes)
+
     def to_list(self):
        return list(self.sequence[0 : self.size])
 
+    def to_string(self):
+        return bytes(self.sequence_str[0 : self.sequence_str_len]).decode("utf-8")
+
 
 class StopSequenceGroups(ctypes.Structure):
     _pack_ = 4
@@ -45,8 +56,10 @@ def initialize(self, stop_sequences: Union[str, List], tokenizer):
         groups: List[List[int]] = self.stop_sentences_to_token_ids(stop_sequences, tokenizer)
         self.size = len(groups)
         assert self.size <= MAX_STOP_SEQUENCES, "Too many stop sequence groups."
+        if isinstance(stop_sequences, str):
+            stop_sequences = [stop_sequences]
         for group_idx in range(self.size):
-            self.groups[group_idx].initialize(groups[group_idx])
+            self.groups[group_idx].initialize(groups[group_idx], stop_sequences[group_idx])
 
     def stop_sentences_to_token_ids(self, stop_sequences, tokenizer):
         if stop_sequences is None:
@@ -75,6 +88,10 @@ def _stop_str_to_token_ids(self, stop_str: str, tokenizer):
     def to_list(self):
         return [self.groups[i].to_list() for i in range(self.size)]
 
+    def to_string(self):
+        # Match in descending length order: when both "\n\n" and "\n" are present, "\n\n" is matched first
+        return sorted([self.groups[i].to_string() for i in range(self.size)], key=len, reverse=True)
+
 
 class RegularConstraint(ctypes.Structure):
     _pack_ = 4
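
Note: sequence_str is a fixed-size ctypes byte buffer, so the byte length is stored alongside it; to_string() slices by that length before decoding so the NUL padding never leaks into the result. A small sketch of the round trip (StopSequenceDemo is a cut-down stand-in, not the real structure):

import ctypes

STOP_SEQUENCE_STR_MAX_LENGTH = 256

class StopSequenceDemo(ctypes.Structure):
    _fields_ = [
        ("sequence_str", ctypes.c_char * STOP_SEQUENCE_STR_MAX_LENGTH),
        ("sequence_str_len", ctypes.c_int),
    ]

s = StopSequenceDemo()
raw = "。\n".encode("utf-8")  # multi-byte UTF-8, so character count != byte count
s.sequence_str = raw
s.sequence_str_len = len(raw)
# Slicing by the stored byte length strips the trailing NUL padding of the fixed buffer
assert bytes(s.sequence_str[0 : s.sequence_str_len]).decode("utf-8") == "。\n"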

lightllm/server/router/model_infer/infer_batch.py

Lines changed: 25 additions & 2 deletions
@@ -277,6 +277,7 @@ def init_all(self):
         g_infer_context.req_manager.req_sampling_params_manager.init_req_sampling_params(self)
 
         self.stop_sequences = self.sampling_param.shm_param.stop_sequences.to_list()
+        self.stop_sequences_str = self.sampling_param.shm_param.stop_sequences.to_string()
         # management object that is only used in token healing mode
         if self.shm_req.prefix_token_ids.size != 0:
             self.prefix_token_ids = self.shm_req.prefix_token_ids.get_token_ids()
@@ -344,8 +345,8 @@ def update_mtp_accepted_token_num(self, accept_token_num: int):
     def get_last_gen_token(self):
         return self.shm_req.shm_prompt_ids.arr[self.shm_req.input_len + self.cur_output_len - 1]
 
-    def update_finish_status(self, eos_ids):
-        if self._stop_sequences_matched():
+    def update_finish_status(self, eos_ids, tokenizer=None):
+        if self._stop_sequences_matched() or self._stop_sequences_str_matched(tokenizer):
             self.finish_status.set_status(FinishStatus.FINISHED_STOP)
         elif (
             self.cur_output_len > 0
@@ -373,6 +374,28 @@ def _stop_sequences_matched(self):
                 return True
         return False
 
+    def _stop_sequences_str_matched(self, tokenizer):
+        if not self.stop_sequences_str or tokenizer is None:
+            return False
+
+        max_stop_str_len = max(len(stop_str) for stop_str in self.stop_sequences_str) if self.stop_sequences_str else 0
+        if max_stop_str_len == 0:
+            return False
+
+        tail_token_len = min(self.cur_output_len, max_stop_str_len + 10)  # +10 for safety
+        if tail_token_len > 0:
+            tail_token_ids = self.shm_req.shm_prompt_ids.arr[
+                (self.shm_req.input_len + self.cur_output_len - tail_token_len) : (
+                    self.shm_req.input_len + self.cur_output_len
+                )
+            ]
+            tail_str = tokenizer.decode(tail_token_ids, skip_special_tokens=False)
+            for stop_str in self.stop_sequences_str:
+                if stop_str in tail_str:
+                    logger.info(f"Found stop sequence in tail: stop_str='{stop_str}', tail_str='{tail_str}'")
+                    return True
+        return False
+
 
 class InferReqGroup:
     def __init__(
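
Note: _stop_sequences_str_matched decodes a window of trailing tokens instead of comparing token ids because a stop string can straddle token boundaries. A rough illustration of that failure mode, assuming a Hugging Face tokenizer is available locally (the model name and stop string are arbitrary examples, not values used by lightllm):

from transformers import AutoTokenizer  # assumption: transformers is installed

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any tokenizer serves for the illustration
stop_str = "Observation:"
token_ids = tokenizer.encode("...\nObservation:")  # the stop string may be split across several tokens

# No single token corresponds to the stop string, so id-level matching alone can miss it...
print(tokenizer.convert_ids_to_tokens(token_ids))

# ...but decoding the tail window recovers the text, and a substring check finds the match.
tail_str = tokenizer.decode(token_ids, skip_special_tokens=False)
print(stop_str in tail_str)  # True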

lightllm/server/router/model_infer/mode_backend/base_backend.py

Lines changed: 10 additions & 1 deletion
@@ -14,8 +14,10 @@
 from lightllm.common.basemodel.batch_objs import ModelOutput
 from lightllm.utils.dist_utils import init_distributed_env
 from lightllm.utils.envs_utils import get_unique_server_name
+from lightllm.utils.envs_utils import enable_stop_string_match
 from lightllm.server.core.objs import ShmReqManager, StartArgs
 from lightllm.server.router.model_infer.infer_batch import g_infer_context
+from lightllm.server.tokenizer import get_tokenizer
 from lightllm.utils.dist_utils import get_global_rank, get_global_world_size, get_dp_size
 from lightllm.utils.dist_utils import get_dp_world_size, get_global_dp_rank, get_current_rank_in_dp
 from lightllm.utils.dist_utils import get_current_device_id, get_current_rank_in_node, get_node_world_size
@@ -303,7 +305,14 @@ def _post_handle(
                 continue
 
             # update and check the request's finished status
-            req_obj.update_finish_status(self.eos_id)
+            if enable_stop_string_match():
+                if not hasattr(self, "tokenizer"):
+                    self.tokenizer = get_tokenizer(
+                        self.args.model_dir, self.args.tokenizer_mode, trust_remote_code=self.args.trust_remote_code
+                    )
+                req_obj.update_finish_status(self.eos_id, self.tokenizer)
+            else:
+                req_obj.update_finish_status(self.eos_id)
 
             if extra_post_req_handle_func is not None:
                 extra_post_req_handle_func(req_obj, next_token_id, next_token_logprob)
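
Note: building the tokenizer lazily inside _post_handle keeps the cost off backends that never enable the feature, and the new tokenizer argument defaults to None so existing callers are unaffected. A stripped-down sketch of the cached-attribute pattern (BackendSketch and _load_tokenizer are illustrative names, not the backend's real interface):

class BackendSketch:
    def _get_tokenizer(self):
        # Constructed on first use and cached on the instance, so later requests pay nothing extra
        if not hasattr(self, "tokenizer"):
            self.tokenizer = self._load_tokenizer()
        return self.tokenizer

    def _load_tokenizer(self):
        return object()  # stands in for get_tokenizer(model_dir, tokenizer_mode, ...)

backend = BackendSketch()
assert backend._get_tokenizer() is backend._get_tokenizer()  # same cached object both times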

lightllm/utils/envs_utils.py

Lines changed: 17 additions & 9 deletions
@@ -68,6 +68,11 @@ def get_lightllm_gunicorn_keep_alive():
     return int(os.getenv("LIGHTLMM_GUNICORN_KEEP_ALIVE", 10))
 
 
+@lru_cache(maxsize=None)
+def enable_stop_string_match():
+    return os.getenv("ENABLE_STOP_STRING_MATCH", "False").upper() in ["ON", "TRUE", "1"]
+
+
 @lru_cache(maxsize=None)
 def get_lightllm_websocket_max_message_size():
     """
@@ -77,15 +82,18 @@ def get_lightllm_websocket_max_message_size():
     return int(os.getenv("LIGHTLLM_WEBSOCKET_MAX_SIZE", 16 * 1024 * 1024))
 
 
-# get_redundancy_expert_ids and get_redundancy_expert_num are primarily used to obtain the IDs and number of redundant experts during inference.
-# They depend on a configuration file specified by ep_redundancy_expert_config_path, which is a JSON formatted text file.
-# The content format is as follows:
-# {
-#     "redundancy_expert_num": 1, # Number of redundant experts per rank
-#     "0": [0], # Key: layer_index (string), Value: list of original expert IDs that are redundant for this layer
-#     "1": [0],
-#     "default": [0] # Default list of redundant expert IDs if layer-specific entry is not found
-# }
+# get_redundancy_expert_ids and get_redundancy_expert_num are primarily
+# used to obtain the IDs and number of redundant experts during inference.
+# They depend on a configuration file specified by ep_redundancy_expert_config_path,
+# which is a JSON formatted text file.
+# The content format is as follows:
+# {
+#     "redundancy_expert_num": 1, # Number of redundant experts per rank
+#     "0": [0], # Key: layer_index (string),
+#               # Value: list of original expert IDs that are redundant for this layer
+#     "1": [0],
+#     "default": [0] # Default list of redundant expert IDs if layer-specific entry is not found
+# }
 
 
 @lru_cache(maxsize=None)
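
Note: because enable_stop_string_match() is wrapped in lru_cache, the environment variable is read once per process and the result is reused, so ENABLE_STOP_STRING_MATCH has to be set before the server starts. A quick sketch of that caching behaviour (reuses the function body from the diff above purely for demonstration):

import os
from functools import lru_cache

@lru_cache(maxsize=None)
def enable_stop_string_match():
    return os.getenv("ENABLE_STOP_STRING_MATCH", "False").upper() in ["ON", "TRUE", "1"]

os.environ["ENABLE_STOP_STRING_MATCH"] = "true"
print(enable_stop_string_match())  # True

os.environ["ENABLE_STOP_STRING_MATCH"] = "0"
print(enable_stop_string_match())  # still True: the first result is cached for the process lifetime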
