From 2ddde3b8992af831d8af7c07cbfc1b0cf263a607 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=8E=98=E6=9D=83=20=E9=A9=AC?=
Date: Fri, 21 Feb 2025 16:31:22 +0800
Subject: [PATCH 1/4] fixbug: #1709

---
 metagpt/provider/openai_api.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/metagpt/provider/openai_api.py b/metagpt/provider/openai_api.py
index bd9c02231f..3cfdf7d283 100644
--- a/metagpt/provider/openai_api.py
+++ b/metagpt/provider/openai_api.py
@@ -126,19 +126,16 @@ async def _achat_completion_stream(self, messages: list[dict], timeout=USE_CONFI
         return full_reply_content

     def _cons_kwargs(self, messages: list[dict], timeout=USE_CONFIG_TIMEOUT, **extra_kwargs) -> dict:
+        max_token_key = self._get_max_tokens_key()
         kwargs = {
             "messages": messages,
-            "max_tokens": self._get_max_tokens(messages),
+            max_token_key: self._get_max_tokens(messages),
             # "n": 1,  # Some services do not provide this parameter, such as mistral
             # "stop": None,  # default it's None and gpt4-v can't have this one
             "temperature": self.config.temperature,
             "model": self.model,
             "timeout": self.get_timeout(timeout),
         }
-        if "o1-" in self.model:
-            # compatible to openai o1-series
-            kwargs["temperature"] = 1
-            kwargs.pop("max_tokens")
         if extra_kwargs:
             kwargs.update(extra_kwargs)
         return kwargs
@@ -309,3 +306,10 @@ async def gen_image(
             img_url_or_b64 = item.url if resp_format == "url" else item.b64_json
             imgs.append(decode_image(img_url_or_b64))
         return imgs
+
+    def _get_max_tokens_key(self) -> str:
+        pattern = r"^o\d+(-\w+)*$"
+        if re.match(pattern, self.model):
+            # o1 series, see more https://platform.openai.com/docs/api-reference/chat/create#chat-create-max_tokens
+            return "max_completion_tokens"
+        return "max_tokens"

From 2d33566dde23956f1159327aae06768d39c20490 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=8E=98=E6=9D=83=20=E9=A9=AC?=
Date: Fri, 21 Feb 2025 16:50:54 +0800
Subject: [PATCH 2/4] fixbug: #1709

---
 metagpt/utils/token_counter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metagpt/utils/token_counter.py b/metagpt/utils/token_counter.py
index 1bc6b597dd..2e7f037b46 100644
--- a/metagpt/utils/token_counter.py
+++ b/metagpt/utils/token_counter.py
@@ -483,7 +483,7 @@ def count_output_tokens(string: str, model: str) -> int:
     try:
         encoding = tiktoken.encoding_for_model(model)
     except KeyError:
-        logger.info(f"Warning: model {model} not found in tiktoken. Using cl100k_base encoding.")
+        logger.debug(f"Warning: model {model} not found in tiktoken. Using cl100k_base encoding.")
         encoding = tiktoken.get_encoding("cl100k_base")
     return len(encoding.encode(string))

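A quick, self-contained way to sanity-check the model-name pattern that PATCH 1/4 introduces in `_get_max_tokens_key()` (the model names below are illustrative):

```python
import re

# Same pattern as _get_max_tokens_key() in metagpt/provider/openai_api.py (PATCH 1/4).
O_SERIES = re.compile(r"^o\d+(-\w+)*$")

# Illustrative model names, not an exhaustive list.
for model in ("o1", "o1-mini", "o3-mini", "gpt-4o", "gpt-4o-mini", "yolo1"):
    key = "max_completion_tokens" if O_SERIES.match(model) else "max_tokens"
    print(f"{model:12} -> {key}")
```

Only bare o-series names select max_completion_tokens; the gpt-4o variants keep max_tokens, and plain "o1" or "o3-mini" are now covered, which the old substring check ("o1-" in self.model) missed.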
From 0b8e9b688a441204bfa3531f4efe593abdc5b51a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=8E=98=E6=9D=83=20=E9=A9=AC?=
Date: Tue, 4 Mar 2025 18:28:59 +0800
Subject: [PATCH 3/4] fixbug: Fixed the issue of Llama 3.2 returning multi-line JSON.

---
 metagpt/configs/llm_config.py  |  1 -
 metagpt/llm.py                 | 10 ++++++
 metagpt/provider/base_llm.py   |  6 ++--
 metagpt/provider/ollama_api.py | 61 ++++++++++++++++------------------
 4 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/metagpt/configs/llm_config.py b/metagpt/configs/llm_config.py
index ef034ca494..7ca25358e8 100644
--- a/metagpt/configs/llm_config.py
+++ b/metagpt/configs/llm_config.py
@@ -80,7 +80,6 @@ class LLMConfig(YamlModel):
     frequency_penalty: float = 0.0
     best_of: Optional[int] = None
     n: Optional[int] = None
-    stream: bool = True
     seed: Optional[int] = None
     # https://cookbook.openai.com/examples/using_logprobs
     logprobs: Optional[bool] = None
diff --git a/metagpt/llm.py b/metagpt/llm.py
index 465e419a16..1e8f3d06a4 100644
--- a/metagpt/llm.py
+++ b/metagpt/llm.py
@@ -18,3 +18,13 @@ def LLM(llm_config: Optional[LLMConfig] = None, context: Context = None) -> Base
     if llm_config is not None:
         return ctx.llm_with_cost_manager_from_llm_config(llm_config)
     return ctx.llm()
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    llm = LLM()
+    rsp = asyncio.run(llm.aask("hello world", stream=False))
+    print(f"{rsp}")
+    rsp = asyncio.run(llm.aask("hello world", stream=True))
+    print(f"{rsp}")
diff --git a/metagpt/provider/base_llm.py b/metagpt/provider/base_llm.py
index a95e8dbd37..35e6c5ddb2 100644
--- a/metagpt/provider/base_llm.py
+++ b/metagpt/provider/base_llm.py
@@ -131,8 +131,8 @@ async def aask(
         system_msgs: Optional[list[str]] = None,
         format_msgs: Optional[list[dict[str, str]]] = None,
         images: Optional[Union[str, list[str]]] = None,
-        timeout=USE_CONFIG_TIMEOUT,
-        stream=None,
+        timeout: int = USE_CONFIG_TIMEOUT,
+        stream: bool = True,
     ) -> str:
         if system_msgs:
             message = self._system_msgs(system_msgs)
@@ -146,8 +146,6 @@ async def aask(
             message.append(self._user_msg(msg, images=images))
         else:
             message.extend(msg)
-        if stream is None:
-            stream = self.config.stream
         logger.debug(message)
         rsp = await self.acompletion_text(message, stream=stream, timeout=self.get_timeout(timeout))
         return rsp
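With `LLMConfig.stream` removed and `aask` defaulting to `stream=True`, streaming is now chosen per call rather than per config. A minimal sketch of the new call pattern (it mirrors the `__main__` demo added to metagpt/llm.py above and assumes a working LLM configuration):

```python
import asyncio

from metagpt.llm import LLM


async def main() -> None:
    llm = LLM()
    # stream is now a per-call argument defaulting to True; callers that relied on
    # a `stream: false` entry in LLMConfig must pass it explicitly instead.
    print(await llm.aask("hello world", stream=False))  # single, non-streamed reply
    print(await llm.aask("hello world"))  # streamed to the log, full text returned


if __name__ == "__main__":
    asyncio.run(main())
```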
diff --git a/metagpt/provider/ollama_api.py b/metagpt/provider/ollama_api.py
index 3f7d20d0ac..b2b642de0c 100644
--- a/metagpt/provider/ollama_api.py
+++ b/metagpt/provider/ollama_api.py
@@ -1,10 +1,11 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # @Desc : self-host open llm model with ollama which isn't openai-api-compatible
+# @Modified by : mashenquan. Tested with llama 3.2, https://www.ollama.com/library/llama3.2

 import json
 from enum import Enum, auto
-from typing import AsyncGenerator, Optional, Tuple
+from typing import AsyncGenerator, List, Optional, Tuple

 from metagpt.configs.llm_config import LLMConfig, LLMType
 from metagpt.const import USE_CONFIG_TIMEOUT
@@ -38,7 +39,21 @@ def apply(self, messages: list[dict]) -> dict:
         raise NotImplementedError

     def decode(self, response: OpenAIResponse) -> dict:
-        return json.loads(response.data.decode("utf-8"))
+        data = response.data.decode("utf-8")
+        rsp = {}
+        content = ""
+        for val in data.splitlines():
+            if not val:
+                continue
+            m = json.loads(val)
+            if "embedding" in m:
+                return m
+            content += m.get("message", {}).get("content", "")
+            rsp.update(m)
+        if "message" not in rsp:
+            rsp["message"] = {}
+        rsp["message"]["content"] = content
+        return rsp

     def get_choice(self, to_choice_dict: dict) -> str:
         raise NotImplementedError
@@ -204,16 +219,14 @@ def __init__(self, config: LLMConfig):
     def _llama_api_inuse(self) -> OllamaMessageAPI:
         return OllamaMessageAPI.CHAT

-    @property
-    def _llama_api_kwargs(self) -> dict:
-        return {"options": {"temperature": 0.3}, "stream": self.config.stream}
-
     def __init_ollama(self, config: LLMConfig):
         assert config.base_url, "ollama base url is required!"
         self.model = config.model
         self.pricing_plan = self.model
         ollama_message = OllamaMessageMeta.get_message(self._llama_api_inuse)
-        self.ollama_message = ollama_message(model=self.model, **self._llama_api_kwargs)
+        options = {"temperature": config.temperature}
+        self.ollama_message = ollama_message(model=self.model, options=options)
+        self.ollama_stream = ollama_message(model=self.model, options=options, stream=True)

     def get_usage(self, resp: dict) -> dict:
         return {"prompt_tokens": resp.get("prompt_eval_count", 0), "completion_tokens": resp.get("eval_count", 0)}
@@ -225,12 +238,7 @@ async def _achat_completion(self, messages: list[dict], timeout: int = USE_CONFI
             params=self.ollama_message.apply(messages=messages),
             request_timeout=self.get_timeout(timeout),
         )
-        if isinstance(resp, AsyncGenerator):
-            return await self._processing_openai_response_async_generator(resp)
-        elif isinstance(resp, OpenAIResponse):
-            return self._processing_openai_response(resp)
-        else:
-            raise ValueError
+        return self._processing_openai_response(resp)

     def get_choice_text(self, rsp):
         return self.ollama_message.get_choice(rsp)
@@ -241,17 +249,12 @@ async def acompletion(self, messages: list[dict], timeout=USE_CONFIG_TIMEOUT) ->
     async def _achat_completion_stream(self, messages: list[dict], timeout: int = USE_CONFIG_TIMEOUT) -> str:
         resp, _, _ = await self.client.arequest(
             method=self.http_method,
-            url=self.ollama_message.api_suffix,
-            params=self.ollama_message.apply(messages=messages),
+            url=self.ollama_stream.api_suffix,
+            params=self.ollama_stream.apply(messages=messages),
             request_timeout=self.get_timeout(timeout),
             stream=True,
         )
-        if isinstance(resp, AsyncGenerator):
-            return await self._processing_openai_response_async_generator(resp)
-        elif isinstance(resp, OpenAIResponse):
-            return self._processing_openai_response(resp)
-        else:
-            raise ValueError
+        return await self._processing_openai_response_async_generator(resp)

     def _processing_openai_response(self, openai_resp: OpenAIResponse):
         resp = self.ollama_message.decode(openai_resp)
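The decode() rewrite earlier in this file is the core of the fix: when Ollama streams a chat response for llama 3.2 it emits NDJSON, one JSON object per line, and the old single json.loads() call failed on such bodies. A standalone sketch of the same merging logic, run against a made-up two-chunk payload (illustrative, not captured output):

```python
import json

# Two NDJSON chunks of the kind Ollama's /api/chat emits for llama 3.2 (made-up content).
raw = (
    b'{"model": "llama3.2", "message": {"role": "assistant", "content": "Hello"}, "done": false}\n'
    b'{"model": "llama3.2", "message": {"role": "assistant", "content": " world"}, "done": true}\n'
)


def merge_ndjson(data: bytes) -> dict:
    """Standalone mirror of the new OllamaMessage.decode(): merge the per-line objects."""
    rsp, content = {}, ""
    for line in data.decode("utf-8").splitlines():
        if not line:
            continue
        chunk = json.loads(line)
        if "embedding" in chunk:  # embeddings responses pass through unchanged
            return chunk
        content += chunk.get("message", {}).get("content", "")
        rsp.update(chunk)
    rsp.setdefault("message", {})["content"] = content
    return rsp


print(merge_ndjson(raw)["message"]["content"])  # -> "Hello world"
```

The remaining hunks below route the streaming path through the dedicated `ollama_stream` object and drop the per-class `_llama_api_kwargs` overrides.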
@@ -263,10 +266,10 @@ async def _processing_openai_response_async_generator(self, ag_openai_resp: Asyn
         collected_content = []
         usage = {}
         async for raw_chunk in ag_openai_resp:
-            chunk = self.ollama_message.decode(raw_chunk)
+            chunk = self.ollama_stream.decode(raw_chunk)

             if not chunk.get("done", False):
-                content = self.ollama_message.get_choice(chunk)
+                content = self.ollama_stream.get_choice(chunk)
                 collected_content.append(content)
                 log_llm_stream(content)
             else:
@@ -285,10 +288,6 @@ class OllamaGenerate(OllamaLLM):
     def _llama_api_inuse(self) -> OllamaMessageAPI:
         return OllamaMessageAPI.GENERATE

-    @property
-    def _llama_api_kwargs(self) -> dict:
-        return {"options": {"temperature": 0.3}, "stream": self.config.stream}
-

 @register_provider(LLMType.OLLAMA_EMBEDDINGS)
 class OllamaEmbeddings(OllamaLLM):
@@ -296,15 +295,11 @@ class OllamaEmbeddings(OllamaLLM):
     def _llama_api_inuse(self) -> OllamaMessageAPI:
         return OllamaMessageAPI.EMBEDDINGS

-    @property
-    def _llama_api_kwargs(self) -> dict:
-        return {"options": {"temperature": 0.3}}
-
     @property
     def _llama_embedding_key(self) -> str:
         return "embedding"

-    async def _achat_completion(self, messages: list[dict], timeout: int = USE_CONFIG_TIMEOUT) -> dict:
+    async def _achat_completion(self, messages: list[dict], timeout: int = USE_CONFIG_TIMEOUT) -> List[float]:
         resp, _, _ = await self.client.arequest(
             method=self.http_method,
             url=self.ollama_message.api_suffix,
@@ -313,7 +308,7 @@ async def _achat_completion(self, messages: list[dict], timeout: int = USE_CONFI
         )
         return self.ollama_message.decode(resp)[self._llama_embedding_key]

-    async def _achat_completion_stream(self, messages: list[dict], timeout: int = USE_CONFIG_TIMEOUT) -> str:
+    async def _achat_completion_stream(self, messages: list[dict], timeout: int = USE_CONFIG_TIMEOUT) -> List[float]:
         return await self._achat_completion(messages, timeout=self.get_timeout(timeout))

     def get_choice_text(self, rsp):

From 229b90b2a00a90882404fed21795cdb73e57fd61 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=8E=98=E6=9D=83=20=E9=A9=AC?=
Date: Tue, 4 Mar 2025 18:30:39 +0800
Subject: [PATCH 4/4] fixbug: Fixed the issue of Llama 3.2 returning multi-line JSON.

---
 metagpt/provider/ollama_api.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/metagpt/provider/ollama_api.py b/metagpt/provider/ollama_api.py
index b2b642de0c..9944689445 100644
--- a/metagpt/provider/ollama_api.py
+++ b/metagpt/provider/ollama_api.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # @Desc : self-host open llm model with ollama which isn't openai-api-compatible
-# @Modified by : mashenquan. Tested with llama 3.2, https://www.ollama.com/library/llama3.2
+# @Modified by : mashenquan. Tested with llama 3.2, https://www.ollama.com/library/llama3.2;
+# nomic-embed-text, https://www.ollama.com/library/nomic-embed-text

 import json
 from enum import Enum, auto
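For the embeddings provider touched by this series, decode() returns any object containing an `embedding` key untouched, and `OllamaEmbeddings._achat_completion` now advertises a `List[float]` return. A small sketch of that path; the sample body is shaped like an Ollama /api/embeddings response, with made-up values:

```python
import json

# Shaped like an Ollama /api/embeddings response body; the values are made up.
sample = b'{"embedding": [0.12, -0.03, 0.88]}'

decoded = json.loads(sample.decode("utf-8"))
# OllamaMessage.decode() returns this dict as-is because it contains "embedding";
# OllamaEmbeddings._achat_completion() then returns decoded["embedding"].
vector = decoded["embedding"]
assert isinstance(vector, list) and all(isinstance(x, float) for x in vector)
print(f"{len(vector)}-dimensional embedding")
```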