From 9162adcbbc1daca09e6117458221a9118cb9bef4 Mon Sep 17 00:00:00 2001
From: lujianghu
Date: Tue, 28 Jan 2025 12:45:37 +0800
Subject: [PATCH] add batch and prefill-cache request tests

---
 benchmarks/run_async_requests.py | 26 ++---------
 tests/test_batch_request.py      | 78 ++++++++++++++++++++++++++++++++
 tests/test_prefill_cache.py      | 56 +++++++++++++++++++++++
 3 files changed, 139 insertions(+), 21 deletions(-)
 create mode 100644 tests/test_batch_request.py
 create mode 100644 tests/test_prefill_cache.py

diff --git a/benchmarks/run_async_requests.py b/benchmarks/run_async_requests.py
index 1e1d6d2..6553cc9 100644
--- a/benchmarks/run_async_requests.py
+++ b/benchmarks/run_async_requests.py
@@ -26,28 +26,12 @@ async def requests_func(messages: List[Dict[str, Any]]):
 
 def llm_message():
     messages1 = [{"role": "user", "content": "Hello, how are you?"}]
-    # messages2 = [{"role": "user", "content": "Hello, What's your name?"}]
-    # messages1 = [
-    #     {"role": "system", "content": "You are a helpful AI assistant."},
-    #     {"role": "user", "content": "今天天气怎么样"},
-    # ]
-    messages2 = [
-        {"role": "user", "content": "Hello, how are you?"},
-        {
-            "role": "assistant",
-            "content": "Hello! I'm Qwen, a large language model created by Alibaba Cloud. I'm here to assist you with any questions or tasks you might have. How can I help you today?",
-        },
-        {"role": "user", "content": "今天天气怎么样?"},
+    messages2 = [{"role": "user", "content": "Hello, What's your name?"}]
+    messages3 = [
+        {"role": "system", "content": "You are a helpful AI assistant."},
+        {"role": "user", "content": "今天天气怎么样"},
     ]
-    # messages2 = [
-    #     {"role": "user", "content": "Hello, how are you?"},
-    #     {
-    #         "role": "assistant",
-    #         "content": "Hello! I'm just a virtual assistant, so I don't have feelings, but I'm here and ready to help you with whatever you need. How are you doing? 😊",
😊", - # }, - # {"role": "user", "content": "今天天气怎么样?"}, - # ] - messages_list = [messages1, messages2, messages2] + messages_list = [messages1, messages2, messages3] return messages_list diff --git a/tests/test_batch_request.py b/tests/test_batch_request.py new file mode 100644 index 0000000..672aed3 --- /dev/null +++ b/tests/test_batch_request.py @@ -0,0 +1,78 @@ +import asyncio +import random +import time +from typing import Any, Dict, List + +import aiohttp + + +async def requests_func(messages: List[Dict[str, Any]]): + url = "http://localhost:8022/v1/chat/completions" + data = { + "messages": messages, + "model": "tt", + "stream": False, + "max_tokens": 200, + } + time.sleep(random.random() * 2) + async with aiohttp.ClientSession() as session: + async with session.post(url, json=data, timeout=100) as response: + if response.status == 200: + response_data = await response.json() + print(response_data["choices"][0]["message"]["content"]) + else: + print(f"Error: {response.status}") + + +def llm_message(): + messages1 = [{"role": "user", "content": "Hello, how are you?"}] + messages2 = [{"role": "user", "content": "Hello, What's your name?"}] + messages3 = [ + {"role": "system", "content": "You are a helpful AI assistant."}, + {"role": "user", "content": "今天天气怎么样"}, + ] + messages_list = [messages1, messages2, messages3] + return messages_list + + +def mllm_message(): + messages1 = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "这张图片里面有什么?"}, + {"type": "image_url", "image_url": {"file_path": "asserts/flux_gen_image.png"}}, + ], + } + ] + # 图片太大,内存不够,TTFT 过慢 + messages2 = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is shown in this image?"}, + {"type": "image_url", "image_url": {"file_path": "asserts/image-1.png"}}, + ], + } + ] + messages_list = [messages1, messages1] + return messages_list + + +async def main(messages_list: List[List[Dict[str, Any]]]): + print("异步并发请求结果") + s1 = time.time() + await asyncio.gather(*[requests_func(messages) for messages in messages_list]) + print(f"time cost: {time.time() - s1:.4f} s") + + print("单独请求结果") + s1 = time.time() + for message in messages_list: + await requests_func(message) + print("=" * 20) + print(f"time cost: {time.time() - s1:.4f} s") + + +if __name__ == "__main__": + asyncio.run(main(llm_message())) + # asyncio.run(main(mllm_message())) diff --git a/tests/test_prefill_cache.py b/tests/test_prefill_cache.py new file mode 100644 index 0000000..af2dc82 --- /dev/null +++ b/tests/test_prefill_cache.py @@ -0,0 +1,56 @@ +import asyncio +import random +import time +from typing import Any, Dict, List + +import aiohttp + + +async def requests_func(messages: List[Dict[str, Any]]): + url = "http://localhost:8022/v1/chat/completions" + data = { + "messages": messages, + "model": "tt", + "stream": False, + "max_tokens": 200, + } + time.sleep(random.random() * 2) + async with aiohttp.ClientSession() as session: + async with session.post(url, json=data, timeout=100) as response: + if response.status == 200: + response_data = await response.json() + print(response_data["choices"][0]["message"]["content"]) + else: + print(f"Error: {response.status}") + + +def llm_message(): + messages1 = [{"role": "user", "content": "Hello, how are you?"}] + messages2 = [ + {"role": "user", "content": "Hello, how are you?"}, + { + "role": "assistant", + "content": "Hello! I'm Qwen, a large language model created by Alibaba Cloud. I'm here to assist you with any questions or tasks you might have. 
+        },
+        {"role": "user", "content": "今天天气怎么样?"},
+    ]
+    messages_list = [messages1, messages2, messages2]
+    return messages_list
+
+
+async def main(messages_list: List[List[Dict[str, Any]]]):
+    # print("Concurrent request results")
+    # s1 = time.time()
+    # await asyncio.gather(*[requests_func(messages) for messages in messages_list])
+    # print(f"time cost: {time.time() - s1:.4f} s")
+
+    print("Sequential request results")
+    s1 = time.time()
+    for message in messages_list:
+        await requests_func(message)
+        print("=" * 20)
+    print(f"time cost: {time.time() - s1:.4f} s")  # watch the second identical request's TTFT for the prefill-cache hit
+
+
+if __name__ == "__main__":
+    asyncio.run(main(llm_message()))
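
Note on reading the prefill-cache result: with "stream": False the scripts above can only report whole-request latency, so a cache hit shows up only indirectly as a smaller "time cost" on the repeated request. A more direct probe is to request a streamed response and time the arrival of the first chunk. The sketch below is illustrative and not part of the patch; it assumes the same OpenAI-compatible endpoint at localhost:8022, the model name "tt", and that the server honors "stream": True with chunked responses.

import asyncio
import time
from typing import Any, Dict, List

import aiohttp


async def measure_ttft(messages: List[Dict[str, Any]]) -> float:
    """Time from sending the request to the first streamed chunk (approximates TTFT)."""
    url = "http://localhost:8022/v1/chat/completions"  # same endpoint the tests assume
    data = {"messages": messages, "model": "tt", "stream": True, "max_tokens": 200}
    start = time.time()
    async with aiohttp.ClientSession() as session:
        async with session.post(url, json=data, timeout=100) as response:
            response.raise_for_status()
            # The first body chunk arrives once prefill finishes and decoding starts.
            async for _chunk in response.content.iter_any():
                return time.time() - start
    return float("inf")  # no body received


async def main() -> None:
    prefix = [{"role": "user", "content": "Hello, how are you?"}]
    print(f"first request TTFT:  {await measure_ttft(prefix):.4f} s")
    # If the server caches the prefill KV state, the repeated prompt skips
    # most prefill work, so this second TTFT should drop noticeably.
    print(f"second request TTFT: {await measure_ttft(prefix):.4f} s")


if __name__ == "__main__":
    asyncio.run(main())

If the prefill cache is working, the second TTFT printed by this probe should be clearly lower than the first, a difference that total latency can hide behind decode time.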