From a0bacbba6261725f5f8821d3175aa60a0cbce67d Mon Sep 17 00:00:00 2001 From: 18792752963 Date: Thu, 15 Aug 2024 19:59:03 +0800 Subject: [PATCH 1/5] =?UTF-8?q?vllm=20=E6=8E=A5=E5=8F=A3=E6=94=AF=E6=8C=81?= =?UTF-8?q?vision(minicpm-v)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .DS_Store | Bin 0 -> 6148 bytes .env | 27 ++++++++++++ README_OPENBAYES.md | 90 ++++++++++++++++++++++++++++++++++++++++ api/.DS_Store | Bin 0 -> 6148 bytes api/vllm_routes/chat.py | 48 +++++++++++++++++++++ server.py | 49 ++++++++++++++++++++++ 6 files changed, 214 insertions(+) create mode 100644 .DS_Store create mode 100644 .env create mode 100644 README_OPENBAYES.md create mode 100644 api/.DS_Store create mode 100644 server.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..f9538889e8f39881ab88794e1723a5de06f13390 GIT binary patch literal 6148 zcmeH~JqiLr422WdLa^D=avBfd4F=H@cme+!K~NC;Il3=DjjOdR@&d^>$!yr&SL|#= zMAy&rN~9N&8QdsK3jG6tE4fXQR)F_1*w1Z(3Qz$mKn17(6&R5M zd63WMBYGx23KgIN<50l94+US|ZT=s$Fr@-i;LjA$ z>1MZC=0.4.3 +pip install -r requirements.txt +``` + +## 启动模型 + +### 环境变量含义 + + ++ `MODEL_NAME`: 模型名称,如 `chatglm4`、`qwen2`、`llama3`等 + + ++ `PROMPT_NAME`: 使用的对话模板名称,如果不指定,则将根据 `tokenizer` 找到对应的模板 + + ++ `MODEL_PATH`: 开源大模型的文件所在路径 + + ++ `TRUST_REMOTE_CODE`: 是否使用外部代码 + + ++ `TOKENIZE_MODE`(可选项): `tokenizer` 的模式,默认为 `auto` + + ++ `TENSOR_PARALLEL_SIZE`(可选项): `GPU` 数量,默认为 `1` + + ++ `EMBEDDING_NAME`(可选项): 嵌入模型的文件所在路径,推荐使用 `moka-ai/m3e-base` 或者 `BAAI/bge-large-zh` + + ++ `GPU_MEMORY_UTILIZATION`(可选项): `GPU` 占用率 + + ++ `MAX_NUM_BATCHED_TOKENS`(可选项): 每个批处理的最大 `token` 数量 + + ++ `MAX_NUM_SEQS`(可选项): 批量大小 + + ++ `TASKS`(可选项): `llm` 表示启动对话大模型,`rag` 表示启动文档文档相关接口,比如`embedding`、`rerank` + + +### 启动方式 + +#### 本地启动 + +根据需求修改 `.env` 文件中的环境变量 + +```shell +python server.py +``` +#### 调用样例 +```shell +curl -X POST "http://127.0.0.1:8080/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-H "Authorization: Bearer YOUR_API_KEY" \ +-d '{ + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "这张图像有什么东西?"}, + { + "type": "image_url", + "image_url": { + "url": "https://github.com/ByungKwanLee/TroL/blob/master/figures/demo.png?raw=true" + } + } + ] + } + ], + "model": "minicpm-v" +}' +``` +### 说明 +目前只支持minicpm-v模型 +下面是最大并发量测试结果: +GPU_MEMORY_UTILIZATION=0.9 并发量10 +GPU_MEMORY_UTILIZATION=0.8 并发量14 +GPU_MEMORY_UTILIZATION=0.7 并发量20 +GPU_MEMORY_UTILIZATION=0.6 并发量28 +GPU_MEMORY_UTILIZATION=0.5 并发量30 +GPU_MEMORY_UTILIZATION=0.4 并发量36 diff --git a/api/.DS_Store b/api/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..a8019daeaf69c700617744ac8bbdd6eb34be875d GIT binary patch literal 6148 zcmeHK!AiqG5S?wS-BN@e6vR`&Yr&S5fOrYD{(uoZsKkaA4W`-Br1nq>`2hVPKg92G zW_Kf&+KUH~GE-*X?CeaIc_}+t01&;|uno`v0EtRis^Rd1&^qawl&ptP=w}2l1qCz6 zV6+y^j{nF2y}LTRK#adJe1CswI8P^GtU`=<4<~VyX06tTD3vQayH#10b$RbUskxsG zvRT?2OmAs+rBoai_8`2DM)RPtf3A{j5GA9jPKbsh47t0BlA)US)GQgMI@dP~vL./)\n{question}' + request.messages = minicpmv_messages + token_ids = engine.template.convert_messages_to_ids( messages=request.messages, tools=request.tools, @@ -136,6 +181,9 @@ async def create_chat_completion( { "prompt": None, "prompt_token_ids": token_ids, + "multi_modal_data": { + "image": image + } }, sampling_params, request_id, diff --git a/server.py b/server.py new file mode 100644 index 0000000..a24c3a6 --- /dev/null +++ b/server.py @@ -0,0 +1,49 @@ +from api.config import SETTINGS +from api.models import ( + app, + EMBEDDING_MODEL, + LLM_ENGINE, + RERANK_MODEL, +) + + +prefix = SETTINGS.api_prefix + +if EMBEDDING_MODEL is not None: + from api.routes.embedding import embedding_router + + app.include_router(embedding_router, prefix=prefix, tags=["Embedding"]) + + try: + from api.routes.file import file_router + + app.include_router(file_router, prefix=prefix, tags=["File"]) + except ImportError: + pass + +if RERANK_MODEL is not None: + from api.routes.rerank import rerank_router + + app.include_router(rerank_router, prefix=prefix, tags=["Rerank"]) + + +if LLM_ENGINE is not None: + from api.routes import model_router + + app.include_router(model_router, prefix=prefix, tags=["Model"]) + + if SETTINGS.engine == "vllm": + from api.vllm_routes import chat_router as chat_router + from api.vllm_routes import completion_router as completion_router + + else: + from api.routes.chat import chat_router as chat_router + from api.routes.completion import completion_router as completion_router + + app.include_router(chat_router, prefix=prefix, tags=["Chat Completion"]) + app.include_router(completion_router, prefix=prefix, tags=["Completion"]) + + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host=SETTINGS.host, port=SETTINGS.port, log_level="info") From d86eb84f2df9b333017258707058512ff432cc1b Mon Sep 17 00:00:00 2001 From: baisong666 <86873674+baisong666@users.noreply.github.com> Date: Thu, 15 Aug 2024 20:07:35 +0800 Subject: [PATCH 2/5] Update README_OPENBAYES.md --- README_OPENBAYES.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README_OPENBAYES.md b/README_OPENBAYES.md index a3f78ee..4fb8062 100644 --- a/README_OPENBAYES.md +++ b/README_OPENBAYES.md @@ -82,9 +82,9 @@ curl -X POST "http://127.0.0.1:8080/v1/chat/completions" \ ### 说明 目前只支持minicpm-v模型 下面是最大并发量测试结果: -GPU_MEMORY_UTILIZATION=0.9 并发量10 -GPU_MEMORY_UTILIZATION=0.8 并发量14 -GPU_MEMORY_UTILIZATION=0.7 并发量20 -GPU_MEMORY_UTILIZATION=0.6 并发量28 -GPU_MEMORY_UTILIZATION=0.5 并发量30 +GPU_MEMORY_UTILIZATION=0.9 并发量10 +GPU_MEMORY_UTILIZATION=0.8 并发量14 +GPU_MEMORY_UTILIZATION=0.7 并发量20 +GPU_MEMORY_UTILIZATION=0.6 并发量28 +GPU_MEMORY_UTILIZATION=0.5 并发量30 GPU_MEMORY_UTILIZATION=0.4 并发量36 From 73bdd56c466a3ae8be23f9e57b6b340939e39bd7 Mon Sep 17 00:00:00 2001 From: baisong666 <86873674+baisong666@users.noreply.github.com> Date: Thu, 15 Aug 2024 20:07:59 +0800 Subject: [PATCH 3/5] Update README_OPENBAYES.md --- README_OPENBAYES.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README_OPENBAYES.md b/README_OPENBAYES.md index 4fb8062..4b24623 100644 --- a/README_OPENBAYES.md +++ b/README_OPENBAYES.md @@ -80,8 +80,7 @@ curl -X POST "http://127.0.0.1:8080/v1/chat/completions" \ }' ``` ### 说明 -目前只支持minicpm-v模型 -下面是最大并发量测试结果: +目前只支持minicpm-v模型,下面是最大并发量测试结果: GPU_MEMORY_UTILIZATION=0.9 并发量10 GPU_MEMORY_UTILIZATION=0.8 并发量14 GPU_MEMORY_UTILIZATION=0.7 并发量20 From c3973aaf9ce4750c3db2480b278226dd3c3962b9 Mon Sep 17 00:00:00 2001 From: baisong666 <86873674+baisong666@users.noreply.github.com> Date: Thu, 15 Aug 2024 20:09:29 +0800 Subject: [PATCH 4/5] Update README_OPENBAYES.md --- README_OPENBAYES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_OPENBAYES.md b/README_OPENBAYES.md index 4b24623..123163e 100644 --- a/README_OPENBAYES.md +++ b/README_OPENBAYES.md @@ -80,7 +80,7 @@ curl -X POST "http://127.0.0.1:8080/v1/chat/completions" \ }' ``` ### 说明 -目前只支持minicpm-v模型,下面是最大并发量测试结果: +目前只支持minicpm-v模型,下面是最大并发量测试结果(RTX_8000 x 2): GPU_MEMORY_UTILIZATION=0.9 并发量10 GPU_MEMORY_UTILIZATION=0.8 并发量14 GPU_MEMORY_UTILIZATION=0.7 并发量20 From 88589b725d67400b17a7a5489a9957ad4436de0d Mon Sep 17 00:00:00 2001 From: 18792752963 Date: Fri, 16 Aug 2024 20:00:39 +0800 Subject: [PATCH 5/5] =?UTF-8?q?vllm=20=E6=8E=A5=E5=8F=A3=E6=94=AF=E6=8C=81?= =?UTF-8?q?vision(internvl)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env | 3 ++- README_OPENBAYES.md | 16 ++++++++-------- api/.DS_Store | Bin 6148 -> 8196 bytes api/vllm_routes/chat.py | 27 ++++++++++++++++++++++----- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/.env b/.env index adeb662..c8ba9e6 100644 --- a/.env +++ b/.env @@ -22,6 +22,7 @@ DTYPE=half MAX_NUM_SEQS=10 MAX_NUM_BATCHED_TOKENS=-1 GPU_MEMORY_UTILIZATION=0.9 - +# 使用internvl模型必须设置CONTEXT_LEN=4096 +CONTEXT_LEN=4096 TASKS=llm # TASKS=llm,rag \ No newline at end of file diff --git a/README_OPENBAYES.md b/README_OPENBAYES.md index a3f78ee..d3b4eb9 100644 --- a/README_OPENBAYES.md +++ b/README_OPENBAYES.md @@ -80,11 +80,11 @@ curl -X POST "http://127.0.0.1:8080/v1/chat/completions" \ }' ``` ### 说明 -目前只支持minicpm-v模型 -下面是最大并发量测试结果: -GPU_MEMORY_UTILIZATION=0.9 并发量10 -GPU_MEMORY_UTILIZATION=0.8 并发量14 -GPU_MEMORY_UTILIZATION=0.7 并发量20 -GPU_MEMORY_UTILIZATION=0.6 并发量28 -GPU_MEMORY_UTILIZATION=0.5 并发量30 -GPU_MEMORY_UTILIZATION=0.4 并发量36 +目前只支持minicpm-v、internvl模型 +下面是minicpm-v最大并发量测试结果: +GPU_MEMORY_UTILIZATION=0.9 并发量10 +GPU_MEMORY_UTILIZATION=0.8 并发量14 +GPU_MEMORY_UTILIZATION=0.7 并发量20 +GPU_MEMORY_UTILIZATION=0.6 并发量28 +GPU_MEMORY_UTILIZATION=0.5 并发量30 +GPU_MEMORY_UTILIZATION=0.4 并发量36 diff --git a/api/.DS_Store b/api/.DS_Store index a8019daeaf69c700617744ac8bbdd6eb34be875d..642dd58dc804161c562368afab5873be560306b9 100644 GIT binary patch literal 8196 zcmeHMU2GIp6h3F@&z)(ZLurQ}TUaTLQVSH?6dFp{Zd*emP-Oc@3of%eBORHYsXMc~ zKyA_oV~m16;opM~8lz~!gAc?sCgO|n!50yY8s$Z!Jn9qCXyUnZX9=|37a!E%+-B~X zbME=>%sKO&xqGJofQh`Z6JQ+xFgmzYYp7eM@o|1EX;NsZB#DFvFu^mtjGcB}dx>{s zh%yjmAj&|LfhYq}2L1~fpf#H}R%YE7qfsAaAj-i1G6Vem5Tk?3cqr$E3|<}71zP}; z6=V(?ozp%bz{Els59Pd&0hHzxy9W%R7;Z68z-c_j-ATqnIWMGu0}41`cx4QCC>UIw zaIwH0FfL@&M;VARurvdF?_LEta3K%=;{AIK6u_pxETryqNf*7DwC$z|x{#i+Ezj@h z_zI;J6_r)hteVxZ6Uk|}nDli&?=#Hc6QAuB`bB)wtx|knm*SI^8Z9}Zj7?Y>*GXFE%#>Q^ zQGi3F9?6*n3T!;%+WD;GrPS3K-Ol7~-8cJen>%|GzIn#yXOo_r^ZNtGl&a=E^QcLF zR>2$~Z-UnInTBV4%XPdlGv|?PiFb$~Yqj-EZDAAnwDn}(^3RH? znzW6w`UX30(7haT4GuJff0MSEu_pR!4BTF;roMWT~HB(wvJ1&@d995iTHz8elUX$KiQ#;5Bu44O$@Rk!l+ zu%^+&u$+QLF7z@%3vxY=$oAg(Gkb3@E~RxCk%7TW}e! zz=!Y&d=A&(I(!M=!VmBx`~*M4FYp`u4u8O(sNfoGz(%|u+wdXWhTE|ddvPD`$2bn+ zVI0Os@hDE9j%hT|#4HxDi05z?pT-OL0=|f^Y*}euv-V4ZMlJ zN)=L-R4p+{mg=Ro693#sO)3?~xX??zw3W`v7e`q9lD%AUG!H(k9Qap@V^!cM(3IZt%1hvrb#PZvBS)l;%<5ws~f8v6YHYt>!Fd z*SYtmn9{6P68nX^rIpkwV!lwfwlz1a5*3_Kx9@0H)D=`ta)9k@BNans_Z~%2E2yx< zSnobkfl6AaeS9J!>)&VVC3qh`fUEEsvGoS=^d{Uwz$&ao4L4#8TW|}u5<5F^7csOe zU}zsEupbW+OOFIBox&5C!jr_+C(tIQI>gko#MGy74xhni@i}}R&*MdW8DGIy@ilyt zcza23?=~ZcN?a5Y7s@lTkjy!@>zpEw5})q|N80ikSRu^o`2K&e{P+Lm!I3q}K$L-d zA_G`4kQ_)*DCJ(Ce6Jm&<0Kutald&X0|<4&cAR9e9VdDFABJ=uBUg@zhjLy>vQYcS WKLpT^&uING`)f5?{{vRu``up+#VPUt delta 151 zcmZp1XfcprU|?W$DortDU=RQ@Ie-{MGjUEV6q~50D9Qwq2aCls6fxv8lroesq;4!+ z&N$gXhGnvbz;q5nOEY5~1rvkG#|0ubpA)lW+}Ln}c`-W&hafXhFAxZD0|{4./)\n{question}' request.messages = minicpmv_messages - + + # 使用internvl模型需要解注释 + # internvl_messages = process_messages(request.messages) + # image = internvl_messages[0]['content'][0] + # question = internvl_messages[0]['content'][1] + # internvl_messages[0]['content'] = f"\n{question}\n" + # request.messages = internvl_messages + # stop_token_ids = [0, 92543, 92542, 0] + token_ids = engine.template.convert_messages_to_ids( messages=request.messages, tools=request.tools, @@ -142,13 +151,21 @@ async def create_chat_completion( "spaces_between_special_tokens", } kwargs = dictify(request, include=include) + # 使用minicpm-v模型 sampling_params = SamplingParams( stop=request.stop or [], stop_token_ids=request.stop_token_ids or [], max_tokens=request.max_tokens, **kwargs, ) - + # 使用internvl模型需要解注释 + # sampling_params = SamplingParams( + # stop=request.stop or [], + # stop_token_ids=stop_token_ids or [], + # max_tokens=request.max_tokens, + # **kwargs, + # ) + # Todo: support for lora lora_request = None try: