
Commit 6e5be64

feat(demohouse/video_analyser): new model

1 parent 0051907

File tree

3 files changed (+20, −20)

- demohouse/video_analyser/README.md
- demohouse/video_analyser/backend/code/config.py
- demohouse/video_analyser/backend/code/main.py

demohouse/video_analyser/README.md

Lines changed: 4 additions & 8 deletions
@@ -14,8 +14,7 @@
 
 ### Related models
 
-- Doubao-pro-32k: mainly handles memory processing; when the current frame cannot directly answer the user's question, the LLM combines historical memory to provide a precise answer.
-- Doubao-vision-pro-32k: performs visual understanding of the video frames captured by the camera in real time.
+- Doubao-Seed-1.6-flash: performs visual understanding of the video frames captured by the camera in real time and answers questions; when the current frame cannot directly answer the user's question, the model combines historical memory to provide a precise answer.
 - Doubao speech synthesis: converts the model's text answers into natural, fluent speech output.
 - Doubao streaming speech recognition: transcribes the user's spoken questions into text so the model can understand and answer them.
 

@@ -29,8 +28,7 @@
 - [Node.js](https://nodejs.org/) (version 16.2.0 or later; the Node.js 18 LTS release is recommended)
 - A Volcengine Ark API Key [reference docs](https://www.volcengine.com/docs/82379/1298459#api-key-%E7%AD%BE%E5%90%8D%E9%89%B4%E6%9D%83)
 - The APP ID and Access Token for the speech products (see the appendix for how to obtain them)
-- An endpoint created for Doubao-Vision-Pro 32K [reference docs](https://www.volcengine.com/docs/82379/1099522#594199f1)
-- An endpoint created for Doubao-Pro 32K [reference docs](https://www.volcengine.com/docs/82379/1099522#594199f1)
+- Access to the Doubao-Seed-1.6-flash model enabled
 
 ## Quick Start
 

@@ -44,12 +42,10 @@
 ```
 2. Edit the configuration
 
-- Edit the settings in `backend/code/config.py`, filling in the API keys, endpoint IDs, APP ID, and Access Token you just obtained.
+- Edit the settings in `backend/code/config.py`, filling in the APP ID and Access Token you just obtained.
 
 | Variable name | Description |
 | ------------ | --------------------------------- |
-| VLM_ENDPOINT | doubao-vision-pro 32k endpoint ID |
-| LLM_ENDPOINT | doubao-pro 32k endpoint ID |
 | TTS_APP_ID | APP ID for the speech synthesis service |
 | TTS_ACCESS_TOKEN | Access Token for the speech synthesis service |
 

@@ -106,7 +102,7 @@
 - Pure frames sampled in real time: every second the frontend sends a request containing only a single frame; the backend stores it in long-term memory for later answers and sends no reply to this request.
 - Requests carrying a user question: when the frontend detects a user question via VAD, it sends the transcribed question text together with the frame captured while the user was speaking, and the backend replies with speech.
 
-2. Since every `bot/chat` request is stateless, the frontend passes an X-Context-Id in the header so the backend can store and recall the user's historical video information.
+2. Since every `bot/chat` request is stateless, the frontend passes a context_id in the metadata so the backend can store and recall the user's historical video information.
 
 ### Model reply strategy
 
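The hunk above moves the session key from an X-Context-Id header into a context_id field in the request metadata. To make the contract concrete, here is a minimal sketch of the two request shapes this implies, assuming an OpenAI-compatible `bot/chat` route; the URL, the `model` value, and the base64 payloads are illustrative assumptions, not taken from this commit:

```python
# Minimal sketch of the two bot/chat request shapes (hypothetical URL and values).
import requests

BOT_CHAT_URL = "http://localhost:8080/api/v3/bots/chat/completions"  # assumed dev address

def send_frame(context_id: str, frame_b64: str) -> None:
    """Frame-only request: an empty text part marks it; the backend stores the frame and returns nothing."""
    requests.post(BOT_CHAT_URL, json={
        "model": "video_analyser",  # illustrative bot name
        "metadata": {"context_id": context_id},  # replaces the old X-Context-Id header
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": ""},  # empty text => frame-only request
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{frame_b64}"}},
            ],
        }],
    })

def ask_question(context_id: str, question: str, frame_b64: str) -> requests.Response:
    """Question request: the transcribed question plus the frame captured while the user spoke."""
    return requests.post(BOT_CHAT_URL, json={
        "model": "video_analyser",
        "metadata": {"context_id": context_id},
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{frame_b64}"}},
            ],
        }],
    })
```

The empty text part is exactly what the `is_image` check in `main.py` below keys on to tell stored frames apart from questions.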

demohouse/video_analyser/backend/code/config.py

Lines changed: 2 additions & 2 deletions
@@ -9,8 +9,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-VLM_ENDPOINT = "<ENDPOINT_ID_FOR_DOUBAO_VISION_PRO>"
-LLM_ENDPOINT = "<ENDPOINT_ID_FOR_LLM>" # 256K model for a short term memory
+VISUAL_SUMMARY_ENDPOINT = "doubao-seed-1-6-flash-250615"
+QUESTION_ANSWER_ENDPOINT = "doubao-seed-1-6-flash-250615"
 
 TTS_APP_ID = "<TTS_APP_ID>"
 TTS_ACCESS_TOKEN = "<TTS_ACCESS_TOKEN>"
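Both constants now hold the model ID `doubao-seed-1-6-flash-250615` rather than provisioned endpoint IDs, which is why the endpoint-creation prerequisite disappears from the README. A minimal sketch of calling the model by ID, assuming the Ark runtime SDK with `ARK_API_KEY` set in the environment (the prompt text is illustrative):

```python
# Minimal sketch: invoking the model by model ID rather than an endpoint ID.
# Assumes ARK_API_KEY is exported; the prompt text is illustrative.
from volcenginesdkarkruntime import Ark

client = Ark()  # picks up ARK_API_KEY from the environment
completion = client.chat.completions.create(
    model="doubao-seed-1-6-flash-250615",  # same model ID as in config.py
    messages=[{"role": "user", "content": "Describe what is happening in this frame."}],
)
print(completion.choices[0].message.content)
```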

demohouse/video_analyser/backend/code/main.py

Lines changed: 14 additions & 10 deletions
@@ -20,16 +20,19 @@
 
 import prompt
 import utils
-from config import LLM_ENDPOINT, VLM_ENDPOINT, TTS_ACCESS_TOKEN, TTS_APP_ID
+from config import VISUAL_SUMMARY_ENDPOINT, QUESTION_ANSWER_ENDPOINT, TTS_ACCESS_TOKEN, TTS_APP_ID
+
+from volcenginesdkarkruntime.types.chat.completion_create_params import Thinking
+
+from volcenginesdkarkruntime.types.chat.chat_completion_content_part_text_param import ChatCompletionContentPartTextParam
 
 from arkitect.core.component.llm import BaseChatLanguageModel
-from arkitect.core.component.llm.model import (
+from arkitect.types.llm.model import (
     ArkChatCompletionChunk,
     ArkChatParameters,
     ArkChatRequest,
     ArkChatResponse,
     ArkMessage,
-    ChatCompletionMessageTextPart,
     Response,
 )
 from arkitect.core.component.tts import (
@@ -58,7 +61,7 @@ async def get_request_messages_for_llm(
     request_messages = await contexts.get_history(context_id)
     if isinstance(request.messages[-1].content, list):
         assert isinstance(
-            request.messages[-1].content[0], ChatCompletionMessageTextPart
+            request.messages[-1].content[0], ChatCompletionContentPartTextParam
         )
         text = request.messages[-1].content[0].text
     else:
@@ -74,7 +77,7 @@ async def chat_with_vlm(
     parameters: ArkChatParameters,
 ) -> Tuple[bool, Optional[AsyncIterable[ArkChatCompletionChunk]]]:
     vlm = BaseChatLanguageModel(
-        endpoint_id=VLM_ENDPOINT,
+        endpoint_id=VISUAL_SUMMARY_ENDPOINT,
         messages=[ArkMessage(role="system", content=prompt.VLM_CHAT_PROMPT)]
         + [request.messages[-1]],
         parameters=parameters,
@@ -108,7 +111,7 @@ async def llm_answer(
         contexts, context_id, request, prompt.LLM_PROMPT
     )
     llm = BaseChatLanguageModel(
-        endpoint_id=LLM_ENDPOINT,
+        endpoint_id=QUESTION_ANSWER_ENDPOINT,
         messages=request_messages,
         parameters=parameters,
     )
@@ -180,7 +183,7 @@ async def summarize_image(
         ArkMessage(role="system", content=prompt.VLM_PROMPT)
     ] + request.messages
     vlm = BaseChatLanguageModel(
-        endpoint_id=VLM_ENDPOINT,
+        endpoint_id=VISUAL_SUMMARY_ENDPOINT,
         messages=request_messages,
         parameters=parameters,
     )
@@ -195,7 +198,7 @@ async def default_model_calling(
     request: ArkChatRequest,
 ) -> AsyncIterable[Union[ArkChatCompletionChunk, ArkChatResponse]]:
     # local in-memory storage should be changed to other storage in production
-    context_id: Optional[str] = get_headers().get("X-Context-Id", None)
+    context_id: Optional[str] = request.metadata["context_id"]
     assert context_id is not None
     contexts: utils.Storage = utils.CoroutineSafeMap.get_instance_sync()
     if not await contexts.contains(context_id):
@@ -205,10 +208,11 @@ async def default_model_calling(
     # Use VLM to summarize the image asynchronously and return immediately
     is_image = (
         isinstance(request.messages[-1].content, list)
-        and isinstance(request.messages[-1].content[0], ChatCompletionMessageTextPart)
+        and isinstance(request.messages[-1].content[0], ChatCompletionContentPartTextParam)
         and request.messages[-1].content[0].text == ""
     )
     parameters = ArkChatParameters(**request.__dict__)
+    parameters.thinking = Thinking(type="disabled")
     if is_image:
         _ = asyncio.create_task(
             summarize_image(contexts, request, parameters, context_id)
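Setting `parameters.thinking = Thinking(type="disabled")` switches off the model's reasoning phase on every call, which keeps time-to-first-token low enough for a real-time voice loop. A standalone sketch of the same switch, assuming the Ark runtime SDK accepts the `thinking` parameter mirrored by the `Thinking` import above (model ID and prompt are illustrative):

```python
# Minimal sketch: disabling the thinking phase for a latency-sensitive streamed call.
# Assumes ARK_API_KEY is exported; model ID and prompt are illustrative.
from volcenginesdkarkruntime import Ark

client = Ark()
stream = client.chat.completions.create(
    model="doubao-seed-1-6-flash-250615",
    messages=[{"role": "user", "content": "What is shown in this frame?"}],
    thinking={"type": "disabled"},  # skip reasoning tokens to cut first-token latency
    stream=True,
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```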
@@ -248,7 +252,7 @@ async def default_model_calling(
         await tts_client.close()
     text = ""
     if isinstance(request.messages[-1].content, list) and isinstance(
-        request.messages[-1].content[0], ChatCompletionMessageTextPart
+        request.messages[-1].content[0], ChatCompletionContentPartTextParam
     ):
         text = request.messages[-1].content[0].text
     elif isinstance(request.messages[-1].content, str):
