Steps to reproduce the bug
curl -v http://localhost:6969/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gemini-3.0-pro",
"messages": [{"role": "user", "content": "Write a 50 word story."}],
"stream": true
}'
Apparently, when "stream" is set to true, the endpoint needs to return a StreamingResponse (from fastapi.responses import StreamingResponse) emitting server-sent events rather than a plain application/json body; otherwise streaming clients terminate the connection almost immediately.
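For reference, an OpenAI-compatible streaming client expects a text/event-stream body made of data: lines, roughly like this (the values are illustrative, shaped like what the patched endpoint below emits):

data: {"id": "chatcmpl-1700000000", "object": "chat.completion.chunk", "created": 1700000000, "model": "gemini-3.0-pro", "choices": [{"index": 0, "delta": {"content": "Once upon a time..."}, "finish_reason": "stop"}]}

data: [DONE]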
Wasted a lot of time trying to debug this, but got it working with the vibe-coded changes below to src/app/endpoints/chat.py.
# src/app/endpoints/chat.py
import time
import json

from fastapi import APIRouter, HTTPException
from fastapi.responses import StreamingResponse

from app.logger import logger
from schemas.request import GeminiRequest, OpenAIChatRequest
from app.services.gemini_client import get_gemini_client, GeminiClientNotInitializedError
from app.services.session_manager import get_translate_session_manager

router = APIRouter()


def estimate_tokens(text: str) -> int:
    if not text:
        return 0
    return max(1, len(text) // 4)


@router.post("/translate")
async def translate_chat(request: GeminiRequest):
    try:
        gemini_client = get_gemini_client()
    except GeminiClientNotInitializedError as e:
        raise HTTPException(status_code=503, detail=str(e))

    session_manager = get_translate_session_manager()
    if not session_manager:
        raise HTTPException(status_code=503, detail="Session manager is not initialized.")

    try:
        # This call now correctly uses the fixed session manager
        response = await session_manager.get_response(request.model, request.message, request.files)
        return {"response": response.text}
    except Exception as e:
        logger.error(f"Error in /translate endpoint: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Error during translation: {str(e)}")


def convert_to_openai_format(response_text: str, model: str, prompt_text: str, stream: bool = False):
    p_tokens = estimate_tokens(prompt_text)
    c_tokens = estimate_tokens(response_text)
    return {
        "id": f"chatcmpl-{int(time.time())}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": response_text},
            "finish_reason": "stop",
        }],
        "usage": {
            "prompt_tokens": p_tokens,
            "completion_tokens": c_tokens,
            "total_tokens": p_tokens + c_tokens,
        },
    }


@router.post("/v1/chat/completions")
async def chat_completions(request: OpenAIChatRequest):
    try:
        gemini_client = get_gemini_client()
    except GeminiClientNotInitializedError as e:
        raise HTTPException(status_code=503, detail=str(e))

    if not request.messages:
        raise HTTPException(status_code=400, detail="No messages provided.")

    # Build the conversation prompt
    conversation_parts = []
    for msg in request.messages:
        role, content = msg.get("role", "user"), msg.get("content", "")
        if content:
            conversation_parts.append(f"{role.capitalize()}: {content}")
    final_prompt = "\n\n".join(conversation_parts)

    is_stream = request.stream if request.stream is not None else False

    try:
        # 1. Wait for the full Gemini response
        response = await gemini_client.generate_content(
            message=final_prompt,
            model=request.model.value if request.model else "gemini-2.5-flash"
        )
        full_text = response.text

        if is_stream:
            # 2. Return as a single-chunk stream
            async def single_chunk_generator():
                chat_id = f"chatcmpl-{int(time.time())}"
                model_name = request.model.value if request.model else "gemini"
                # First chunk: the entire content
                payload = {
                    "id": chat_id,
                    "object": "chat.completion.chunk",
                    "created": int(time.time()),
                    "model": model_name,
                    "choices": [{
                        "index": 0,
                        "delta": {"content": full_text},
                        "finish_reason": "stop"
                    }]
                }
                yield f"data: {json.dumps(payload)}\n\n"
                # Second chunk: mandatory [DONE] signal for OpenAI compatibility
                yield "data: [DONE]\n\n"

            return StreamingResponse(single_chunk_generator(), media_type="text/event-stream")

        # Non-streaming (standard JSON)
        return convert_to_openai_format(full_text, request.model.value if request.model else "gemini", final_prompt)
    except Exception as e:
        logger.error(f"Error in chat completions: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
There is also some logic to estimate token counts instead of hardcoding zeros in the usage fields, since I initially suspected the issue was related to that.
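The estimate is just the rough four-characters-per-token heuristic, so the usage numbers are approximations rather than real token counts; for example:

estimate_tokens("Write a 50 word story.")  # 22 chars // 4 -> 5
estimate_tokens("")                        # empty input -> 0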
You can use Claude/Gemini to diff the files, apply the patch, and test with the curl command above.
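It is also worth checking the non-streaming path, i.e. the same request with "stream": false, which should return the plain chat.completion JSON with the estimated usage fields:

curl -v http://localhost:6969/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "gemini-3.0-pro",
"messages": [{"role": "user", "content": "Write a 50 word story."}],
"stream": false
}'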
My local copy of the code above works well with OpenCode and curl, but please do your own testing before merging.
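Besides curl, a quick way to exercise the streaming path is the official openai Python client pointed at the proxy (base_url taken from the curl command above; the api_key is just a placeholder, assuming the local server doesn't validate it):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:6969/v1", api_key="unused")  # placeholder key

stream = client.chat.completions.create(
    model="gemini-3.0-pro",
    messages=[{"role": "user", "content": "Write a 50 word story."}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content  # content arrives in the single chunk the patch emits
    if delta:
        print(delta, end="", flush=True)
print()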