Skip to content

Commit 3a5f415

Browse files
committed
fix(translator): robust LLM response parsing and expose language in API
Normalize model output lines (markdown, list prefixes, case) so that LANGUAGE/TRANSLATION are detected reliably and NodeBB no longer mislabels non-English posts as English when the model formats its replies loosely.

- Add _normalize_response_line and extend _parse_model_content to return the detected language
- Return the optional language from translate_content/query_llm_robust and include it in TranslateResponse
- Expand unit tests for **LANGUAGE:**, numbered lists, and Language:/Translation: casing
1 parent 9d5e520 commit 3a5f415

3 files changed

Lines changed: 77 additions & 27 deletions

File tree

src/api.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,17 @@
77
class TranslateResponse(BaseModel):
    """Response body returned by the translator endpoint."""

    # True when the post was classified as English (also the fallback value
    # when the model reply could not be used).
    is_english: bool
    # Translated text, or the original post when no translation was obtained.
    translated_content: str
    # Language name as reported by the model; None when it was not detected.
    language: str | None = None
1011

1112

1213
app = FastAPI()
1314

1415

1516
@app.get("/")
def translator_root(content: str = Query(default="")) -> TranslateResponse:
    """Translate the ``content`` query parameter and report its language.

    Surrounding whitespace is trimmed from ``content`` before it is passed to
    ``translate_content``; the three values it returns are wrapped, unchanged,
    in a ``TranslateResponse``.
    """
    english, text, detected = translate_content(content.strip())
    return TranslateResponse(
        is_english=english,
        translated_content=text,
        language=detected,
    )

src/translator.py

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,20 +28,34 @@ def _user_prompt(post: str) -> str:
2828
Text: {post}"""
2929

3030

31-
def _parse_model_content(raw: str, post: str) -> tuple[bool, str]:
31+
def _normalize_response_line(line: str) -> str:
    """Strip list markers and markdown decoration so LANGUAGE:/TRANSLATION: can be found.

    The result is meant for label detection: a caller lower-cases it and
    checks whether it starts with ``language:`` or ``translation:``.

    Handled decorations:
      * one leading list marker: ``1.``, ``1)``, ``-``, ``*`` or ``•``
      * leading markdown noise: ``*``, ``_``, ``#``, ``>``, backticks
      * emphasis closed *before* the colon, e.g. ``**LANGUAGE**: French`` or
        ``*Language* : French`` (rewritten to ``LANGUAGE: French``)

    Text after the first colon is returned untouched, so residue such as the
    ``**`` in ``LANGUAGE:** French`` is left for the caller to strip.
    """
    s = line.strip()
    # Remove a single list marker ("1.", "1)", "-", "*", "•") and its spacing.
    s = re.sub(r"^(\d+[.)]|[*•-])\s+", "", s)
    # Remove leading emphasis / heading / blockquote characters and spacing.
    s = re.sub(r"^[\s*_#>`]+", "", s)

    label, colon, rest = s.partition(":")
    if colon:
        # Emphasis may close before the colon ("LANGUAGE**:"); drop it so the
        # label is immediately followed by ":".
        bare = re.sub(r"[\s*_`]+$", "", label)
        # Only rewrite single-word labels so ordinary prose containing a
        # colon is passed through unchanged.
        if bare.isalpha():
            s = f"{bare}:{rest}"
    return s
38+
39+
40+
def _parse_model_content(raw: str, post: str) -> tuple[bool, str, str | None]:
    """Extract ``(is_english, translation, language)`` from a raw model reply.

    Args:
        raw: Text of the model's reply. Anything up to and including the last
            ``</redacted_thinking>`` marker is discarded.
        post: The original post, used as the fallback translation.

    Returns:
        ``(True, post, None)`` when no ``LANGUAGE:`` line is found. Otherwise
        ``(is_english, translation, language)`` where ``is_english`` is true
        iff the language equals "english" case-insensitively, and
        ``translation`` falls back to ``post`` when the reply carries no
        (or only an empty) ``TRANSLATION:`` value.

    Labels are matched case-insensitively on lines cleaned by
    ``_normalize_response_line``. Lines that follow a ``TRANSLATION:`` line
    are treated as part of the translation until a ``LANGUAGE:`` line or the
    end of the reply, so multi-line translations are not cut to one line.
    """
    content = raw.strip()
    if "</redacted_thinking>" in content:
        content = content.split("</redacted_thinking>")[-1].strip()

    detected_language: str | None = None
    translation_parts: list[str] = []
    collecting = False  # True while we are inside the translation body
    for line in content.splitlines():
        norm = _normalize_response_line(line)
        low = norm.lower()
        if low.startswith("language:"):
            # strip("*") removes bold residue such as "LANGUAGE:** French".
            detected_language = norm.split(":", 1)[1].strip().strip("*").strip()
            collecting = False
        elif low.startswith("translation:"):
            first = norm.split(":", 1)[1].strip().strip("*").strip()
            # A later TRANSLATION: line replaces an earlier one.
            translation_parts = [first]
            collecting = True
        elif collecting:
            # Continuation lines are kept raw (not normalized) so list
            # markers and markdown inside the translated text survive.
            translation_parts.append(line.rstrip())

    if detected_language is None:
        return (True, post, None)

    # An empty translation would blank the post downstream; keep the original.
    translation = "\n".join(translation_parts).strip() or post
    is_english = detected_language.lower() == "english"
    return (is_english, translation, detected_language)
4559

4660

4761
def _httpx_timeout() -> httpx.Timeout:
@@ -55,12 +69,12 @@ def _strip_html(text: str) -> str:
5569
return re.sub(r"<[^>]+>", "", text).strip()
5670

5771

58-
def translate_content(content: str) -> tuple[bool, str]:
72+
def translate_content(content: str) -> tuple[bool, str, str | None]:
    """Remove HTML tags from *content* and pass the text to ``query_llm_robust``.

    The original *content* is sent instead when it is empty or when stripping
    leaves nothing. The result of ``query_llm_robust`` is returned unchanged.
    """
    if not content:
        return query_llm_robust(content)
    stripped = _strip_html(content)
    return query_llm_robust(stripped if stripped else content)
6175

6276

63-
def query_llm_robust(post: str) -> tuple[bool, str]:
77+
def query_llm_robust(post: str) -> tuple[bool, str, str | None]:
6478
url = f"{_ollama_base_url()}/api/chat"
6579
payload: dict[str, Any] = {
6680
"model": _ollama_model(),
@@ -73,13 +87,13 @@ def query_llm_robust(post: str) -> tuple[bool, str]:
7387
response.raise_for_status()
7488
data = response.json()
7589
except Exception:
76-
return (True, post)
90+
return (True, post, None)
7791

7892
message = data.get("message")
7993
if not isinstance(message, dict):
80-
return (True, post)
94+
return (True, post, None)
8195
content = message.get("content")
8296
if not isinstance(content, str):
83-
return (True, post)
97+
return (True, post, None)
8498

8599
return _parse_model_content(content, post)

test/unit/test_translator.py

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,49 +15,77 @@
1515

1616

1717
@pytest.mark.parametrize(
18-
("raw", "post", "expected_english", "expected_text"),
18+
("raw", "post", "expected_english", "expected_text", "expected_language"),
1919
[
2020
(
2121
"LANGUAGE: English\nTRANSLATION: Hello, world.",
2222
"Hello, world.",
2323
True,
2424
"Hello, world.",
25+
"English",
2526
),
2627
(
2728
"LANGUAGE: French\nTRANSLATION: Good day.",
2829
"Bonjour.",
2930
False,
3031
"Good day.",
32+
"French",
3133
),
3234
(
3335
"TRANSLATION: only this line",
3436
"some input",
3537
True,
3638
"some input",
39+
None,
3740
),
3841
(
3942
"</redacted_thinking>\nLANGUAGE: Spanish\nTRANSLATION: Hello.",
4043
"Hola",
4144
False,
4245
"Hello.",
46+
"Spanish",
4347
),
4448
(
4549
"LANGUAGE: German\nTRANSLATION: Hi there",
4650
"src",
4751
False,
4852
"Hi there",
53+
"German",
4954
),
5055
(
5156
"No LANGUAGE line at all.\nJust prose.",
5257
"orig",
5358
True,
5459
"orig",
60+
None,
5561
),
5662
(
5763
"LANGUAGE: english\nTRANSLATION: Same",
5864
"x",
5965
True,
6066
"Same",
67+
"english",
68+
),
69+
(
70+
"**LANGUAGE:** French\nTRANSLATION: Hello",
71+
"Bonjour",
72+
False,
73+
"Hello",
74+
"French",
75+
),
76+
(
77+
"1. LANGUAGE: French\nTRANSLATION: Hello",
78+
"Bonjour",
79+
False,
80+
"Hello",
81+
"French",
82+
),
83+
(
84+
"Language: French\nTranslation: Hello",
85+
"Bonjour",
86+
False,
87+
"Hello",
88+
"French",
6189
),
6290
],
6391
)
@@ -66,8 +94,13 @@ def test_parse_model_content(
6694
post: str,
6795
expected_english: bool,
6896
expected_text: str,
97+
expected_language: str | None,
6998
) -> None:
70-
assert _parse_model_content(raw, post) == (expected_english, expected_text)
99+
assert _parse_model_content(raw, post) == (
100+
expected_english,
101+
expected_text,
102+
expected_language,
103+
)
71104

72105

73106
def test_user_prompt_includes_post_text() -> None:
@@ -94,9 +127,10 @@ def test_query_llm_robust_posts_chat_and_parses_response(monkeypatch: pytest.Mon
94127
)
95128
)
96129

97-
is_english, text = query_llm_robust("in")
130+
is_english, text, language = query_llm_robust("in")
98131
assert is_english is True
99132
assert text == "out"
133+
assert language == "English"
100134
assert route.called
101135
payload = json.loads(route.calls[0].request.content.decode())
102136
assert payload["model"] == "qwen3:0.6b"
@@ -112,7 +146,7 @@ def test_query_llm_robust_connect_error_returns_original(monkeypatch: pytest.Mon
112146
respx.post(_DEFAULT_CHAT_URL).mock(
113147
side_effect=httpx.ConnectError("refused", request=req),
114148
)
115-
assert query_llm_robust("fall") == (True, "fall")
149+
assert query_llm_robust("fall") == (True, "fall", None)
116150

117151

118152
@respx.mock
@@ -123,7 +157,7 @@ def test_query_llm_robust_missing_message_dict_returns_original(
123157
respx.post(_DEFAULT_CHAT_URL).mock(
124158
return_value=httpx.Response(200, json={"done": True}),
125159
)
126-
assert query_llm_robust("z") == (True, "z")
160+
assert query_llm_robust("z") == (True, "z", None)
127161

128162

129163
@respx.mock
@@ -137,20 +171,20 @@ def test_query_llm_robust_non_string_message_content_returns_original(
137171
json={"message": {"role": "assistant", "content": None}, "done": True},
138172
),
139173
)
140-
assert query_llm_robust("y") == (True, "y")
174+
assert query_llm_robust("y") == (True, "y", None)
141175

142176

143177
def test_translate_content_delegates_to_query_llm_robust(
144178
monkeypatch: pytest.MonkeyPatch,
145179
) -> None:
146180
calls: list[str] = []
147181

148-
def fake(post: str) -> tuple[bool, str]:
182+
def fake(post: str) -> tuple[bool, str, str | None]:
149183
calls.append(post)
150-
return (True, "ok")
184+
return (True, "ok", "English")
151185

152186
monkeypatch.setattr("src.translator.query_llm_robust", fake)
153-
assert translate_content("hi") == (True, "ok")
187+
assert translate_content("hi") == (True, "ok", "English")
154188
assert calls == ["hi"]
155189

156190

@@ -159,10 +193,10 @@ def test_translate_content_strips_html(
159193
) -> None:
160194
calls: list[str] = []
161195

162-
def fake(post: str) -> tuple[bool, str]:
196+
def fake(post: str) -> tuple[bool, str, str | None]:
163197
calls.append(post)
164-
return (False, "translated")
198+
return (False, "translated", "French")
165199

166200
monkeypatch.setattr("src.translator.query_llm_robust", fake)
167-
assert translate_content("<p>Bonjour</p>") == (False, "translated")
201+
assert translate_content("<p>Bonjour</p>") == (False, "translated", "French")
168202
assert calls == ["Bonjour"]

0 commit comments

Comments
 (0)