Skip to content

Commit a1863d4

Browse files
Copilot and bact committed
Rename Stats→Stat (singular) and global→global_ in TokenizationStat
- CharLevelStats → CharLevelStat (singular, matches BleuScore/RougeScore) - WordLevelStats → WordLevelStat - GlobalStats → GlobalStat - TokenizationStats → TokenizationStat; converted from functional TypedDict form to class form now that global_ is a valid identifier - "global" key → "global_" in compute_stats() return value - Update __init__.py __all__ and imports - Update tests: import names, assertIn("global_"), typed annotations - Update CHANGELOG migration notes Co-authored-by: bact <128572+bact@users.noreply.github.com> Agent-Logs-Url: https://github.com/PyThaiNLP/pythainlp/sessions/d0fa4ca1-06a6-48cc-b353-4df24554e700
1 parent 61f0c8f commit a1863d4

File tree

4 files changed

+42
-44
lines changed

4 files changed

+42
-44
lines changed

CHANGELOG.md

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,19 +35,21 @@ and this project adheres to
3535
fmeasure = scores["rouge1"]["fmeasure"]
3636
```
3737

38-
- `CharLevelStats`, `WordLevelStats`, `GlobalStats`, and `TokenizationStats`
38+
- `CharLevelStat`, `WordLevelStat`, `GlobalStat`, and `TokenizationStat`
3939
TypedDicts in `pythainlp.benchmarks`: give named, type-safe access to the
40-
dict returned by `word_tokenization.compute_stats()`.
40+
dict returned by `word_tokenization.compute_stats()`. The global-level key
41+
is `"global_"` (trailing underscore avoids the Python reserved word).
4142

4243
```python
4344
# Before (opaque nested dict)
4445
result = compute_stats(ref, hyp)
4546
tp = result["char_level"]["tp"]
4647

47-
# After (same access, now type-safe with TokenizationStats)
48-
from pythainlp.benchmarks import TokenizationStats
49-
result: TokenizationStats = compute_stats(ref, hyp)
48+
# After (same access, now type-safe with TokenizationStat)
49+
from pythainlp.benchmarks import TokenizationStat
50+
result: TokenizationStat = compute_stats(ref, hyp)
5051
tp = result["char_level"]["tp"]
52+
indicators = result["global_"]["tokenisation_indicators"]
5153
```
5254

5355
- `CorefResult` TypedDict is now exported from `pythainlp.coref`.

pythainlp/benchmarks/__init__.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55

66
__all__: list[str] = [
77
"BleuScore",
8-
"CharLevelStats",
9-
"GlobalStats",
8+
"CharLevelStat",
9+
"GlobalStat",
1010
"RougeScore",
11-
"TokenizationStats",
12-
"WordLevelStats",
11+
"TokenizationStat",
12+
"WordLevelStat",
1313
"benchmark",
1414
"bleu_score",
1515
"character_error_rate",
@@ -26,9 +26,9 @@
2626
word_error_rate,
2727
)
2828
from pythainlp.benchmarks.word_tokenization import (
29-
CharLevelStats,
30-
GlobalStats,
31-
TokenizationStats,
32-
WordLevelStats,
29+
CharLevelStat,
30+
GlobalStat,
31+
TokenizationStat,
32+
WordLevelStat,
3333
benchmark,
3434
)

pythainlp/benchmarks/word_tokenization.py

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
TAILING_SEP_RX: re.Pattern[str] = re.compile(f"{re.escape(SEPARATOR)}$")
3131

3232

33-
class CharLevelStats(TypedDict):
33+
class CharLevelStat(TypedDict):
3434
"""Character-level confusion matrix statistics for tokenization."""
3535

3636
tp: int
@@ -39,30 +39,26 @@ class CharLevelStats(TypedDict):
3939
fn: int
4040

4141

42-
class WordLevelStats(TypedDict):
42+
class WordLevelStat(TypedDict):
4343
"""Word-level tokenization statistics."""
4444

4545
correctly_tokenised_words: int
4646
total_words_in_sample: int
4747
total_words_in_ref_sample: int
4848

4949

50-
class GlobalStats(TypedDict):
51-
"""Global tokenization indicators as a binary indicator string."""
50+
class GlobalStat(TypedDict):
51+
"""Global tokenization indicator as a binary indicator string."""
5252

5353
tokenisation_indicators: str
5454

5555

56-
# Functional form is required because 'global' is a Python reserved keyword.
57-
TokenizationStats = TypedDict(
58-
"TokenizationStats",
59-
{
60-
"char_level": CharLevelStats,
61-
"word_level": WordLevelStats,
62-
"global": GlobalStats,
63-
},
64-
)
65-
"""Tokenization quality statistics at character, word, and global level."""
56+
class TokenizationStat(TypedDict):
57+
"""Tokenization quality statistics at character, word, and global level."""
58+
59+
char_level: CharLevelStat
60+
word_level: WordLevelStat
61+
global_: GlobalStat
6662

6763

6864
def _f1(precision: float, recall: float) -> float:
@@ -81,7 +77,7 @@ def _f1(precision: float, recall: float) -> float:
8177

8278
@overload
8379
def _flatten_result(
84-
my_dict: TokenizationStats, sep: str = ...
80+
my_dict: TokenizationStat, sep: str = ...
8581
) -> dict[str, Union[int, str]]: ...
8682

8783

@@ -105,7 +101,7 @@ def _flatten_result(
105101
106102
107103
:param my_dict: dictionary containing stats
108-
:type my_dict: TokenizationStats or
104+
:type my_dict: TokenizationStat or
109105
collections.abc.Mapping[str, collections.abc.Mapping[str, Union[int, str]]]
110106
:param str sep: separator between the two keys (default: ":")
111107
@@ -189,7 +185,7 @@ def preprocessing(txt: str, remove_space: bool = True) -> str:
189185

190186
def compute_stats(
191187
ref_sample: str, raw_sample: str
192-
) -> TokenizationStats:
188+
) -> TokenizationStat:
193189
"""Compute statistics for tokenization quality
194190
195191
These statistics include:
@@ -206,7 +202,7 @@ def compute_stats(
206202
:param str samples: samples that we want to evaluate
207203
208204
:return: metrics at character- and word-level and indicators of correctly tokenized words
209-
:rtype: TokenizationStats
205+
:rtype: TokenizationStat
210206
"""
211207
import numpy as np
212208

@@ -244,18 +240,18 @@ def compute_stats(
244240
tokenization_indicators_str = list(map(str, tokenization_indicators))
245241

246242
return {
247-
"char_level": CharLevelStats(
243+
"char_level": CharLevelStat(
248244
tp=c_tp,
249245
fp=c_fp,
250246
tn=c_tn,
251247
fn=c_fn,
252248
),
253-
"word_level": WordLevelStats(
249+
"word_level": WordLevelStat(
254250
correctly_tokenised_words=correctly_tokenised_words,
255251
total_words_in_sample=int(np.sum(sample_arr)),
256252
total_words_in_ref_sample=int(np.sum(ref_sample_arr)),
257253
),
258-
"global": GlobalStats(
254+
"global_": GlobalStat(
259255
tokenisation_indicators="".join(tokenization_indicators_str),
260256
),
261257
}

tests/extra/testx_benchmarks.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@
99

1010
from pythainlp.benchmarks import (
1111
BleuScore,
12-
CharLevelStats,
13-
GlobalStats,
12+
CharLevelStat,
13+
GlobalStat,
1414
RougeScore,
15-
TokenizationStats,
16-
WordLevelStats,
15+
TokenizationStat,
16+
WordLevelStat,
1717
bleu_score,
1818
rouge_score,
1919
word_tokenization,
@@ -59,31 +59,31 @@ def test_compute_stats(self):
5959
self.assertIsNotNone(result)
6060

6161
def test_compute_stats_return_type(self):
62-
"""Test that compute_stats returns a TokenizationStats typed dict."""
62+
"""Test that compute_stats returns a TokenizationStat typed dict."""
6363
ref = word_tokenization.preprocessing("อากาศ|ร้อน|มาก")
6464
act = word_tokenization.preprocessing("อากาศ|ร้อนมาก")
6565

66-
result: TokenizationStats = word_tokenization.compute_stats(ref, act)
66+
result: TokenizationStat = word_tokenization.compute_stats(ref, act)
6767

6868
self.assertIsInstance(result, dict)
6969
self.assertIn("char_level", result)
7070
self.assertIn("word_level", result)
71-
self.assertIn("global", result)
71+
self.assertIn("global_", result)
7272

73-
char: CharLevelStats = result["char_level"]
73+
char: CharLevelStat = result["char_level"]
7474
self.assertIsInstance(char, dict)
7575
self.assertIsInstance(char["tp"], int)
7676
self.assertIsInstance(char["fp"], int)
7777
self.assertIsInstance(char["tn"], int)
7878
self.assertIsInstance(char["fn"], int)
7979

80-
word: WordLevelStats = result["word_level"]
80+
word: WordLevelStat = result["word_level"]
8181
self.assertIsInstance(word, dict)
8282
self.assertIsInstance(word["correctly_tokenised_words"], int)
8383
self.assertIsInstance(word["total_words_in_sample"], int)
8484
self.assertIsInstance(word["total_words_in_ref_sample"], int)
8585

86-
glob: GlobalStats = result["global"]
86+
glob: GlobalStat = result["global_"]
8787
self.assertIsInstance(glob, dict)
8888
self.assertIsInstance(glob["tokenisation_indicators"], str)
8989

0 commit comments

Comments
 (0)