Skip to content

Commit 35ad3c5

Browse files
committed
gccrs: Normalize Hangul to NFC
gcc/rust/ChangeLog: * util/rust-unicode.cc (decomp_cano): Decompose Hangul. (sort_cano): Bugfix. (recomp): Compose Hangul. (compose_hangul): New function. (dump_string): Removed. (rust_utf8_normalize_test): Add tests. Signed-off-by: Raiki Tamura <[email protected]>
1 parent 86bfc84 commit 35ad3c5

File tree

1 file changed

+118
-43
lines changed

1 file changed

+118
-43
lines changed

gcc/rust/util/rust-unicode.cc

Lines changed: 118 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,15 @@ namespace Rust {
99
typedef uint32_t codepoint_t;
1010
typedef std::vector<codepoint_t> string_t;
1111

12+
// These constants are used to compose and decompose Hangul syllables.
13+
// See `Sample Code for Hangul Algorithms` in section 3.12 of
14+
// unicode.org/versions/Unicode15.0.0/ch03.pdf
15+
const uint32_t S_BASE = 0xAC00;
16+
const uint32_t L_BASE = 0x1100, V_BASE = 0x1161, T_BASE = 0x11A7;
17+
const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
18+
const uint32_t N_COUNT = V_COUNT * T_COUNT;
19+
const uint32_t S_COUNT = L_COUNT * N_COUNT;
20+
1221
template <std::size_t SIZE>
1322
int64_t
1423
binary_search_ranges (
@@ -115,10 +124,26 @@ recursive_decomp_cano (codepoint_t c, string_t &buf)
115124
string_t
116125
decomp_cano (string_t s)
117126
{
118-
// TODO: Algorithmic lookup for Hangul
119127
string_t buf;
120128
for (codepoint_t c : s)
121-
recursive_decomp_cano (c, buf);
129+
{
130+
int64_t s_index = c - S_BASE;
131+
if (0 <= s_index && s_index < S_COUNT)
132+
{
133+
// decompose Hangul algorithmically
134+
uint32_t l = L_BASE + s_index / N_COUNT;
135+
uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT;
136+
uint32_t t = T_BASE + s_index % T_COUNT;
137+
buf.push_back (l);
138+
buf.push_back (v);
139+
if (t != T_BASE)
140+
buf.push_back (t);
141+
continue;
142+
}
143+
144+
// Current character is not a Hangul syllable
145+
recursive_decomp_cano (c, buf);
146+
}
122147
return buf;
123148
}
124149

@@ -132,7 +157,7 @@ sort_cano (string_t &s)
132157
{
133158
cc_here = lookup_cc (s[i]);
134159
cc_prev = lookup_cc (s[i - 1]);
135-
if (cc_here >= 0 && cc_prev > cc_here)
160+
if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
136161
{
137162
// swap
138163
int tmp = s[i];
@@ -145,63 +170,103 @@ sort_cano (string_t &s)
145170
}
146171

147172
string_t
148-
recomp (string_t s)
173+
compose_hangul (string_t s)
149174
{
150-
// TODO: Algorithmic lookup for Hangul
151175
string_t buf;
152-
if (s.size () > 0)
176+
if (s.size () < 2)
177+
return s;
178+
179+
codepoint_t last = s[0];
180+
buf.push_back (last);
181+
for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
153182
{
154-
int last_class = -1;
155-
// int starter_pos = 0; // Assume the first character is Starter. Correct?
156-
// int target_pos = 1;
157-
codepoint_t starter_ch = s[0];
158-
for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
183+
codepoint_t ch = s[src_pos];
184+
185+
// L V => LV
186+
int64_t l_index = last - L_BASE;
187+
if (0 <= l_index && l_index < L_COUNT)
159188
{
160-
// get current character
161-
codepoint_t ch = s[src_pos];
162-
int ch_class = lookup_cc (ch);
163-
tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
164-
if (composite.has_value () && last_class < ch_class)
165-
{
166-
// ch can be composed
167-
buf.push_back (composite.value ());
168-
starter_ch = composite.value ();
169-
}
170-
else if (ch_class == 0)
189+
int64_t v_index = ch - V_BASE;
190+
if (0 <= v_index && v_index < V_COUNT)
171191
{
172-
// ch is Starter and cannot be composed.
173-
if (src_pos == 1)
174-
// FIXME: buggy?
175-
buf.push_back (starter_ch);
176-
// starter_pos = target_pos;
177-
starter_ch = ch;
178-
last_class = -1;
179-
buf.push_back (ch);
192+
last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
193+
// pop L
194+
buf.pop_back ();
195+
buf.push_back (last);
196+
continue;
180197
}
181-
else
198+
}
199+
200+
// LV T => LVT
201+
int64_t s_index = last - S_BASE;
202+
if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0)
203+
{
204+
int64_t t_index = ch - T_BASE;
205+
if (0 < t_index && t_index < T_COUNT)
182206
{
183-
// ch is not Starter.
184-
last_class = ch_class;
185-
buf.push_back (ch);
207+
last += t_index;
208+
// pop LV
209+
buf.pop_back ();
210+
buf.push_back (last);
211+
continue;
186212
}
187213
}
214+
last = ch;
215+
buf.push_back (last);
188216
}
189217
return buf;
190218
}
191219

192-
// TODO: remove
193-
/*
194-
void
195-
dump_string (std::vector<uint32_t> s)
220+
string_t
221+
recomp (string_t s)
196222
{
197-
std::cout << "dump=";
198-
for (auto c : s)
223+
// compose hangul first
224+
s = compose_hangul (s);
225+
226+
string_t buf;
227+
if (s.size () < 2)
228+
return s;
229+
230+
int last_class = -1;
231+
// int starter_pos = 0; // Assume the first character is Starter. Correct?
232+
// int target_pos = 1;
233+
codepoint_t starter_ch = s[0];
234+
235+
for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
199236
{
200-
std::cout << std::hex << c << ", ";
237+
// get current character
238+
codepoint_t ch = s[src_pos];
239+
240+
int ch_class = lookup_cc (ch);
241+
tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
242+
if (composite.has_value () && last_class < ch_class)
243+
{
244+
// ch can be composed
245+
buf.push_back (composite.value ());
246+
starter_ch = composite.value ();
247+
}
248+
else if (ch_class == 0)
249+
{
250+
// ch is Starter and cannot be composed.
251+
if (src_pos == 1)
252+
// FIXME: buggy?
253+
buf.push_back (starter_ch);
254+
starter_ch = ch;
255+
last_class = -1;
256+
buf.push_back (ch);
257+
}
258+
else
259+
{
260+
if (src_pos == 1)
261+
// FIXME: buggy?
262+
buf.push_back (starter_ch);
263+
// ch is not Starter.
264+
last_class = ch_class;
265+
buf.push_back (ch);
266+
}
201267
}
202-
std::cout << std::endl;
268+
return buf;
203269
}
204-
*/
205270

206271
string_t
207272
nfc_normalize (string_t s)
@@ -269,6 +334,16 @@ rust_utf8_normalize_test ()
269334
assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307});
270335
assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307});
271336

337+
// testcases for Hangul from Part0
338+
assert_normalize ({0x1100, 0xac00, 0x11a8}, {0x1100, 0xac01});
339+
assert_normalize ({0x1100, 0xac00, 0x11a8, 0x11a8}, {0x1100, 0xac01, 0x11a8});
340+
// testcases for Hangul from Part1
341+
assert_normalize ({0x3131}, {0x3131});
342+
assert_normalize ({0x3132}, {0x3132});
343+
// testcases for Hangul from Part3
344+
assert_normalize ({0x1100, 0x0334, 0x1161}, {0x1100, 0x0334, 0x1161});
345+
assert_normalize ({0xac54, 0x0334, 0x11ae}, {0xac54, 0x0334, 0x11ae});
346+
272347
// TODO: add more testcases in
273348
// https://unicode.org/Public/UNIDATA/NormalizationTest.txt
274349
}

0 commit comments

Comments
 (0)