@@ -9,6 +9,15 @@ namespace Rust {
typedef uint32_t codepoint_t;
typedef std::vector<codepoint_t> string_t;

+ // These constants are used to compose and decompose Hangul syllables.
+ // See `Sample Code for Hangul Algorithms` in Section 3.12 of
+ // unicode.org/versions/Unicode15.0.0/ch03.pdf
+ const uint32_t S_BASE = 0xAC00;
+ const uint32_t L_BASE = 0x1100, V_BASE = 0x1161, T_BASE = 0x11A7;
+ const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
+ const uint32_t N_COUNT = V_COUNT * T_COUNT;
+ const uint32_t S_COUNT = L_COUNT * N_COUNT;
+
template <std::size_t SIZE>
int64_t
binary_search_ranges (
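A standalone sanity check (not part of the change) of what these constants imply under the Unicode 15.0 Hangul ranges: N_COUNT comes out to 588, S_COUNT to 11172, and the precomposed syllable block therefore runs from U+AC00 to U+D7A3.

#include <cassert>
#include <cstdint>

int
main ()
{
  const uint32_t S_BASE = 0xAC00;
  const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
  const uint32_t N_COUNT = V_COUNT * T_COUNT; // 21 * 28 = 588
  const uint32_t S_COUNT = L_COUNT * N_COUNT; // 19 * 588 = 11172

  // The precomposed syllable block spans U+AC00 .. U+D7A3.
  assert (N_COUNT == 588);
  assert (S_COUNT == 11172);
  assert (S_BASE + S_COUNT - 1 == 0xD7A3);
  return 0;
}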
@@ -115,10 +124,26 @@ recursive_decomp_cano (codepoint_t c, string_t &buf)
string_t
decomp_cano (string_t s)
{
-   // TODO: Algorithmic lookup for Hangul
  string_t buf;
  for (codepoint_t c : s)
-     recursive_decomp_cano (c, buf);
+     {
+       int64_t s_index = c - S_BASE;
+       if (0 <= s_index && s_index < S_COUNT)
+         {
+           // decompose Hangul algorithmically
+           uint32_t l = L_BASE + s_index / N_COUNT;
+           uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT;
+           uint32_t t = T_BASE + s_index % T_COUNT;
+           buf.push_back (l);
+           buf.push_back (v);
+           if (t != T_BASE)
+             buf.push_back (t);
+           continue;
+         }
+
+       // Current character is not Hangul
+       recursive_decomp_cano (c, buf);
+     }
  return buf;
}
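A worked example of the branch above, as a standalone sketch that copies the same arithmetic (decompose_hangul_syllable is illustrative, not a function from this patch): U+AC01 splits into L U+1100, V U+1161, T U+11A8, while U+AC00 has no trailing consonant and yields only L and V.

#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative copy of the decomposition arithmetic used above.
static std::vector<uint32_t>
decompose_hangul_syllable (uint32_t c)
{
  const uint32_t S_BASE = 0xAC00, L_BASE = 0x1100, V_BASE = 0x1161,
                 T_BASE = 0x11A7;
  const uint32_t V_COUNT = 21, T_COUNT = 28;
  const uint32_t N_COUNT = V_COUNT * T_COUNT;

  uint32_t s_index = c - S_BASE;
  std::vector<uint32_t> out;
  out.push_back (L_BASE + s_index / N_COUNT);
  out.push_back (V_BASE + (s_index % N_COUNT) / T_COUNT);
  uint32_t t = T_BASE + s_index % T_COUNT;
  if (t != T_BASE) // T_BASE itself means "no trailing consonant"
    out.push_back (t);
  return out;
}

int
main ()
{
  // U+AC01 (GAG) decomposes to L, V and T jamo.
  std::vector<uint32_t> gag = decompose_hangul_syllable (0xAC01);
  assert (gag.size () == 3 && gag[0] == 0x1100 && gag[1] == 0x1161
          && gag[2] == 0x11A8);

  // U+AC00 (GA) has no trailing consonant, so only L and V come back.
  std::vector<uint32_t> ga = decompose_hangul_syllable (0xAC00);
  assert (ga.size () == 2 && ga[0] == 0x1100 && ga[1] == 0x1161);
  return 0;
}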
@@ -132,7 +157,7 @@ sort_cano (string_t &s)
    {
      cc_here = lookup_cc (s[i]);
      cc_prev = lookup_cc (s[i - 1]);
-       if (cc_here >= 0 && cc_prev > cc_here)
+       if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
        {
          // swap
          int tmp = s[i];
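The tightened guard only swaps two neighbors when both carry a non-zero canonical combining class, so starters (ccc == 0) are never reordered. A self-contained sketch of that ordering pass, with a hypothetical toy_lookup_cc standing in for the generated combining-class table (the ccc values for U+0323 and U+0307 come from UnicodeData.txt):

#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical stand-in for lookup_cc: canonical combining class,
// 0 for starters and anything outside this toy table.
static int
toy_lookup_cc (uint32_t c)
{
  if (c == 0x0323) // COMBINING DOT BELOW, ccc = 220
    return 220;
  if (c == 0x0307) // COMBINING DOT ABOVE, ccc = 230
    return 230;
  return 0;
}

// Bubble adjacent combining marks into canonical order; starters
// (ccc == 0) are never moved, so they act as barriers.
static void
toy_sort_cano (std::vector<uint32_t> &s)
{
  for (std::size_t pass = 1; pass < s.size (); pass++)
    for (std::size_t i = 1; i < s.size (); i++)
      {
        int cc_here = toy_lookup_cc (s[i]);
        int cc_prev = toy_lookup_cc (s[i - 1]);
        if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
          std::swap (s[i], s[i - 1]);
      }
}

int
main ()
{
  // D, dot above, dot below: the two marks swap into ccc order
  // (220 before 230); the starter D stays put.
  std::vector<uint32_t> s = {0x0044, 0x0307, 0x0323};
  toy_sort_cano (s);
  // s is now {0x0044, 0x0323, 0x0307}
  return s[1] == 0x0323 ? 0 : 1;
}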
@@ -145,63 +170,103 @@ sort_cano (string_t &s)
}

string_t
- recomp (string_t s)
+ compose_hangul (string_t s)
{
-   // TODO: Algorithmic lookup for Hangul
  string_t buf;
-   if (s.size () > 0)
+   if (s.size () < 2)
+     return s;
+
+   codepoint_t last = s[0];
+   buf.push_back (last);
+   for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
    {
-       int last_class = -1;
-       // int starter_pos = 0; // Assume the first character is Starter. Correct?
-       // int target_pos = 1;
-       codepoint_t starter_ch = s[0];
-       for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
+       codepoint_t ch = s[src_pos];
+
+       // L V => LV
+       int64_t l_index = last - L_BASE;
+       if (0 <= l_index && l_index < L_COUNT)
        {
-           // get current character
-           codepoint_t ch = s[src_pos];
-           int ch_class = lookup_cc (ch);
-           tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
-           if (composite.has_value () && last_class < ch_class)
-             {
-               // ch can be composed
-               buf.push_back (composite.value ());
-               starter_ch = composite.value ();
-             }
-           else if (ch_class == 0)
+           int64_t v_index = ch - V_BASE;
+           if (0 <= v_index && v_index < V_COUNT)
            {
-               // ch is Starter and cannot be composed.
-               if (src_pos == 1)
-                 // FIXME: buggy?
-                 buf.push_back (starter_ch);
-               // starter_pos = target_pos;
-               starter_ch = ch;
-               last_class = -1;
-               buf.push_back (ch);
+               last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
+               // pop L
+               buf.pop_back ();
+               buf.push_back (last);
+               continue;
            }
-           else
+         }
+
+       // LV T => LVT
+       int64_t s_index = last - S_BASE;
+       if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0)
+         {
+           int64_t t_index = ch - T_BASE;
+           if (0 < t_index && t_index < T_COUNT)
            {
-               // ch is not Starter.
-               last_class = ch_class;
-               buf.push_back (ch);
+               last += t_index;
+               // pop LV
+               buf.pop_back ();
+               buf.push_back (last);
+               continue;
            }
        }
+       last = ch;
+       buf.push_back (last);
    }
  return buf;
}

- // TODO: remove
- /*
- void
- dump_string (std::vector<uint32_t> s)
+ string_t
+ recomp (string_t s)
{
-   std::cout << "dump=";
-   for (auto c : s)
+   // compose Hangul first
+   s = compose_hangul (s);
+
+   string_t buf;
+   if (s.size () < 2)
+     return s;
+
+   int last_class = -1;
+   // int starter_pos = 0; // Assume the first character is Starter. Correct?
+   // int target_pos = 1;
+   codepoint_t starter_ch = s[0];
+
+   for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
    {
-       std::cout << std::hex << c << ", ";
+       // get current character
+       codepoint_t ch = s[src_pos];
+
+       int ch_class = lookup_cc (ch);
+       tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
+       if (composite.has_value () && last_class < ch_class)
+         {
+           // ch can be composed
+           buf.push_back (composite.value ());
+           starter_ch = composite.value ();
+         }
+       else if (ch_class == 0)
+         {
+           // ch is Starter and cannot be composed.
+           if (src_pos == 1)
+             // FIXME: buggy?
+             buf.push_back (starter_ch);
+           starter_ch = ch;
+           last_class = -1;
+           buf.push_back (ch);
+         }
+       else
+         {
+           if (src_pos == 1)
+             // FIXME: buggy?
+             buf.push_back (starter_ch);
+           // ch is not Starter.
+           last_class = ch_class;
+           buf.push_back (ch);
+         }
    }
-   std::cout << std::endl;
+   return buf;
}
- */

string_t
nfc_normalize (string_t s)
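For the two composition steps above, a small standalone check of the arithmetic (same constants, not code from this patch): U+1100 plus U+1161 gives the LV syllable U+AC00, and adding the trailing-consonant index of U+11A8 gives the LVT syllable U+AC01.

#include <cassert>
#include <cstdint>

int
main ()
{
  const uint32_t S_BASE = 0xAC00, L_BASE = 0x1100, V_BASE = 0x1161,
                 T_BASE = 0x11A7;
  const uint32_t V_COUNT = 21, T_COUNT = 28;

  // L V => LV: U+1100 + U+1161 compose to U+AC00 (GA).
  uint32_t l_index = 0x1100 - L_BASE; // 0
  uint32_t v_index = 0x1161 - V_BASE; // 0
  uint32_t lv = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
  assert (lv == 0xAC00);

  // LV T => LVT: U+AC00 + U+11A8 compose to U+AC01 (GAG).
  uint32_t t_index = 0x11A8 - T_BASE; // 1
  assert (lv + t_index == 0xAC01);
  return 0;
}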
@@ -269,6 +334,16 @@ rust_utf8_normalize_test ()
  assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307});
  assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307});

+   // testcases for Hangul from Part0
+   assert_normalize ({0x1100, 0xac00, 0x11a8}, {0x1100, 0xac01});
+   assert_normalize ({0x1100, 0xac00, 0x11a8, 0x11a8}, {0x1100, 0xac01, 0x11a8});
+   // testcases for Hangul from Part1
+   assert_normalize ({0x3131}, {0x3131});
+   assert_normalize ({0x3132}, {0x3132});
+   // testcases for Hangul from Part3
+   assert_normalize ({0x1100, 0x0334, 0x1161}, {0x1100, 0x0334, 0x1161});
+   assert_normalize ({0xac54, 0x0334, 0x11ae}, {0xac54, 0x0334, 0x11ae});
+
  // TODO: add more testcases in
  // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
}
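The TODO points at the full conformance data. A rough sketch of how lines from NormalizationTest.txt could be fed into such checks; parse_normalization_test_line is a hypothetical helper, and the format assumed here (five semicolon-separated columns of space-separated hex code points, '#' comments, '@Part' markers) is the one documented in the file header. For NFC, column 2 is the expected result for columns 1 through 3.

#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical helper: split one data line of NormalizationTest.txt into
// its five columns of code points.  Lines starting with '#' or '@' (the
// @PartN markers) are not data lines and should be skipped by the caller.
static std::vector<std::vector<uint32_t>>
parse_normalization_test_line (const std::string &line)
{
  std::vector<std::vector<uint32_t>> columns;
  std::stringstream ss (line.substr (0, line.find ('#')));
  std::string column;
  while (columns.size () < 5 && std::getline (ss, column, ';'))
    {
      std::vector<uint32_t> codepoints;
      std::stringstream cs (column);
      std::string hex;
      while (cs >> hex)
        codepoints.push_back (std::stoul (hex, nullptr, 16));
      columns.push_back (codepoints);
    }
  return columns;
}

int
main ()
{
  // c1;c2;c3;c4;c5 -- for NFC the expectation is c2 == toNFC(c1..c3).
  std::vector<std::vector<uint32_t>> cols
    = parse_normalization_test_line ("1E0A;1E0A;0044 0307;1E0A;0044 0307;");
  // A driver could then call assert_normalize (cols[0], cols[1]), etc.
  return cols.size () == 5 ? 0 : 1;
}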