Skip to content

Commit 3c73225

Browse files
committed
New internal interface for fast text conversion in mbstring
When converting text to/from wchars, mbstring makes one function call for each and every byte or wchar to be converted. Typically, each of these conversion functions contains a state machine, and its state has to be restored and then saved for every single one of these calls. It doesn't take much to see that this is grossly inefficient. Instead of converting one byte or wchar on each call, the new conversion functions will either fill up or drain a whole buffer of wchars on each call. In benchmarks, this is about 3-10× faster. Adding the new, faster conversion functions for all supported legacy text encodings still needs some work. Also, all the code which uses the old-style conversion functions needs to be converted to use the new ones. After that, the old code can be dropped. (The mailparse extension will also have to be fixed up so it will still compile.)
1 parent 0b5d371 commit 3c73225

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+2436
-414
lines changed

ext/mbstring/libmbfl/filters/mbfilter_7bit.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ const mbfl_encoding mbfl_encoding_7bit = {
3939
NULL,
4040
MBFL_ENCTYPE_SBCS,
4141
NULL,
42+
NULL,
43+
NULL,
4244
NULL
4345
};
4446

ext/mbstring/libmbfl/filters/mbfilter_base64.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ const mbfl_encoding mbfl_encoding_base64 = {
3939
NULL,
4040
MBFL_ENCTYPE_GL_UNSAFE,
4141
NULL,
42+
NULL,
43+
NULL,
4244
NULL
4345
};
4446

ext/mbstring/libmbfl/filters/mbfilter_big5.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,9 @@ const mbfl_encoding mbfl_encoding_big5 = {
6363
mblen_table_big5,
6464
MBFL_ENCTYPE_GL_UNSAFE,
6565
&vtbl_big5_wchar,
66-
&vtbl_wchar_big5
66+
&vtbl_wchar_big5,
67+
NULL,
68+
NULL
6769
};
6870

6971
const mbfl_encoding mbfl_encoding_cp950 = {
@@ -74,7 +76,9 @@ const mbfl_encoding mbfl_encoding_cp950 = {
7476
mblen_table_big5,
7577
MBFL_ENCTYPE_GL_UNSAFE,
7678
&vtbl_cp950_wchar,
77-
&vtbl_wchar_cp950
79+
&vtbl_wchar_cp950,
80+
NULL,
81+
NULL
7882
};
7983

8084
const struct mbfl_convert_vtbl vtbl_big5_wchar = {

ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,9 @@ const mbfl_encoding mbfl_encoding_cp50220 = {
5555
NULL,
5656
MBFL_ENCTYPE_GL_UNSAFE,
5757
&vtbl_cp50220_wchar,
58-
&vtbl_wchar_cp50220
58+
&vtbl_wchar_cp50220,
59+
NULL,
60+
NULL
5961
};
6062

6163
const mbfl_encoding mbfl_encoding_cp50221 = {
@@ -66,7 +68,9 @@ const mbfl_encoding mbfl_encoding_cp50221 = {
6668
NULL,
6769
MBFL_ENCTYPE_GL_UNSAFE,
6870
&vtbl_cp50221_wchar,
69-
&vtbl_wchar_cp50221
71+
&vtbl_wchar_cp50221,
72+
NULL,
73+
NULL
7074
};
7175

7276
const mbfl_encoding mbfl_encoding_cp50222 = {
@@ -77,7 +81,9 @@ const mbfl_encoding mbfl_encoding_cp50222 = {
7781
NULL,
7882
MBFL_ENCTYPE_GL_UNSAFE,
7983
&vtbl_cp50222_wchar,
80-
&vtbl_wchar_cp50222
84+
&vtbl_wchar_cp50222,
85+
NULL,
86+
NULL
8187
};
8288

8389
const struct mbfl_convert_vtbl vtbl_cp50220_wchar = {

ext/mbstring/libmbfl/filters/mbfilter_cp51932.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,9 @@ const mbfl_encoding mbfl_encoding_cp51932 = {
6565
mblen_table_eucjp,
6666
0,
6767
&vtbl_cp51932_wchar,
68-
&vtbl_wchar_cp51932
68+
&vtbl_wchar_cp51932,
69+
NULL,
70+
NULL
6971
};
7072

7173
const struct mbfl_convert_vtbl vtbl_cp51932_wchar = {

ext/mbstring/libmbfl/filters/mbfilter_cp932.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,9 @@ const mbfl_encoding mbfl_encoding_cp932 = {
6464
mblen_table_sjis,
6565
MBFL_ENCTYPE_GL_UNSAFE,
6666
&vtbl_cp932_wchar,
67-
&vtbl_wchar_cp932
67+
&vtbl_wchar_cp932,
68+
NULL,
69+
NULL
6870
};
6971

7072
const struct mbfl_convert_vtbl vtbl_cp932_wchar = {

ext/mbstring/libmbfl/filters/mbfilter_cp936.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,9 @@ const mbfl_encoding mbfl_encoding_cp936 = {
6363
mblen_table_cp936,
6464
MBFL_ENCTYPE_GL_UNSAFE,
6565
&vtbl_cp936_wchar,
66-
&vtbl_wchar_cp936
66+
&vtbl_wchar_cp936,
67+
NULL,
68+
NULL
6769
};
6870

6971
const struct mbfl_convert_vtbl vtbl_cp936_wchar = {

ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,9 @@ const mbfl_encoding mbfl_encoding_euc_cn = {
6363
mblen_table_euccn,
6464
0,
6565
&vtbl_euccn_wchar,
66-
&vtbl_wchar_euccn
66+
&vtbl_wchar_euccn,
67+
NULL,
68+
NULL
6769
};
6870

6971
const struct mbfl_convert_vtbl vtbl_euccn_wchar = {

ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c

Lines changed: 127 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@
3434
#include "unicode_table_jis.h"
3535

3636
static int mbfl_filt_conv_eucjp_wchar_flush(mbfl_convert_filter *filter);
37+
static size_t mb_eucjp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
38+
static void mb_wchar_to_eucjp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
3739

3840
const unsigned char mblen_table_eucjp[] = { /* 0xA1-0xFE */
3941
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -64,7 +66,9 @@ const mbfl_encoding mbfl_encoding_euc_jp = {
6466
mblen_table_eucjp,
6567
0,
6668
&vtbl_eucjp_wchar,
67-
&vtbl_wchar_eucjp
69+
&vtbl_wchar_eucjp,
70+
mb_eucjp_to_wchar,
71+
mb_wchar_to_eucjp
6872
};
6973

7074
const struct mbfl_convert_vtbl vtbl_eucjp_wchar = {
@@ -243,3 +247,125 @@ mbfl_filt_conv_wchar_eucjp(int c, mbfl_convert_filter *filter)
243247

244248
return 0;
245249
}
250+
251+
static size_t mb_eucjp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
252+
{
253+
unsigned char *p = *in, *e = p + *in_len;
254+
uint32_t *out = buf, *limit = buf + bufsize;
255+
256+
while (p < e && out < limit) {
257+
unsigned char c = *p++;
258+
259+
if (c < 0x80) {
260+
*out++ = c;
261+
} else if (c >= 0xA1 && c <= 0xFE && p < e) {
262+
/* JISX 0208 */
263+
unsigned char c2 = *p++;
264+
if (c2 >= 0xA1 && c2 <= 0xFE) {
265+
unsigned int s = (c - 0xA1)*94 + c2 - 0xA1;
266+
if (s < jisx0208_ucs_table_size) {
267+
uint32_t w = jisx0208_ucs_table[s];
268+
if (!w)
269+
w = MBFL_BAD_INPUT;
270+
*out++ = w;
271+
} else {
272+
*out++ = MBFL_BAD_INPUT;
273+
}
274+
} else {
275+
*out++ = MBFL_BAD_INPUT;
276+
}
277+
} else if (c == 0x8E && p < e) {
278+
/* Kana */
279+
unsigned char c2 = *p++;
280+
*out++ = (c2 >= 0xA1 && c2 <= 0xDF) ? 0xFEC0 + c2 : MBFL_BAD_INPUT;
281+
} else if (c == 0x8F) {
282+
/* JISX 0212 */
283+
if ((e - p) >= 2) {
284+
unsigned char c2 = *p++;
285+
unsigned char c3 = *p++;
286+
if (c3 >= 0xA1 && c3 <= 0xFE && c2 >= 0xA1 && c2 <= 0xFE) {
287+
unsigned int s = (c2 - 0xA1)*94 + c3 - 0xA1;
288+
if (s < jisx0212_ucs_table_size) {
289+
uint32_t w = jisx0212_ucs_table[s];
290+
if (!w)
291+
w = MBFL_BAD_INPUT;
292+
*out++ = w;
293+
} else {
294+
*out++ = MBFL_BAD_INPUT;
295+
}
296+
} else {
297+
*out++ = MBFL_BAD_INPUT;
298+
}
299+
} else {
300+
*out++ = MBFL_BAD_INPUT;
301+
p = e; /* Jump to end of string */
302+
}
303+
} else {
304+
*out++ = MBFL_BAD_INPUT;
305+
}
306+
}
307+
308+
*in_len = e - p;
309+
*in = p;
310+
return out - buf;
311+
}
312+
313+
/*
 * Encode `len` wchars from `in` as EUC-JP, appending the bytes to `buf`.
 *
 * Each wchar is first mapped to a JIS code via the ucs_*_jis tables (plus a
 * handful of hard-coded fallbacks), then written as one, two or three bytes
 * depending on the range the code falls in. Wchars with no mapping are handed
 * to MB_CONVERT_ERROR. `end` is not used by this function.
 *
 * Output space: two bytes per wchar are requested up front; only the
 * three-byte form requests more before writing. (Presumably
 * MB_CONVERT_BUF_ENSURE guarantees room for that many more bytes - confirm
 * against the macro definition.)
 */
static void mb_wchar_to_eucjp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
	unsigned char *out, *limit;
	MB_CONVERT_BUF_LOAD(buf, out, limit);
	MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

	while (len--) {
		uint32_t cp = *in++;
		unsigned int jis = 0;

		if (cp == 0xAF) { /* U+00AF is MACRON */
			jis = 0xA2B4; /* Use JIS X 0212 overline */
		} else if (cp >= ucs_a1_jis_table_min && cp < ucs_a1_jis_table_max) {
			jis = ucs_a1_jis_table[cp - ucs_a1_jis_table_min];
		} else if (cp >= ucs_a2_jis_table_min && cp < ucs_a2_jis_table_max) {
			jis = ucs_a2_jis_table[cp - ucs_a2_jis_table_min];
		} else if (cp >= ucs_i_jis_table_min && cp < ucs_i_jis_table_max) {
			jis = ucs_i_jis_table[cp - ucs_i_jis_table_min];
		} else if (cp >= ucs_r_jis_table_min && cp < ucs_r_jis_table_max) {
			jis = ucs_r_jis_table[cp - ucs_r_jis_table_min];
		}

		if (jis == 0) {
			/* The tables gave nothing; try the hard-coded fallbacks */
			switch (cp) {
			case 0xFF3C: /* FULLWIDTH REVERSE SOLIDUS */
				jis = 0x2140;
				break;
			case 0x2225: /* PARALLEL TO */
				jis = 0x2142;
				break;
			case 0xFF0D: /* FULLWIDTH HYPHEN-MINUS */
				jis = 0x215D;
				break;
			case 0xFFE0: /* FULLWIDTH CENT SIGN */
				jis = 0x2171;
				break;
			case 0xFFE1: /* FULLWIDTH POUND SIGN */
				jis = 0x2172;
				break;
			case 0xFFE2: /* FULLWIDTH NOT SIGN */
				jis = 0x224C;
				break;
			case 0:
				/* A zero wchar is emitted as a single zero byte */
				out = mb_convert_buf_add(out, 0);
				continue;
			default: {
				/* No mapping exists for this wchar */
				MB_CONVERT_ERROR(buf, out, limit, cp, mb_wchar_to_eucjp);
				/* Request space again for the wchars still to be converted */
				MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
				continue;
			}
			}
		}

		if (jis >= 0x8080) {
			/* Three-byte form: 0x8F prefix, then both bytes with the high bit set */
			unsigned int hi = ((jis >> 8) & 0xFF) | 0x80;
			unsigned int lo = (jis & 0xFF) | 0x80;
			/* One byte more than the two-per-wchar budget, so ask for it first */
			MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 3);
			out = mb_convert_buf_add3(out, 0x8F, hi, lo);
		} else if (jis >= 0x100) {
			/* Two-byte form: both bytes with the high bit set */
			unsigned int hi = ((jis >> 8) & 0xFF) | 0x80;
			unsigned int lo = (jis & 0xFF) | 0x80;
			out = mb_convert_buf_add2(out, hi, lo);
		} else if (jis >= 0x80) {
			/* Two-byte form: 0x8E prefix followed by the code itself */
			out = mb_convert_buf_add2(out, 0x8E, jis);
		} else {
			/* Single byte */
			out = mb_convert_buf_add(out, jis);
		}
	}

	MB_CONVERT_BUF_STORE(buf, out, limit);
}

ext/mbstring/libmbfl/filters/mbfilter_euc_jp_2004.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ const mbfl_encoding mbfl_encoding_eucjp2004 = {
4343
mblen_table_eucjp,
4444
0,
4545
&vtbl_eucjp2004_wchar,
46-
&vtbl_wchar_eucjp2004
46+
&vtbl_wchar_eucjp2004,
47+
NULL,
48+
NULL
4749
};
4850

4951
const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = {

ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,9 @@ const mbfl_encoding mbfl_encoding_eucjp_win = {
6565
mblen_table_eucjp,
6666
0,
6767
&vtbl_eucjpwin_wchar,
68-
&vtbl_wchar_eucjpwin
68+
&vtbl_wchar_eucjpwin,
69+
NULL,
70+
NULL
6971
};
7072

7173
const struct mbfl_convert_vtbl vtbl_eucjpwin_wchar = {

ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,9 @@ const mbfl_encoding mbfl_encoding_euc_kr = {
6262
mblen_table_euckr,
6363
0,
6464
&vtbl_euckr_wchar,
65-
&vtbl_wchar_euckr
65+
&vtbl_wchar_euckr,
66+
NULL,
67+
NULL
6668
};
6769

6870
const struct mbfl_convert_vtbl vtbl_euckr_wchar = {

ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,9 @@ const mbfl_encoding mbfl_encoding_euc_tw = {
6464
mblen_table_euctw,
6565
0,
6666
&vtbl_euctw_wchar,
67-
&vtbl_wchar_euctw
67+
&vtbl_wchar_euctw,
68+
NULL,
69+
NULL
6870
};
6971

7072
const struct mbfl_convert_vtbl vtbl_euctw_wchar = {

ext/mbstring/libmbfl/filters/mbfilter_gb18030.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,9 @@ const mbfl_encoding mbfl_encoding_gb18030 = {
4545
NULL,
4646
MBFL_ENCTYPE_GL_UNSAFE,
4747
&vtbl_gb18030_wchar,
48-
&vtbl_wchar_gb18030
48+
&vtbl_wchar_gb18030,
49+
NULL,
50+
NULL
4951
};
5052

5153
const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {

ext/mbstring/libmbfl/filters/mbfilter_htmlent.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,9 @@ const mbfl_encoding mbfl_encoding_html_ent = {
6161
NULL,
6262
MBFL_ENCTYPE_GL_UNSAFE,
6363
&vtbl_html_wchar,
64-
&vtbl_wchar_html
64+
&vtbl_wchar_html,
65+
NULL,
66+
NULL
6567
};
6668

6769
const struct mbfl_convert_vtbl vtbl_wchar_html = {

ext/mbstring/libmbfl/filters/mbfilter_hz.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ const mbfl_encoding mbfl_encoding_hz = {
4343
NULL,
4444
MBFL_ENCTYPE_GL_UNSAFE,
4545
&vtbl_hz_wchar,
46-
&vtbl_wchar_hz
46+
&vtbl_wchar_hz,
47+
NULL,
48+
NULL
4749
};
4850

4951
const struct mbfl_convert_vtbl vtbl_hz_wchar = {

ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@ const mbfl_encoding mbfl_encoding_2022jpms = {
4646
NULL,
4747
MBFL_ENCTYPE_GL_UNSAFE,
4848
&vtbl_2022jpms_wchar,
49-
&vtbl_wchar_2022jpms
49+
&vtbl_wchar_2022jpms,
50+
NULL,
51+
NULL
5052
};
5153

5254
const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = {

ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,9 @@ const mbfl_encoding mbfl_encoding_2022kr = {
4747
NULL,
4848
MBFL_ENCTYPE_GL_UNSAFE,
4949
&vtbl_2022kr_wchar,
50-
&vtbl_wchar_2022kr
50+
&vtbl_wchar_2022kr,
51+
NULL,
52+
NULL
5153
};
5254

5355
const struct mbfl_convert_vtbl vtbl_wchar_2022kr = {

ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_2004.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ const mbfl_encoding mbfl_encoding_2022jp_2004 = {
4343
NULL,
4444
MBFL_ENCTYPE_GL_UNSAFE,
4545
&vtbl_2022jp_2004_wchar,
46-
&vtbl_wchar_2022jp_2004
46+
&vtbl_wchar_2022jp_2004,
47+
NULL,
48+
NULL
4749
};
4850

4951
const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = {

ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,9 @@ const mbfl_encoding mbfl_encoding_2022jp_kddi = {
4848
NULL,
4949
MBFL_ENCTYPE_GL_UNSAFE,
5050
&vtbl_2022jp_kddi_wchar,
51-
&vtbl_wchar_2022jp_kddi
51+
&vtbl_wchar_2022jp_kddi,
52+
NULL,
53+
NULL
5254
};
5355

5456
const struct mbfl_convert_vtbl vtbl_2022jp_kddi_wchar = {

0 commit comments

Comments
 (0)