Skip to content

Commit edc6b75

Browse files
committed
Merge branch 'PHP-8.1'
* PHP-8.1: mb_convert_encoding will not auto-detect input string as UUEncode, Base64, QPrint
2 parents d4920f4 + f07c193 commit edc6b75

File tree

2 files changed

+29
-32
lines changed

2 files changed

+29
-32
lines changed

ext/mbstring/mbstring.c

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2501,6 +2501,23 @@ MBSTRING_API HashTable *php_mb_convert_encoding_recursive(HashTable *input, cons
25012501
}
25022502
/* }}} */
25032503

2504+
static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2505+
{
2506+
/* mbstring supports some 'text encodings' which aren't really text encodings
2507+
* at all, but really 'byte encodings', like Base64, QPrint, and so on.
2508+
* These should never be returned by `mb_detect_encoding`. */
2509+
int shift = 0;
2510+
for (int i = 0; i < *size; i++) {
2511+
const mbfl_encoding *encoding = elist[i];
2512+
if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2513+
shift++; /* Remove this encoding from the list */
2514+
} else if (shift) {
2515+
elist[i - shift] = encoding;
2516+
}
2517+
}
2518+
*size -= shift;
2519+
}
2520+
25042521
/* {{{ Returns converted string in desired encoding */
25052522
PHP_FUNCTION(mb_convert_encoding)
25062523
{
@@ -2541,6 +2558,10 @@ PHP_FUNCTION(mb_convert_encoding)
25412558
free_from_encodings = 0;
25422559
}
25432560

2561+
if (num_from_encodings > 1) {
2562+
remove_non_encodings_from_elist(from_encodings, &num_from_encodings);
2563+
}
2564+
25442565
if (!num_from_encodings) {
25452566
efree(ZEND_VOIDP(from_encodings));
25462567
zend_argument_value_error(3, "must specify at least one encoding");
@@ -2674,23 +2695,6 @@ PHP_FUNCTION(mb_strtolower)
26742695
}
26752696
/* }}} */
26762697

2677-
static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
2678-
{
2679-
/* mbstring supports some 'text encodings' which aren't really text encodings
2680-
* at all, but really 'byte encodings', like Base64, QPrint, and so on.
2681-
* These should never be returned by `mb_detect_encoding`. */
2682-
int shift = 0;
2683-
for (int i = 0; i < *size; i++) {
2684-
const mbfl_encoding *encoding = elist[i];
2685-
if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
2686-
shift++; /* Remove this encoding from the list */
2687-
} else if (shift) {
2688-
elist[i - shift] = encoding;
2689-
}
2690-
}
2691-
*size -= shift;
2692-
}
2693-
26942698
/* {{{ Encodings of the given string is returned (as a string) */
26952699
PHP_FUNCTION(mb_detect_encoding)
26962700
{

ext/mbstring/tests/mb_convert_encoding.phpt

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,33 +9,24 @@ mbstring.language=Japanese
99
<?php
1010
// TODO: Add more tests
1111

12-
// SJIS string (BASE64 encoded)
1312
$sjis = base64_decode('k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg==');
14-
// JIS string (BASE64 encoded)
1513
$jis = base64_decode('GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg==');
16-
// EUC-JP string
1714
$euc_jp = "\xC6\xFC\xCB\xDC\xB8\xEC\xA5\xC6\xA5\xAD\xA5\xB9\xA5\xC8\xA4\xC7\xA4\xB9\xA1\xA301234\xA3\xB5\xA3\xB6\xA3\xB7\xA3\xB8\xA3\xB9\xA1\xA3";
1815

1916
// Test with single "from encoding"
20-
// Note: For some reason it complains, results are different. Not researched.
2117
echo "== BASIC TEST ==\n";
22-
$s = $sjis;
23-
$s = bin2hex(mb_convert_encoding($s, 'EUC-JP', 'SJIS'));
18+
$s = bin2hex(mb_convert_encoding($sjis, 'EUC-JP', 'SJIS'));
2419
print("EUC-JP: $s\n"); // EUC-JP
2520

26-
$s = $jis;
27-
$s = bin2hex(mb_convert_encoding($s, 'EUC-JP', 'JIS'));
21+
$s = bin2hex(mb_convert_encoding($jis, 'EUC-JP', 'JIS'));
2822
print("EUC-JP: $s\n"); // EUC-JP
2923

30-
$s = $euc_jp;
31-
$s = mb_convert_encoding($s, 'SJIS', 'EUC-JP');
24+
$s = mb_convert_encoding($euc_jp, 'SJIS', 'EUC-JP');
3225
print("SJIS: ".base64_encode($s)."\n"); // SJIS
3326

34-
$s = $euc_jp;
35-
$s = mb_convert_encoding($s, 'JIS', 'EUC-JP');
27+
$s = mb_convert_encoding($euc_jp, 'JIS', 'EUC-JP');
3628
print("JIS: ".base64_encode($s)."\n"); // JIS
3729

38-
3930
// Using Encoding List String
4031
echo "== STRING ENCODING LIST ==\n";
4132

@@ -52,11 +43,10 @@ $s = $euc_jp;
5243
$s = mb_convert_encoding($s, 'JIS', $a);
5344
print("JIS: ".base64_encode($s)."\n"); // JIS
5445

55-
5646
// Using Encoding List Array
5747
echo "== ARRAY ENCODING LIST ==\n";
5848

59-
$a = array(0=>'JIS', 1=>'UTF-8', 2=>'EUC-JP', 3=>'SJIS');
49+
$a = ['JIS', 'UTF-8', 'EUC-JP', 'SJIS'];
6050
$s = $jis;
6151
$s = bin2hex(mb_convert_encoding($s, 'EUC-JP', $a));
6252
print("EUC-JP: $s\n"); // EUC-JP
@@ -69,6 +59,8 @@ $s = $euc_jp;
6959
$s = mb_convert_encoding($s, 'JIS', $a);
7060
print("JIS: ".base64_encode($s)."\n"); // JIS
7161

62+
// Regression test for bug #81676
63+
echo "UTF-8: " . mb_convert_encoding('test', 'UTF-8', mb_list_encodings()), "\n";
7264

7365
// Using Detect Order
7466
echo "== DETECT ORDER ==\n";
@@ -122,6 +114,7 @@ JIS: GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg==
122114
EUC-JP: c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
123115
SJIS: k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg==
124116
JIS: GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg==
117+
UTF-8: test
125118
== DETECT ORDER ==
126119
EUC-JP: c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
127120
SJIS: k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg==

0 commit comments

Comments
 (0)