Skip to content

Commit 6e39b76

Browse files
VemundHwjakob
authored and committed
Add C++20 char8_t/u8string support (#2026)
* Fix test build in C++20 * Add C++20 char8_t/u8string support
1 parent 37d04ab commit 6e39b76

File tree

3 files changed

+72
-5
lines changed

3 files changed

+72
-5
lines changed

include/pybind11/cast.h

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232
#include <string_view>
3333
#endif
3434

35+
#if defined(__cpp_lib_char8_t) && __cpp_lib_char8_t >= 201811L
36+
# define PYBIND11_HAS_U8STRING
37+
#endif
38+
3539
NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
3640
NAMESPACE_BEGIN(detail)
3741

@@ -988,6 +992,9 @@ template <typename type> class type_caster<std::reference_wrapper<type>> {
988992

989993
template <typename CharT> using is_std_char_type = any_of<
990994
std::is_same<CharT, char>, /* std::string */
995+
#if defined(PYBIND11_HAS_U8STRING)
996+
std::is_same<CharT, char8_t>, /* std::u8string */
997+
#endif
991998
std::is_same<CharT, char16_t>, /* std::u16string */
992999
std::is_same<CharT, char32_t>, /* std::u32string */
9931000
std::is_same<CharT, wchar_t> /* std::wstring */
@@ -1191,6 +1198,9 @@ template <typename StringType, bool IsView = false> struct string_caster {
11911198
// Simplify life by being able to assume standard char sizes (the standard only guarantees
11921199
// minimums, but Python requires exact sizes)
11931200
static_assert(!std::is_same<CharT, char>::value || sizeof(CharT) == 1, "Unsupported char size != 1");
1201+
#if defined(PYBIND11_HAS_U8STRING)
1202+
static_assert(!std::is_same<CharT, char8_t>::value || sizeof(CharT) == 1, "Unsupported char8_t size != 1");
1203+
#endif
11941204
static_assert(!std::is_same<CharT, char16_t>::value || sizeof(CharT) == 2, "Unsupported char16_t size != 2");
11951205
static_assert(!std::is_same<CharT, char32_t>::value || sizeof(CharT) == 4, "Unsupported char32_t size != 4");
11961206
// wchar_t can be either 16 bits (Windows) or 32 (everywhere else)
@@ -1209,7 +1219,7 @@ template <typename StringType, bool IsView = false> struct string_caster {
12091219
#if PY_MAJOR_VERSION >= 3
12101220
return load_bytes(load_src);
12111221
#else
1212-
if (sizeof(CharT) == 1) {
1222+
if (std::is_same<CharT, char>::value) {
12131223
return load_bytes(load_src);
12141224
}
12151225

@@ -1269,7 +1279,7 @@ template <typename StringType, bool IsView = false> struct string_caster {
12691279
// without any encoding/decoding attempt). For other C++ char sizes this is a no-op.
12701280
// which supports loading a unicode from a str, doesn't take this path.
12711281
template <typename C = CharT>
1272-
bool load_bytes(enable_if_t<sizeof(C) == 1, handle> src) {
1282+
bool load_bytes(enable_if_t<std::is_same<C, char>::value, handle> src) {
12731283
if (PYBIND11_BYTES_CHECK(src.ptr())) {
12741284
// We were passed a Python 3 raw bytes; accept it into a std::string or char*
12751285
// without any encoding attempt.
@@ -1284,7 +1294,7 @@ template <typename StringType, bool IsView = false> struct string_caster {
12841294
}
12851295

12861296
template <typename C = CharT>
1287-
bool load_bytes(enable_if_t<sizeof(C) != 1, handle>) { return false; }
1297+
bool load_bytes(enable_if_t<!std::is_same<C, char>::value, handle>) { return false; }
12881298
};
12891299

12901300
template <typename CharT, class Traits, class Allocator>

tests/test_builtin_casters.cpp

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ TEST_SUBMODULE(builtin_casters, m) {
3030
else { wstr.push_back((wchar_t) mathbfA32); } // 𝐀, utf32
3131
wstr.push_back(0x7a); // z
3232

33-
m.def("good_utf8_string", []() { return std::string(u8"Say utf8\u203d \U0001f382 \U0001d400"); }); // Say utf8‽ 🎂 𝐀
33+
m.def("good_utf8_string", []() { return std::string((const char*)u8"Say utf8\u203d \U0001f382 \U0001d400"); }); // Say utf8‽ 🎂 𝐀
3434
m.def("good_utf16_string", [=]() { return std::u16string({ b16, ib16, cake16_1, cake16_2, mathbfA16_1, mathbfA16_2, z16 }); }); // b‽🎂𝐀z
3535
m.def("good_utf32_string", [=]() { return std::u32string({ a32, mathbfA32, cake32, ib32, z32 }); }); // a𝐀🎂‽z
3636
m.def("good_wchar_string", [=]() { return wstr; }); // a‽𝐀z
@@ -60,6 +60,18 @@ TEST_SUBMODULE(builtin_casters, m) {
6060
m.def("strlen", [](char *s) { return strlen(s); });
6161
m.def("string_length", [](std::string s) { return s.length(); });
6262

63+
#ifdef PYBIND11_HAS_U8STRING
64+
m.attr("has_u8string") = true;
65+
m.def("good_utf8_u8string", []() { return std::u8string(u8"Say utf8\u203d \U0001f382 \U0001d400"); }); // Say utf8‽ 🎂 𝐀
66+
m.def("bad_utf8_u8string", []() { return std::u8string((const char8_t*)"abc\xd0" "def"); });
67+
68+
m.def("u8_char8_Z", []() -> char8_t { return u8'Z'; });
69+
70+
// test_single_char_arguments
71+
m.def("ord_char8", [](char8_t c) -> int { return static_cast<unsigned char>(c); });
72+
m.def("ord_char8_lv", [](char8_t &c) -> int { return static_cast<unsigned char>(c); });
73+
#endif
74+
6375
// test_string_view
6476
#ifdef PYBIND11_HAS_STRING_VIEW
6577
m.attr("has_string_view") = true;
@@ -69,9 +81,15 @@ TEST_SUBMODULE(builtin_casters, m) {
6981
m.def("string_view_chars", [](std::string_view s) { py::list l; for (auto c : s) l.append((std::uint8_t) c); return l; });
7082
m.def("string_view16_chars", [](std::u16string_view s) { py::list l; for (auto c : s) l.append((int) c); return l; });
7183
m.def("string_view32_chars", [](std::u32string_view s) { py::list l; for (auto c : s) l.append((int) c); return l; });
72-
m.def("string_view_return", []() { return std::string_view(u8"utf8 secret \U0001f382"); });
84+
m.def("string_view_return", []() { return std::string_view((const char*)u8"utf8 secret \U0001f382"); });
7385
m.def("string_view16_return", []() { return std::u16string_view(u"utf16 secret \U0001f382"); });
7486
m.def("string_view32_return", []() { return std::u32string_view(U"utf32 secret \U0001f382"); });
87+
88+
# ifdef PYBIND11_HAS_U8STRING
89+
m.def("string_view8_print", [](std::u8string_view s) { py::print(s, s.size()); });
90+
m.def("string_view8_chars", [](std::u8string_view s) { py::list l; for (auto c : s) l.append((std::uint8_t) c); return l; });
91+
m.def("string_view8_return", []() { return std::u8string_view(u8"utf8 secret \U0001f382"); });
92+
# endif
7593
#endif
7694

7795
// test_integer_casting

tests/test_builtin_casters.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ def test_unicode_conversion():
1515
assert m.good_utf16_string() == u"bβ€½πŸŽ‚π€z"
1616
assert m.good_utf32_string() == u"aπ€πŸŽ‚β€½z"
1717
assert m.good_wchar_string() == u"aβΈ˜π€z"
18+
if hasattr(m, "has_u8string"):
19+
assert m.good_utf8_u8string() == u"Say utf8β€½ πŸŽ‚ 𝐀"
1820

1921
with pytest.raises(UnicodeDecodeError):
2022
m.bad_utf8_string()
@@ -29,12 +31,17 @@ def test_unicode_conversion():
2931
if hasattr(m, "bad_wchar_string"):
3032
with pytest.raises(UnicodeDecodeError):
3133
m.bad_wchar_string()
34+
if hasattr(m, "has_u8string"):
35+
with pytest.raises(UnicodeDecodeError):
36+
m.bad_utf8_u8string()
3237

3338
assert m.u8_Z() == 'Z'
3439
assert m.u8_eacute() == u'Γ©'
3540
assert m.u16_ibang() == u'β€½'
3641
assert m.u32_mathbfA() == u'𝐀'
3742
assert m.wchar_heart() == u'β™₯'
43+
if hasattr(m, "has_u8string"):
44+
assert m.u8_char8_Z() == 'Z'
3845

3946

4047
def test_single_char_arguments():
@@ -92,6 +99,17 @@ def toobig_message(r):
9299
assert m.ord_wchar(u'aa')
93100
assert str(excinfo.value) == toolong_message
94101

102+
if hasattr(m, "has_u8string"):
103+
assert m.ord_char8(u'a') == 0x61 # simple ASCII
104+
assert m.ord_char8_lv(u'b') == 0x62
105+
assert m.ord_char8(u'Γ©') == 0xE9 # requires 2 bytes in utf-8, but can be stuffed in a char
106+
with pytest.raises(ValueError) as excinfo:
107+
assert m.ord_char8(u'Δ€') == 0x100 # requires 2 bytes, doesn't fit in a char
108+
assert str(excinfo.value) == toobig_message(0x100)
109+
with pytest.raises(ValueError) as excinfo:
110+
assert m.ord_char8(u'ab')
111+
assert str(excinfo.value) == toolong_message
112+
95113

96114
def test_bytes_to_string():
97115
"""Tests the ability to pass bytes to C++ string-accepting functions. Note that this is
@@ -116,10 +134,15 @@ def test_string_view(capture):
116134
assert m.string_view_chars("Hi πŸŽ‚") == [72, 105, 32, 0xf0, 0x9f, 0x8e, 0x82]
117135
assert m.string_view16_chars("Hi πŸŽ‚") == [72, 105, 32, 0xd83c, 0xdf82]
118136
assert m.string_view32_chars("Hi πŸŽ‚") == [72, 105, 32, 127874]
137+
if hasattr(m, "has_u8string"):
138+
assert m.string_view8_chars("Hi") == [72, 105]
139+
assert m.string_view8_chars("Hi πŸŽ‚") == [72, 105, 32, 0xf0, 0x9f, 0x8e, 0x82]
119140

120141
assert m.string_view_return() == "utf8 secret πŸŽ‚"
121142
assert m.string_view16_return() == "utf16 secret πŸŽ‚"
122143
assert m.string_view32_return() == "utf32 secret πŸŽ‚"
144+
if hasattr(m, "has_u8string"):
145+
assert m.string_view8_return() == "utf8 secret πŸŽ‚"
123146

124147
with capture:
125148
m.string_view_print("Hi")
@@ -132,6 +155,14 @@ def test_string_view(capture):
132155
utf16 πŸŽ‚ 8
133156
utf32 πŸŽ‚ 7
134157
"""
158+
if hasattr(m, "has_u8string"):
159+
with capture:
160+
m.string_view8_print("Hi")
161+
m.string_view8_print("utf8 πŸŽ‚")
162+
assert capture == """
163+
Hi 2
164+
utf8 πŸŽ‚ 9
165+
"""
135166

136167
with capture:
137168
m.string_view_print("Hi, ascii")
@@ -144,6 +175,14 @@ def test_string_view(capture):
144175
Hi, utf16 πŸŽ‚ 12
145176
Hi, utf32 πŸŽ‚ 11
146177
"""
178+
if hasattr(m, "has_u8string"):
179+
with capture:
180+
m.string_view8_print("Hi, ascii")
181+
m.string_view8_print("Hi, utf8 πŸŽ‚")
182+
assert capture == """
183+
Hi, ascii 9
184+
Hi, utf8 πŸŽ‚ 13
185+
"""
147186

148187

149188
def test_integer_casting():

0 commit comments

Comments
(0)