Skip to content

[libc] Wchar Stringconverter #146388

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions libc/src/__support/wchar/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,19 @@ add_header_library(
libc.hdr.types.char32_t
)

add_header_library(
string_converter
HDRS
string_converter.h
DEPENDS
libc.hdr.types.char8_t
libc.hdr.types.char32_t
libc.hdr.types.size_t
libc.src.__support.error_or
.mbstate
.character_converter
)

add_object_library(
character_converter
HDRS
Expand All @@ -16,6 +29,7 @@ add_object_library(
libc.hdr.errno_macros
libc.hdr.types.char8_t
libc.hdr.types.char32_t
libc.hdr.types.size_t
libc.src.__support.error_or
libc.src.__support.math_extras
.mbstate
Expand Down
11 changes: 11 additions & 0 deletions libc/src/__support/wchar/character_converter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "hdr/errno_macros.h"
#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
#include "hdr/types/size_t.h"
#include "src/__support/CPP/bit.h"
#include "src/__support/common.h"
#include "src/__support/error_or.h"
Expand Down Expand Up @@ -92,6 +93,7 @@ int CharacterConverter::push(char8_t utf8_byte) {
state->bytes_stored++;
return 0;
}

// Invalid byte -> reset the state
clear();
return EILSEQ;
Expand Down Expand Up @@ -130,6 +132,12 @@ ErrorOr<char32_t> CharacterConverter::pop_utf32() {
return utf32;
}

size_t CharacterConverter::sizeAsUTF32() {
return 1; // a single utf-32 value can fit an entire character
}

size_t CharacterConverter::sizeAsUTF8() { return state->total_bytes; }

ErrorOr<char8_t> CharacterConverter::pop_utf8() {
if (isEmpty())
return Error(-1);
Expand All @@ -156,6 +164,9 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
}

state->bytes_stored--;
if (state->bytes_stored == 0)
clear();

return static_cast<char8_t>(output);
}

Expand Down
4 changes: 4 additions & 0 deletions libc/src/__support/wchar/character_converter.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
#include "hdr/types/size_t.h"
#include "src/__support/common.h"
#include "src/__support/error_or.h"
#include "src/__support/wchar/mbstate.h"
Expand All @@ -30,6 +31,9 @@ class CharacterConverter {
bool isEmpty();
bool isValidState();

size_t sizeAsUTF32();
size_t sizeAsUTF8();

int push(char8_t utf8_byte);
int push(char32_t utf32);

Expand Down
119 changes: 119 additions & 0 deletions libc/src/__support/wchar/string_converter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
//===-- Definition of a class for mbstate_t and conversion -----*-- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
#define LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H

#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
#include "hdr/types/size_t.h"
#include "src/__support/common.h"
#include "src/__support/error_or.h"
#include "src/__support/wchar/character_converter.h"
#include "src/__support/wchar/mbstate.h"

namespace LIBC_NAMESPACE_DECL {
namespace internal {

template <typename T> class StringConverter {
private:
CharacterConverter cr;
const T *src;
size_t src_len;
size_t src_idx;

// # of src elements pushed to cr needed to represent the current character
size_t num_pushed;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

num_pushed is effectively a second return from pushFullCharacter. It should be returned as part of a struct and not stored.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do still need to store num_pushed as the call to popUTFx() that actually pushes the full character might not be the same call that updates the src_idx

Ex for a 4 byte character:

str_conv.pop_utf8() // pushes the full utf-32 character
str_conv.pop_utf8()
str_conv.pop_utf8()
str_conv.pop_utf8() // updates src index

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is it necessary to update the src index in a different call to pop than the one that pushes? To me it makes more sense to increment the src index to represent that a character has been pushed and the rest of it is in the mbstate.


// # of pops we are allowed to perform (essentially size of the dest buffer)
size_t num_to_write;

int pushFullCharacter() {
for (num_pushed = 0; !cr.isFull() && src_idx + num_pushed < src_len;
++num_pushed) {
int err = cr.push(src[src_idx + num_pushed]);
if (err != 0)
return err;
}

// if we aren't able to read a full character from the source string
if (src_idx + num_pushed == src_len && !cr.isFull()) {
src_idx += num_pushed;
return -1;
}

return 0;
}

public:
StringConverter(const T *s, size_t srclen, size_t dstlen, mbstate *ps)
: cr(ps), src(s), src_len(srclen), src_idx(0), num_pushed(0),
num_to_write(dstlen) {
pushFullCharacter();
}

StringConverter(const T *s, size_t dstlen, mbstate *ps)
: StringConverter(s, SIZE_MAX, dstlen, ps) {}
Comment on lines +54 to +61
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would it be better to move srclen to the end and make it a parameter with a default value instead of this?


// TODO: following functions are almost identical
// look into templating CharacterConverter pop functions
ErrorOr<char32_t> popUTF32() {
if (cr.isEmpty()) {
int err = pushFullCharacter();
if (err != 0)
return Error(err);

if (cr.sizeAsUTF32() > num_to_write) {
cr.clear();
return Error(-1);
}
}

auto out = cr.pop_utf32();
if (cr.isEmpty())
src_idx += num_pushed;

if (out.has_value() && out.value() == L'\0')
src_len = src_idx;

num_to_write--;

return out;
}

ErrorOr<char8_t> popUTF8() {
if (cr.isEmpty()) {
int err = pushFullCharacter();
if (err != 0)
return Error(err);

if (cr.sizeAsUTF8() > num_to_write) {
cr.clear();
return Error(-1);
}
}

auto out = cr.pop_utf8();
if (cr.isEmpty())
src_idx += num_pushed;

if (out.has_value() && out.value() == '\0')
src_len = src_idx;

num_to_write--;

return out;
}

size_t getSourceIndex() { return src_idx; }
};

} // namespace internal
} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_SRC___SUPPORT_STRING_CONVERTER_H
14 changes: 14 additions & 0 deletions libc/test/src/__support/wchar/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,17 @@ add_libc_test(
DEPENDS
libc.src.__support.wchar.character_converter
)

add_libc_test(
string_converter_test.cpp
SUITE
libc-support-tests
SRCS
string_converter_test.cpp
DEPENDS
libc.src.__support.wchar.string_converter
libc.src.__support.wchar.mbstate
libc.src.__support.error_or
libc.hdr.errno_macros
libc.hdr.types.char32_t
)
Loading
Loading