From e84a200053d52284079732606c329390e83f45f3 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 17:10:19 -0500 Subject: [PATCH 01/38] first --- pandas/core/strings/object_array.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 0268194e64d50..d6ae22a5aafdb 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -533,4 +533,5 @@ def f(x): else: return empty_row - return [f(val) for val in np.asarray(self)] + return [f(val) for val in np.asarray(self)] +## making a change that changes nothing to make sure I can push correctly From b59556f7b92a7662199593578d31b2e170c3050f Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 17:27:18 -0500 Subject: [PATCH 02/38] second --- .github/workflows/code-checks.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index e1d2d1ea846b8..f1bdab700d4ca 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -4,6 +4,7 @@ on: push: branches: - main + - ascii_edits - 2.3.x pull_request: branches: From f05bfb7d83ec54c0dd7298a34b8c24f134c3b0d9 Mon Sep 17 00:00:00 2001 From: avecasey <102306692+avecasey@users.noreply.github.com> Date: Sun, 8 Dec 2024 18:18:53 -0500 Subject: [PATCH 03/38] Update object_array.py --- pandas/core/strings/object_array.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index d6ae22a5aafdb..edf3a6529f5db 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -534,4 +534,3 @@ def f(x): return empty_row return [f(val) for val in np.asarray(self)] -## making a change that changes nothing to make sure I can push correctly From 74f558981ca18f833fde9f41176adac9146a7b73 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 18:40:20 -0500 Subject: [PATCH 04/38] third --- pandas/core/strings/object_array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index d6ae22a5aafdb..96afd573467c3 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -533,5 +533,5 @@ def f(x): else: return empty_row - return [f(val) for val in np.asarray(self)] -## making a change that changes nothing to make sure I can push correctly + return [f(val) for val in np.asarray(self)] + \ No newline at end of file From 17b9a1a8cf8aafa4a730d5c14565f7bdad8f9d72 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 20:30:14 -0500 Subject: [PATCH 05/38] ascii --- pandas/core/strings/accessor.py | 45 ++++++++++++++++++++++- pandas/core/strings/base.py | 4 ++ pandas/core/strings/object_array.py | 3 ++ pandas/tests/strings/conftest.py | 1 + pandas/tests/strings/test_string_array.py | 1 + pandas/tests/strings/test_strings.py | 2 + 6 files changed, 55 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 05e1a36877e06..5bfe581020b2b 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3385,7 +3385,7 @@ def len(self): # cases: # upper, lower, title, capitalize, swapcase, casefold # boolean: - # isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle + # isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle isascii # _doc_args holds dict of strings to use in substituting casemethod docs _doc_args: dict[str, dict[str, str]] = {} _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""} @@ -3465,6 +3465,7 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3488,6 +3489,7 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3514,6 +3516,7 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3546,6 +3549,7 @@ def casefold(self): Series.str.isdigit : Check whether all characters are digits. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3571,6 +3575,7 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3597,6 +3602,7 @@ def casefold(self): Series.str.isdigit : Check whether all characters are digits. Series.str.isdecimal : Check whether all characters are decimal. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3619,6 +3625,7 @@ def casefold(self): Series.str.isdigit : Check whether all characters are digits. Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. @@ -3644,6 +3651,7 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.istitle : Check whether all characters are titlecase. Examples @@ -3667,6 +3675,7 @@ def casefold(self): Series.str.isdecimal : Check whether all characters are decimal. Series.str.isspace : Check whether all characters are whitespace. Series.str.islower : Check whether all characters are lowercase. + Series.str.isascii : Check whether all characters are ascii. Series.str.isupper : Check whether all characters are uppercase. Examples @@ -3684,11 +3693,40 @@ def casefold(self): 3 False dtype: bool """ + _shared_docs["isascii"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.istitle : Check whether all characters are titlecase. + Series.str.isupper : Check whether all characters are uppercase. + + Examples + ------------ + The ``s5.str.isascii`` method checks for whether all characters are ascii characters, + which includes digits 0-9, capital and lowercase letters A-Z, and some other + special characters. + + >>> s5 = pd.Series(['ö', 'see123', 'hello world', '']) + >>> s5.str.isascii() + 0 False + 1 True + 2 True + 3 True + dtype: bool + """ + _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"} _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"} _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"} _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"} _doc_args["islower"] = {"type": "lowercase", "method": "islower"} + _doc_args["isascii"] = {"type": "ascii", "method": "isascii"} _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"} _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"} _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"} @@ -3720,6 +3758,11 @@ def casefold(self): docstring=_shared_docs["ismethods"] % _doc_args["islower"] + _shared_docs["islower"], ) + isascii = _map_and_wrap( + "isascii", + docstring=_shared_docs["ismethods"] % _doc_args["isascii"] + + _shared_docs["isascii"], + ) isupper = _map_and_wrap( "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"] diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 4ed36f85167c9..78c4f3acbe1aa 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -179,6 +179,10 @@ def _str_isalnum(self): def _str_isalpha(self): pass + @abc.abstractmethod + def _str_isascii(self): + pass + @abc.abstractmethod def _str_isdecimal(self): pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 0268194e64d50..2451d2fd0bb83 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -454,6 +454,9 @@ def _str_isalnum(self): def _str_isalpha(self): return self._str_map(str.isalpha, dtype="bool") + + def _str_isascii(self): + return self._str_map(str.isascii, dtype="bool") def _str_isdecimal(self): return self._str_map(str.isdecimal, dtype="bool") diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py index 92b7b16da3c1f..5bcbb16da3be9 100644 --- a/pandas/tests/strings/conftest.py +++ b/pandas/tests/strings/conftest.py @@ -68,6 +68,7 @@ "get_dummies", "isalnum", "isalpha", + "isascii", "isdecimal", "isdigit", "islower", diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index cd3c512328139..c5414022e664b 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -83,6 +83,7 @@ def test_string_array_numeric_integer_array(nullable_string_dtype, method, expec [ ("isdigit", [False, None, True]), ("isalpha", [True, None, False]), + ("isascii", [True, None, True]), ("isalnum", [True, None, True]), ("isnumeric", [False, None, True]), ], diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 75a2007b61640..0598e5f80e6d6 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -159,6 +159,7 @@ def test_empty_str_methods(any_string_dtype): # ismethods should always return boolean (GH 29624) tm.assert_series_equal(empty_bool, empty.str.isalnum()) tm.assert_series_equal(empty_bool, empty.str.isalpha()) + tm.assert_series_equal(empty_bool, empty.str.isascii()) tm.assert_series_equal(empty_bool, empty.str.isdigit()) tm.assert_series_equal(empty_bool, empty.str.isspace()) tm.assert_series_equal(empty_bool, empty.str.islower()) @@ -177,6 +178,7 @@ def test_empty_str_methods(any_string_dtype): @pytest.mark.parametrize( "method, expected", [ + ("isascii", [True, True, True, True, True, True, True, True, True, True]), ("isalnum", [True, True, True, True, True, False, True, True, False, False]), ("isalpha", [True, True, True, False, False, False, True, False, False, False]), ( From 61f41947977aee9d0b72b525058b4cd5da664150 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 20:42:14 -0500 Subject: [PATCH 06/38] ascii2 --- pandas/core/arrays/_arrow_string_mixins.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 2d1b1eca55e98..0d326a7dbb900 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -252,6 +252,10 @@ def _str_isalnum(self): def _str_isalpha(self): result = pc.utf8_is_alpha(self._pa_array) return self._convert_bool_result(result) + + def _str_isascii(self): + result = pc.utf8_is_ascii(self._pa_array) + return self._convert_bool_result(result) def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) From b430fd4c6dece5f6cb22aacc64b30c68fd044202 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 20:48:47 -0500 Subject: [PATCH 07/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 0d326a7dbb900..d55d2dde2b822 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -254,7 +254,7 @@ def _str_isalpha(self): return self._convert_bool_result(result) def _str_isascii(self): - result = pc.utf8_is_ascii(self._pa_array) + result = pc.ascii_is_ascii(self._pa_array) return self._convert_bool_result(result) def _str_isdecimal(self): From a15fb4ce5be91f1f1e1a9a17fdce843fdd1d2838 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 20:53:36 -0500 Subject: [PATCH 08/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index d55d2dde2b822..25fdeb8f9fd25 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -256,6 +256,17 @@ def _str_isalpha(self): def _str_isascii(self): result = pc.ascii_is_ascii(self._pa_array) return self._convert_bool_result(result) + + def _str_isascii(self): + if hasattr(pa.compute, "ascii_is_ascii"): + # If PyArrow adds support in the future, use it + return pa.compute.ascii_is_ascii(self._data) + else: + # Fallback: Use Python's native str.isascii() on each element + return [ + s.isascii() if isinstance(s, str) else None + for s in self.to_pylist() + ] def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) From 6819ae0c391a46dec1d67ca0fdb601bc0c797560 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 20:55:30 -0500 Subject: [PATCH 09/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 25fdeb8f9fd25..1c82ee5e7747b 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -257,16 +257,21 @@ def _str_isascii(self): result = pc.ascii_is_ascii(self._pa_array) return self._convert_bool_result(result) - def _str_isascii(self): + def _str_isascii(self): if hasattr(pa.compute, "ascii_is_ascii"): - # If PyArrow adds support in the future, use it + # Use PyArrow's implementation if available return pa.compute.ascii_is_ascii(self._data) else: - # Fallback: Use Python's native str.isascii() on each element - return [ - s.isascii() if isinstance(s, str) else None - for s in self.to_pylist() - ] + # Fallback: Convert chunks to Python lists and apply str.isascii() + results = [] + for chunk in self._data.chunks: # Access individual PyArrow array chunks + results.extend( + [ + s.isascii() if isinstance(s, str) else None + for s in chunk.to_pylist() + ] + ) + return results def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) From 0120d8e0e1acc41d09b8d0eaa51f56359d2698c7 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 21:02:40 -0500 Subject: [PATCH 10/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 38 ++++++++++++++-------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 1c82ee5e7747b..6d7cc426abb57 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -257,21 +257,31 @@ def _str_isascii(self): result = pc.ascii_is_ascii(self._pa_array) return self._convert_bool_result(result) - def _str_isascii(self): - if hasattr(pa.compute, "ascii_is_ascii"): - # Use PyArrow's implementation if available - return pa.compute.ascii_is_ascii(self._data) + # def _str_isascii(self): + # if hasattr(pa.compute, "ascii_is_ascii"): + # # If PyArrow adds support in the future, use it + # return pa.compute.ascii_is_ascii(self._data) + # else: + # # Fallback: Use Python's native str.isascii() on each element + # return [ + # s.isascii() if isinstance(s, str) else None + # for s in self.to_pylist() + # ] + + def _str_isascii(self): + + # Use a hypothetical `pc.utf8_is_ascii` if PyArrow has implemented it + if hasattr(pc, "utf8_is_ascii"): + result = pc.utf8_is_ascii(self._pa_array) else: - # Fallback: Convert chunks to Python lists and apply str.isascii() - results = [] - for chunk in self._data.chunks: # Access individual PyArrow array chunks - results.extend( - [ - s.isascii() if isinstance(s, str) else None - for s in chunk.to_pylist() - ] - ) - return results + # Fallback: Apply Python's `str.isascii` manually and convert to PyArrow + pylist = [ + s.isascii() if isinstance(s, str) else None + for s in self._pa_array.to_pylist() + ] + + result = pa.array(pylist, type=pa.bool_()) + return self._convert_bool_result(result) def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) From 6bf826c5c395850f6fca4006a92a672f6a335be1 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 21:04:13 -0500 Subject: [PATCH 11/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 6d7cc426abb57..ab35d00d55fe9 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -280,7 +280,7 @@ def _str_isascii(self): for s in self._pa_array.to_pylist() ] - result = pa.array(pylist, type=pa.bool_()) + result = pa.array(pylist, type=pa.bool_(), from_pandas=True) return self._convert_bool_result(result) def _str_isdecimal(self): From f9eb9e16cb17caa1045bfc517bd8e730f38fa166 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 21:23:37 -0500 Subject: [PATCH 12/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index ab35d00d55fe9..6d7cc426abb57 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -280,7 +280,7 @@ def _str_isascii(self): for s in self._pa_array.to_pylist() ] - result = pa.array(pylist, type=pa.bool_(), from_pandas=True) + result = pa.array(pylist, type=pa.bool_()) return self._convert_bool_result(result) def _str_isdecimal(self): From 8a7c9e27e342d54166c31f00746ba6774f5115e8 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 21:47:44 -0500 Subject: [PATCH 13/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 6d7cc426abb57..3c208058ea3e1 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -276,7 +276,8 @@ def _str_isascii(self): else: # Fallback: Apply Python's `str.isascii` manually and convert to PyArrow pylist = [ - s.isascii() if isinstance(s, str) else None + True if s == "" else (s.isascii() if isinstance(s, str) else None) + #s.isascii() if isinstance(s, str) else None for s in self._pa_array.to_pylist() ] From 780c5d092676cf65ee59076fab7a8131e31f51a1 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 22:16:38 -0500 Subject: [PATCH 14/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 3c208058ea3e1..12a6efa087011 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -271,17 +271,16 @@ def _str_isascii(self): def _str_isascii(self): # Use a hypothetical `pc.utf8_is_ascii` if PyArrow has implemented it - if hasattr(pc, "utf8_is_ascii"): - result = pc.utf8_is_ascii(self._pa_array) - else: + #if hasattr(pc, "utf8_is_ascii"): + #result = pc.utf8_is_ascii(self._pa_array) + #else: # Fallback: Apply Python's `str.isascii` manually and convert to PyArrow - pylist = [ - True if s == "" else (s.isascii() if isinstance(s, str) else None) - #s.isascii() if isinstance(s, str) else None - for s in self._pa_array.to_pylist() - ] + pylist = [ + s.isascii() if isinstance(s, str) else None + for s in self._pa_array.to_pylist() + ] - result = pa.array(pylist, type=pa.bool_()) + result = pa.array(pylist, type=pa.bool_()) return self._convert_bool_result(result) def _str_isdecimal(self): From 5591ac3bcdc7fb952759c4f51f2b12d9878fc26e Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 22:26:19 -0500 Subject: [PATCH 15/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 12a6efa087011..4215a6484029d 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -269,19 +269,14 @@ def _str_isascii(self): # ] def _str_isascii(self): - - # Use a hypothetical `pc.utf8_is_ascii` if PyArrow has implemented it - #if hasattr(pc, "utf8_is_ascii"): - #result = pc.utf8_is_ascii(self._pa_array) - #else: - # Fallback: Apply Python's `str.isascii` manually and convert to PyArrow pylist = [ s.isascii() if isinstance(s, str) else None for s in self._pa_array.to_pylist() ] result = pa.array(pylist, type=pa.bool_()) - return self._convert_bool_result(result) + #return self._convert_bool_result(result) + return result.to_numpy() def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) From dc3f2e94a378944ac484478674deb3b3575274e1 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 22:28:49 -0500 Subject: [PATCH 16/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 4215a6484029d..ab2bc4c70f030 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -275,8 +275,7 @@ def _str_isascii(self): ] result = pa.array(pylist, type=pa.bool_()) - #return self._convert_bool_result(result) - return result.to_numpy() + return self._convert_bool_result(result) def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) From e8c791f92b66bcc8ab2c170b3c96d717b0325cf9 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 22:43:42 -0500 Subject: [PATCH 17/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index ab2bc4c70f030..6c89ed8a9c00c 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -268,15 +268,19 @@ def _str_isascii(self): # for s in self.to_pylist() # ] - def _str_isascii(self): - pylist = [ - s.isascii() if isinstance(s, str) else None - for s in self._pa_array.to_pylist() - ] + # def _str_isascii(self): + # pylist = [ + # s.isascii() if isinstance(s, str) else None + # for s in self._pa_array.to_pylist() + # ] - result = pa.array(pylist, type=pa.bool_()) + # result = pa.array(pylist, type=pa.bool_()) + # return self._convert_bool_result(result) + + def _str_isascii(self): + result = all(ord(char) < 128 for char in self) return self._convert_bool_result(result) - + def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) return self._convert_bool_result(result) From 4ecf88de22de94aaad9b1e1cc986e96a26c227d8 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 22:45:40 -0500 Subject: [PATCH 18/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 6c89ed8a9c00c..84e48183d4bb8 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -277,8 +277,22 @@ def _str_isascii(self): # result = pa.array(pylist, type=pa.bool_()) # return self._convert_bool_result(result) + # def _str_isascii(self): + # result = all(ord(char) < 128 for char in self) + # return self._convert_bool_result(result) + def _str_isascii(self): - result = all(ord(char) < 128 for char in self) + # Handle the case where self might be a pandas StringArray or similar + # Check if each value is not NA, and is a string of length 1 (single character) + def check_ascii(value): + if pd.isna(value): + return False # Or return True depending on how you want to handle NA values + if len(value) != 1: + raise ValueError(f"Expected a string of length 1, got: {value}") + return ord(value) < 128 + + # Apply the check to each element in `self` (assuming `self` is iterable like pandas Series) + result = all(check_ascii(char) for char in self) return self._convert_bool_result(result) def _str_isdecimal(self): From 1a6a26b595010742154c025267915cb3b5d7b7f1 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 22:46:53 -0500 Subject: [PATCH 19/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 84e48183d4bb8..11f0d6d998853 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -284,8 +284,11 @@ def _str_isascii(self): def _str_isascii(self): # Handle the case where self might be a pandas StringArray or similar # Check if each value is not NA, and is a string of length 1 (single character) + def isna(value): + return value is None + def check_ascii(value): - if pd.isna(value): + if isna(value): return False # Or return True depending on how you want to handle NA values if len(value) != 1: raise ValueError(f"Expected a string of length 1, got: {value}") From 502bdbdd345e711af8e47b6133a71951f63c16c4 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 22:48:43 -0500 Subject: [PATCH 20/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 36 +++++++++++----------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 11f0d6d998853..549b74d0b78d2 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -277,27 +277,27 @@ def _str_isascii(self): # result = pa.array(pylist, type=pa.bool_()) # return self._convert_bool_result(result) - # def _str_isascii(self): - # result = all(ord(char) < 128 for char in self) - # return self._convert_bool_result(result) - def _str_isascii(self): - # Handle the case where self might be a pandas StringArray or similar - # Check if each value is not NA, and is a string of length 1 (single character) - def isna(value): - return value is None - - def check_ascii(value): - if isna(value): - return False # Or return True depending on how you want to handle NA values - if len(value) != 1: - raise ValueError(f"Expected a string of length 1, got: {value}") - return ord(value) < 128 - - # Apply the check to each element in `self` (assuming `self` is iterable like pandas Series) - result = all(check_ascii(char) for char in self) + result = all(ord(char) < 128 for char in self) return self._convert_bool_result(result) + # def _str_isascii(self): + # # Handle the case where self might be a pandas StringArray or similar + # # Check if each value is not NA, and is a string of length 1 (single character) + # def isna(value): + # return value is None + + # def check_ascii(value): + # if isna(value): + # return False # Or return True depending on how you want to handle NA values + # if len(value) != 1: + # raise ValueError(f"Expected a string of length 1, got: {value}") + # return ord(value) < 128 + + # # Apply the check to each element in `self` (assuming `self` is iterable like pandas Series) + # result = all(check_ascii(char) for char in self) + # return self._convert_bool_result(result) + def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) return self._convert_bool_result(result) From 77744aad4d5ecd5c63b242918bc651356aac4b52 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 22:54:24 -0500 Subject: [PATCH 21/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 549b74d0b78d2..3fa75d5e21c8b 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -278,8 +278,11 @@ def _str_isascii(self): # return self._convert_bool_result(result) def _str_isascii(self): - result = all(ord(char) < 128 for char in self) - return self._convert_bool_result(result) + if isinstance(self, str): + result = all(ord(char) < 128 for char in self) + return self._convert_bool_result(result) + return None + # def _str_isascii(self): # # Handle the case where self might be a pandas StringArray or similar From cf90253572e2807b7217da2155aebc418c1b6499 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 22:55:51 -0500 Subject: [PATCH 22/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 25 ++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 3fa75d5e21c8b..b42c3195f6f42 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -277,11 +277,28 @@ def _str_isascii(self): # result = pa.array(pylist, type=pa.bool_()) # return self._convert_bool_result(result) + # def _str_isascii(self): + # if isinstance(self, str): + # result = all(ord(char) < 128 for char in self) + # return self._convert_bool_result(result) + # return None + def _str_isascii(self): - if isinstance(self, str): - result = all(ord(char) < 128 for char in self) - return self._convert_bool_result(result) - return None + # Assuming self._pa_array is a PyArrow array of strings + pylist = [] + for s in self._pa_array: + if s is None: # Handle None explicitly + pylist.append(None) + elif isinstance(s, str): + # Check if the string is ASCII using ord() on each character + result = all(ord(char) < 128 for char in s) + pylist.append(result) + else: + pylist.append(None) # If it's not a string, append None + + # Convert the result back to a PyArrow array (or pandas series if needed) + result = pa.array(pylist, type=pa.bool_()) # Use boolean type + return self._convert_bool_result(result) # def _str_isascii(self): From e37a6b1c3b3616f018cfe7a50523e25e9993d27e Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 22:57:42 -0500 Subject: [PATCH 23/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index b42c3195f6f42..2c24378ab3409 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -282,13 +282,12 @@ def _str_isascii(self): # result = all(ord(char) < 128 for char in self) # return self._convert_bool_result(result) # return None - def _str_isascii(self): - # Assuming self._pa_array is a PyArrow array of strings pylist = [] + for s in self._pa_array: if s is None: # Handle None explicitly - pylist.append(None) + pylist.append(None) # Keep None for PyArrow handling elif isinstance(s, str): # Check if the string is ASCII using ord() on each character result = all(ord(char) < 128 for char in s) @@ -296,8 +295,10 @@ def _str_isascii(self): else: pylist.append(None) # If it's not a string, append None - # Convert the result back to a PyArrow array (or pandas series if needed) - result = pa.array(pylist, type=pa.bool_()) # Use boolean type + # Convert the result back to a PyArrow array with nullable booleans (pa.bool_()) + result = pa.array(pylist, type=pa.bool_(), mask=[(v is None) for v in pylist]) + + # Use _convert_bool_result to process the result return self._convert_bool_result(result) From 1647cf9f5a0fa818ac91d28c95445bc69e03c652 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 22:58:40 -0500 Subject: [PATCH 24/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 26 ++++------------------ 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 2c24378ab3409..ba886d91c70cf 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -277,29 +277,11 @@ def _str_isascii(self): # result = pa.array(pylist, type=pa.bool_()) # return self._convert_bool_result(result) - # def _str_isascii(self): - # if isinstance(self, str): - # result = all(ord(char) < 128 for char in self) - # return self._convert_bool_result(result) - # return None def _str_isascii(self): - pylist = [] - - for s in self._pa_array: - if s is None: # Handle None explicitly - pylist.append(None) # Keep None for PyArrow handling - elif isinstance(s, str): - # Check if the string is ASCII using ord() on each character - result = all(ord(char) < 128 for char in s) - pylist.append(result) - else: - pylist.append(None) # If it's not a string, append None - - # Convert the result back to a PyArrow array with nullable booleans (pa.bool_()) - result = pa.array(pylist, type=pa.bool_(), mask=[(v is None) for v in pylist]) - - # Use _convert_bool_result to process the result - return self._convert_bool_result(result) + if isinstance(self, str): + result = all(ord(char) < 128 for char in self) + return self._convert_bool_result(result) + return False # def _str_isascii(self): From 9f9b0babf495bfdd694629e7eecf36e24f7183b3 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 23:00:13 -0500 Subject: [PATCH 25/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index ba886d91c70cf..4b1bd7ca25050 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -253,9 +253,6 @@ def _str_isalpha(self): result = pc.utf8_is_alpha(self._pa_array) return self._convert_bool_result(result) - def _str_isascii(self): - result = pc.ascii_is_ascii(self._pa_array) - return self._convert_bool_result(result) # def _str_isascii(self): # if hasattr(pa.compute, "ascii_is_ascii"): From 767cc9524456a04877e7eeda52f6364e5b1c26b6 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 23:01:53 -0500 Subject: [PATCH 26/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 4b1bd7ca25050..7d0276156c456 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -274,12 +274,23 @@ def _str_isalpha(self): # result = pa.array(pylist, type=pa.bool_()) # return self._convert_bool_result(result) + # def _str_isascii(self): + # if isinstance(self, str): + # result = all(ord(char) < 128 for char in self) + # return self._convert_bool_result(result) + # return False + def _str_isascii(self): - if isinstance(self, str): - result = all(ord(char) < 128 for char in self) + # Check if the array is of string type (or if the implementation should handle it) + if isinstance(self._pa_array, pa.StringArray): + # Apply the isascii check element-wise using PyArrow's compute functions + result = pc.ascii_is_ascii(self._pa_array) + + # Convert the result to the appropriate format (e.g., PyArrow BooleanArray) return self._convert_bool_result(result) - return False - + + # If the input is not a StringArray, handle accordingly (e.g., return False or None) + return False # def _str_isascii(self): # # Handle the case where self might be a pandas StringArray or similar From ffa15c38fa229ea02e4504a3a3cea3235bcbde31 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Sun, 8 Dec 2024 23:02:34 -0500 Subject: [PATCH 27/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 7d0276156c456..eacae01b6f903 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -290,7 +290,7 @@ def _str_isascii(self): return self._convert_bool_result(result) # If the input is not a StringArray, handle accordingly (e.g., return False or None) - return False + return False # def _str_isascii(self): # # Handle the case where self might be a pandas StringArray or similar From cc92867daa0e17c37e6b69a3a50eb5a72c48ef29 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Mon, 9 Dec 2024 13:07:12 -0500 Subject: [PATCH 28/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index eacae01b6f903..6420087f825b5 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -281,16 +281,11 @@ def _str_isalpha(self): # return False def _str_isascii(self): - # Check if the array is of string type (or if the implementation should handle it) - if isinstance(self._pa_array, pa.StringArray): - # Apply the isascii check element-wise using PyArrow's compute functions - result = pc.ascii_is_ascii(self._pa_array) + # Check if the array is of string type (or if the implementation should handle it) # Apply the isascii check element-wise using PyArrow's compute functions + result = pc.ascii_is_ascii(self._pa_array) # Convert the result to the appropriate format (e.g., PyArrow BooleanArray) - return self._convert_bool_result(result) - - # If the input is not a StringArray, handle accordingly (e.g., return False or None) - return False + return self._convert_bool_result(result) # def _str_isascii(self): # # Handle the case where self might be a pandas StringArray or similar From bea1cfaffc902e4c25fe70f87d798c3ea8878b88 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Mon, 9 Dec 2024 13:10:43 -0500 Subject: [PATCH 29/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 6420087f825b5..f7bef42999e23 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -282,7 +282,7 @@ def _str_isalpha(self): def _str_isascii(self): # Check if the array is of string type (or if the implementation should handle it) # Apply the isascii check element-wise using PyArrow's compute functions - result = pc.ascii_is_ascii(self._pa_array) + result = pc.string_is_ascii(self._pa_array) # Convert the result to the appropriate format (e.g., PyArrow BooleanArray) return self._convert_bool_result(result) From 30d802af86eafab2eb25c67d17ac4c8c4f6ccfff Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Mon, 9 Dec 2024 13:11:46 -0500 Subject: [PATCH 30/38] ascii3 --- pandas/core/arrays/_arrow_string_mixins.py | 47 ---------------------- 1 file changed, 47 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index f7bef42999e23..3ee4c7e1e82f8 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -252,58 +252,11 @@ def _str_isalnum(self): def _str_isalpha(self): result = pc.utf8_is_alpha(self._pa_array) return self._convert_bool_result(result) - - - # def _str_isascii(self): - # if hasattr(pa.compute, "ascii_is_ascii"): - # # If PyArrow adds support in the future, use it - # return pa.compute.ascii_is_ascii(self._data) - # else: - # # Fallback: Use Python's native str.isascii() on each element - # return [ - # s.isascii() if isinstance(s, str) else None - # for s in self.to_pylist() - # ] - - # def _str_isascii(self): - # pylist = [ - # s.isascii() if isinstance(s, str) else None - # for s in self._pa_array.to_pylist() - # ] - - # result = pa.array(pylist, type=pa.bool_()) - # return self._convert_bool_result(result) - - # def _str_isascii(self): - # if isinstance(self, str): - # result = all(ord(char) < 128 for char in self) - # return self._convert_bool_result(result) - # return False def _str_isascii(self): - # Check if the array is of string type (or if the implementation should handle it) # Apply the isascii check element-wise using PyArrow's compute functions result = pc.string_is_ascii(self._pa_array) - - # Convert the result to the appropriate format (e.g., PyArrow BooleanArray) return self._convert_bool_result(result) - # def _str_isascii(self): - # # Handle the case where self might be a pandas StringArray or similar - # # Check if each value is not NA, and is a string of length 1 (single character) - # def isna(value): - # return value is None - - # def check_ascii(value): - # if isna(value): - # return False # Or return True depending on how you want to handle NA values - # if len(value) != 1: - # raise ValueError(f"Expected a string of length 1, got: {value}") - # return ord(value) < 128 - - # # Apply the check to each element in `self` (assuming `self` is iterable like pandas Series) - # result = all(check_ascii(char) for char in self) - # return self._convert_bool_result(result) - def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) return self._convert_bool_result(result) From 74cf361306c3529c90371d7132b8d17d8ad32744 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Mon, 9 Dec 2024 13:37:16 -0500 Subject: [PATCH 31/38] style --- pandas/core/arrays/_arrow_string_mixins.py | 4 ++-- pandas/core/strings/accessor.py | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 3ee4c7e1e82f8..1ca52ce64bd77 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -252,11 +252,11 @@ def _str_isalnum(self): def _str_isalpha(self): result = pc.utf8_is_alpha(self._pa_array) return self._convert_bool_result(result) - + def _str_isascii(self): result = pc.string_is_ascii(self._pa_array) return self._convert_bool_result(result) - + def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) return self._convert_bool_result(result) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 5bfe581020b2b..5faaac59a5348 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3385,7 +3385,8 @@ def len(self): # cases: # upper, lower, title, capitalize, swapcase, casefold # boolean: - # isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle isascii + # isalpha, isnumeric isalnum isdigit isdecimal isspace islower + # isupper istitle isascii # _doc_args holds dict of strings to use in substituting casemethod docs _doc_args: dict[str, dict[str, str]] = {} _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""} @@ -3708,9 +3709,9 @@ def casefold(self): Examples ------------ - The ``s5.str.isascii`` method checks for whether all characters are ascii characters, - which includes digits 0-9, capital and lowercase letters A-Z, and some other - special characters. + The ``s5.str.isascii`` method checks for whether all characters are ascii + characters, which includes digits 0-9, capital and lowercase letters A-Z, + and some other special characters. >>> s5 = pd.Series(['ö', 'see123', 'hello world', '']) >>> s5.str.isascii() From a77fd7dab736bd0414ef6672203b65e733c32854 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Mon, 9 Dec 2024 13:38:52 -0500 Subject: [PATCH 32/38] style --- pandas/core/strings/accessor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 5faaac59a5348..73f48e2f6dd65 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3385,7 +3385,7 @@ def len(self): # cases: # upper, lower, title, capitalize, swapcase, casefold # boolean: - # isalpha, isnumeric isalnum isdigit isdecimal isspace islower + # isalpha, isnumeric isalnum isdigit isdecimal isspace islower # isupper istitle isascii # _doc_args holds dict of strings to use in substituting casemethod docs _doc_args: dict[str, dict[str, str]] = {} @@ -3709,8 +3709,8 @@ def casefold(self): Examples ------------ - The ``s5.str.isascii`` method checks for whether all characters are ascii - characters, which includes digits 0-9, capital and lowercase letters A-Z, + The ``s5.str.isascii`` method checks for whether all characters are ascii + characters, which includes digits 0-9, capital and lowercase letters A-Z, and some other special characters. >>> s5 = pd.Series(['ö', 'see123', 'hello world', '']) From f032dd73f891b8fb180618cb6a8b530761699408 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Mon, 9 Dec 2024 14:20:22 -0500 Subject: [PATCH 33/38] style --- pandas/core/strings/object_array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 2451d2fd0bb83..b4e1c2f0e4c53 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -451,13 +451,13 @@ def _str_upper(self): def _str_isalnum(self): return self._str_map(str.isalnum, dtype="bool") - + def _str_isalpha(self): return self._str_map(str.isalpha, dtype="bool") def _str_isascii(self): return self._str_map(str.isascii, dtype="bool") - + def _str_isdecimal(self): return self._str_map(str.isdecimal, dtype="bool") From cebe5e76f00e14e1c1087eccdccdc853d85ef534 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Mon, 9 Dec 2024 14:56:51 -0500 Subject: [PATCH 34/38] style --- pandas/core/strings/object_array.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index b4e1c2f0e4c53..a07ab9534f491 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -451,13 +451,13 @@ def _str_upper(self): def _str_isalnum(self): return self._str_map(str.isalnum, dtype="bool") - + def _str_isalpha(self): return self._str_map(str.isalpha, dtype="bool") - + def _str_isascii(self): return self._str_map(str.isascii, dtype="bool") - + def _str_isdecimal(self): return self._str_map(str.isdecimal, dtype="bool") From 5b12221b7e4e1ff68c469c8dc40d5b5b255a9812 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Mon, 9 Dec 2024 19:59:07 -0500 Subject: [PATCH 35/38] docs --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ab5746eca1b18..cd798e1caa857 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -64,6 +64,7 @@ Other enhancements - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) - Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) +- Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` for :class:`StringMethods` (:issue:`59091`) .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: From a7e659acac14a7a72c3423b0f644427d83755bf3 Mon Sep 17 00:00:00 2001 From: Abby VeCasey Date: Mon, 9 Dec 2024 20:27:32 -0500 Subject: [PATCH 36/38] reset --- .github/workflows/code-checks.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index f1bdab700d4ca..e1d2d1ea846b8 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -4,7 +4,6 @@ on: push: branches: - main - - ascii_edits - 2.3.x pull_request: branches: From 25a578a9f9b4291c6d8fbbad8b29645bd6e04e87 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Jan 2025 19:46:14 +0000 Subject: [PATCH 37/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 2c4bd46c23cf4..b5e145553382e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -60,12 +60,12 @@ Other enhancements - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) +- Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` for :class:`StringMethods` (:issue:`59091`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) - Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) -- Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` for :class:`StringMethods` (:issue:`59091`) .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: From daff8505669ff6bf59034893307384ab1028aad2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 2 Jan 2025 11:48:02 -0800 Subject: [PATCH 38/38] Update doc/source/whatsnew/v3.0.0.rst --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b5e145553382e..94c289ef3ace7 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -60,7 +60,7 @@ Other enhancements - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) -- Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` for :class:`StringMethods` (:issue:`59091`) +- Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)