diff --git a/doc/source/whatsnew/v2.3.2.rst b/doc/source/whatsnew/v2.3.2.rst index 53a8d28687518..2ebab54c517ea 100644 --- a/doc/source/whatsnew/v2.3.2.rst +++ b/doc/source/whatsnew/v2.3.2.rst @@ -22,6 +22,8 @@ become the default string dtype in pandas 3.0. See Bug fixes ^^^^^^^^^ +- Fix :meth:`~Series.str.isdigit` to correctly recognize unicode superscript + characters as digits for :class:`StringDtype` backed by PyArrow (:issue:`61466`) - Fix :meth:`~DataFrame.to_json` with ``orient="table"`` to correctly use the "string" type in the JSON Table Schema for :class:`StringDtype` columns (:issue:`61889`) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 502dd8a1541f0..815c13844f6dd 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -16,6 +16,7 @@ HAS_PYARROW, pa_version_under13p0, pa_version_under17p0, + pa_version_under21p0, ) if HAS_PYARROW: @@ -259,6 +260,12 @@ def _str_isdecimal(self): return self._convert_bool_result(result) def _str_isdigit(self): + if pa_version_under21p0: + # https://github.com/pandas-dev/pandas/issues/61466 + res_list = self._apply_elementwise(str.isdigit) + return self._convert_bool_result( + pa.chunked_array(res_list, type=pa.bool_()) + ) result = pc.utf8_is_digit(self._pa_array) return self._convert_bool_result(result) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index fb3a3b8d60b6b..932dc187932b0 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -6,12 +6,15 @@ import numpy as np import pytest +from pandas.compat import pa_version_under21p0 + from pandas import ( NA, DataFrame, Index, MultiIndex, Series, + StringDtype, option_context, ) import pandas._testing as tm @@ -246,8 +249,9 @@ def test_ismethods(method, expected, any_string_dtype): @pytest.mark.parametrize( "method, expected", [ - ("isnumeric", [False, True, True, False, True, True, False]), - ("isdecimal", [False, True, False, False, False, True, False]), + ("isnumeric", [False, True, True, True, False, True, True, False]), + ("isdecimal", [False, True, False, False, False, False, True, False]), + ("isdigit", [False, True, True, False, False, False, True, False]), ], ) def test_isnumeric_unicode(method, expected, any_string_dtype): @@ -256,13 +260,23 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY # 0xFF13: 3 Em 3 # noqa: RUF003 ser = Series( - ["A", "3", "¼", "★", "፸", "3", "four"], # noqa: RUF001 + ["A", "3", "³", "¼", "★", "፸", "3", "four"], # noqa: RUF001 dtype=any_string_dtype, ) expected_dtype = ( "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) expected = Series(expected, dtype=expected_dtype) + if ( + method == "isdigit" + and isinstance(ser.dtype, StringDtype) + and ser.dtype.storage == "pyarrow" + and not pa_version_under21p0 + ): + # known difference in behavior between python and pyarrow unicode handling + # pyarrow 21+ considers ¼ as a digit, while python does not + expected.iloc[3] = True + result = getattr(ser.str, method)() tm.assert_series_equal(result, expected)