Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions python/arcticdb/version_store/_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,9 @@ def from_npd_df(df):
def _accept_array_string(v):
    """Return True when ``v`` is exactly a ``str``, ``bytes``, ``np.str_`` or ``np.bytes_``.

    The comparison is made on ``type(v)`` itself rather than with
    ``isinstance``, so instances of any other type -- including user-defined
    subclasses of ``str`` or ``bytes`` -- yield ``False``.
    """
    # TODO remove this once arctic keeps the string type under the hood
    # and does not transform string into bytes
    # Use strict type equality to support numpy string types (np.str_, np.bytes_)
    # but not arbitrary subclasses (see issue #704)
    return type(v) in (str, bytes, np.str_, np.bytes_)


def _is_nan(element):
Expand Down Expand Up @@ -187,7 +189,9 @@ def get_sample_from_non_empty_arr(arr, arr_name):

def coerce_string_column_to_fixed_length_array(arr, to_type, string_max_len):
# in python3 all text will be treated as unicode
if to_type == str:
# Use strict type equality to support numpy string types (np.str_)
# but not arbitrary subclasses (see issue #704)
if to_type in (str, np.str_):
if sys.platform == "win32":
# See https://sourceforge.net/p/numpy/mailman/numpy-discussion/thread/1139250278.7538.52.camel%40localhost.localdomain/#msg11998404
# Different wchar size on Windows is not compatible with our current internal representation of Numpy strings
Expand Down
58 changes: 58 additions & 0 deletions python/tests/unit/arcticdb/version_store/test_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
DataFrameNormalizer,
NdArrayNormalizer,
NPDDataFrame,
_accept_array_string,
)
from arcticdb.version_store._common import TimeFrame
from arcticdb.util.test import (
Expand Down Expand Up @@ -378,6 +379,27 @@ def test_timestamp_without_tz():
assert rt_dt == dt


def test_accept_array_string_with_numpy_types():
    """Check which value types ``_accept_array_string`` accepts and rejects.

    Regression test for https://github.com/man-group/ArcticDB/issues/2800
    """
    # Regular Python types should be accepted
    assert _accept_array_string("hello") is True
    assert _accept_array_string(b"hello") is True

    # NumPy string types should also be accepted
    assert _accept_array_string(np.str_("hello")) is True
    assert _accept_array_string(np.bytes_(b"hello")) is True

    # Non-string types should be rejected
    assert _accept_array_string(123) is False
    assert _accept_array_string(12.34) is False
    assert _accept_array_string([1, 2, 3]) is False
    assert _accept_array_string({"key": "value"}) is False
    assert _accept_array_string(None) is False

    # The check is on the exact type, so user-defined subclasses of str/bytes
    # must be rejected even though isinstance() would accept them.
    class _StrSubclass(str):
        pass

    class _BytesSubclass(bytes):
        pass

    assert _accept_array_string(_StrSubclass("hello")) is False
    assert _accept_array_string(_BytesSubclass(b"hello")) is False


def test_column_with_mixed_types():
df = pd.DataFrame({"col": [1, "a"]})
with pytest.raises(ArcticDbNotYetImplemented):
Expand Down Expand Up @@ -901,6 +923,42 @@ def test_arrays_throw_without_pickling(lmdb_version_store_v1):
lib.write(sym, df)


def test_numpy_str_type_normalization(lmdb_version_store, sym):
    """Write columns containing ``np.str_`` values and check what is read back.

    Regression test for https://github.com/man-group/ArcticDB/issues/2800
    """
    store = lmdb_version_store

    # Column built only from np.str_ scalars.
    numpy_only = pd.DataFrame({"col": [np.str_("hello"), np.str_("world")]})
    store.write(sym, numpy_only)
    numpy_only_back = store.read(sym).data
    # Values read back must compare equal to the plain strings.
    assert numpy_only_back["col"][0] == "hello"
    assert numpy_only_back["col"][1] == "world"

    # Column mixing an np.str_ scalar with a built-in str, stored under its own symbol.
    mixed_sym = sym + "_mixed"
    mixed = pd.DataFrame({"col": [np.str_("numpy_str"), "regular_str"]})
    store.write(mixed_sym, mixed)
    mixed_back = store.read(mixed_sym).data
    assert mixed_back["col"][0] == "numpy_str"
    assert mixed_back["col"][1] == "regular_str"


def test_numpy_bytes_type_normalization(lmdb_version_store, sym):
    """Write a column of ``np.bytes_`` values and check what is read back."""
    store = lmdb_version_store

    # Column built only from np.bytes_ scalars.
    frame = pd.DataFrame({"col": [np.bytes_(b"hello"), np.bytes_(b"world")]})
    store.write(sym, frame)
    frame_back = store.read(sym).data

    # Values read back must compare equal to the plain bytes objects.
    assert frame_back["col"][0] == b"hello"
    assert frame_back["col"][1] == b"world"


def test_series_zero_name(lmdb_version_store, sym):
lib = lmdb_version_store
series = pd.Series(
Expand Down
Loading