Skip to content

Commit 102060a

Browse files
authored
Merge pull request #23 from kolibril13/string_limit
String limit
2 parents a78612b + 6d47d33 commit 102060a

File tree

4 files changed

+157
-13
lines changed

4 files changed

+157
-13
lines changed

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,23 @@ uv run -m pytest
6363

6464
# Changelog
6565

66+
## Version 0.1.8
67+
68+
- For performance reasons, the number of unique strings is limited to 3000 by default (processing this many strings takes about 7 seconds on a Mac M3)
69+
- Note: This limit applies to the number of unique strings, not the total count. For example, a CSV with 100,000 strings but only 100 unique values won't be affected by the limit.
70+
- You can override this limit when using the API directly:
71+
```py
72+
import string
73+
import numpy as np
74+
import polars as pl
75+
from csv_importer.parsers import polars_df_to_bob
76+
77+
n = 4000
78+
79+
random_strings = [''.join(np.random.choice(list(string.ascii_lowercase), size=10)) for _ in range(n)]
80+
df = pl.DataFrame({"strings": random_strings, "numbers": np.arange(n)})
81+
bob = polars_df_to_bob(df, name="TestBob", string_limit=4001)
82+
```
6683

6784
## Version 0.1.7
6885

csv_importer/parsers.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,32 +2,41 @@
22
import polars as pl
33
import numpy as np
44
import bpy
5+
import warnings
56

67

7-
def polars_df_to_bob(df: pl.DataFrame, name: str) -> db.BlenderObject:
8+
def polars_df_to_bob(df: pl.DataFrame, name: str, string_limit: int = 3000) -> db.BlenderObject:
89
vertices = np.zeros((len(df), 3), dtype=np.float32)
910
bob = db.create_bob(vertices, name=name)
1011

11-
update_bob_from_polars_df(bob, df)
12+
update_bob_from_polars_df(bob, df, string_limit=string_limit)
1213
return bob
1314

14-
15-
def update_obj_from_csv(obj: bpy.types.Object, csv_file: str) -> None:
15+
16+
def update_obj_from_csv(obj: bpy.types.Object, csv_file: str, string_limit: int = 3000) -> None:
1617
bob = db.BlenderObject(obj)
1718
df = pl.read_csv(csv_file)
1819
if len(df) != len(bob):
1920
bob.new_from_pydata(np.zeros((len(df), 3), dtype=np.float32))
20-
update_bob_from_polars_df(bob, df)
21+
update_bob_from_polars_df(bob, df, string_limit=string_limit)
2122

2223

23-
def update_bob_from_polars_df(bob: db.BlenderObject, df: pl.DataFrame) -> None:
24+
def update_bob_from_polars_df(bob: db.BlenderObject, df: pl.DataFrame, string_limit: int = 3000) -> None:
2425
for col in df.columns:
2526
col_dtype = df[col].dtype
26-
if col_dtype in [pl.Utf8]: # skip strings
27-
data = np.vstack(df[col].fill_null("").to_numpy())
27+
if col_dtype in [pl.Utf8]: # handle strings
28+
# Convert to numpy array and fill nulls with empty string
29+
data = df[col].fill_null("").to_numpy()
2830
unique, encoding = np.unique(data, return_inverse=True)
29-
bob.store_named_attribute(encoding, col)
30-
db.nodes.custom_string_iswitch("{}: {}".format(bob.name, col), unique, col)
31+
# Only add strings when there are less than the string limit
32+
if len(unique) <= string_limit:
33+
bob.store_named_attribute(encoding, col)
34+
db.nodes.custom_string_iswitch("{}: {}".format(bob.name, col), unique, col)
35+
else:
36+
warning_message = f"Column '{col}' has {len(unique)} unique strings, which exceeds the limit of {string_limit}. This column will be skipped. You can increase the limit with the string_limit parameter."
37+
warnings.warn(warning_message)
38+
self = bpy.context.window_manager
39+
self.popup_menu(lambda self, context: self.layout.label(text=warning_message), title="Warning", icon='ERROR')
3140
else:
3241
data = np.vstack(df[col].to_numpy())
3342
bob.store_named_attribute(data, col)

debugging_notebook.ipynb

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -400,12 +400,80 @@
400400
" bob.store_named_attribute(data, col)"
401401
]
402402
},
403+
{
404+
"cell_type": "markdown",
405+
"metadata": {},
406+
"source": [
407+
"# self-contained: setting string attributes"
408+
]
409+
},
403410
{
404411
"cell_type": "code",
405412
"execution_count": null,
406413
"metadata": {},
407414
"outputs": [],
408-
"source": []
415+
"source": [
416+
"\n",
417+
"import numpy as np\n",
418+
"import polars as pl\n",
419+
"import databpy as db\n",
420+
"\n",
421+
"# Minimal data setup\n",
422+
"df = pl.DataFrame({\n",
423+
" \"strings\": [\"apple\", \"banana\", \"apple\", None, \"orange\", \"banana\", \"kiwi\", \"apple\", \"kiwi\", \"banana\"],\n",
424+
" \"numbers\": np.arange(10),\n",
425+
"})\n",
426+
"\n",
427+
"# random_verts dependent on df length\n",
428+
"random_verts = np.random.rand(len(df), 3)\n",
429+
"bob = db.create_bob(random_verts)\n",
430+
"\n",
431+
"for col in df.columns:\n",
432+
" if df[col].dtype == pl.Utf8:\n",
433+
" data = df[col].fill_null(\"\").to_numpy()\n",
434+
" unique, encoding = np.unique(data, return_inverse=True)\n",
435+
" bob.store_named_attribute(encoding, col)\n",
436+
" db.nodes.custom_string_iswitch(f\"{bob.name}: {col}\", unique, col)\n",
437+
" else:\n",
438+
" bob.store_named_attribute(df[col].to_numpy(), col)"
439+
]
440+
},
441+
{
442+
"cell_type": "code",
443+
"execution_count": null,
444+
"metadata": {},
445+
"outputs": [],
446+
"source": [
447+
"# increased string limit\n",
448+
"import string\n",
449+
"import numpy as np\n",
450+
"import polars as pl\n",
451+
"import databpy as db\n",
452+
"from csv_importer.parsers import update_bob_from_polars_df\n",
453+
"\n",
454+
"n = 4000\n",
455+
"random_strings = [''.join(np.random.choice(list(string.ascii_lowercase), size=10)) for _ in range(n)]\n",
456+
"df = pl.DataFrame({\"strings\": random_strings, \"numbers\": np.arange(n)})\n",
457+
"bob = db.create_bob(np.random.rand(n, 3))\n",
458+
"update_bob_from_polars_df(bob, df, string_limit=4001)"
459+
]
460+
},
461+
{
462+
"cell_type": "code",
463+
"execution_count": null,
464+
"metadata": {},
465+
"outputs": [],
466+
"source": [
467+
"import string\n",
468+
"import numpy as np\n",
469+
"import polars as pl\n",
470+
"from csv_importer.parsers import polars_df_to_bob\n",
471+
"\n",
472+
"n = 1000\n",
473+
"random_strings = [''.join(np.random.choice(list(string.ascii_lowercase), size=10)) for _ in range(n)]\n",
474+
"df = pl.DataFrame({\"strings\": random_strings, \"numbers\": np.arange(n)})\n",
475+
"bob = polars_df_to_bob(df, name=\"TestBob\")\n"
476+
]
409477
}
410478
],
411479
"metadata": {

tests/test_parsers.py

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22
import polars as pl
33
import numpy as np
44
from io import StringIO
5-
from csv_importer.parsers import polars_df_to_bob
6-
5+
from csv_importer.parsers import polars_df_to_bob, update_bob_from_polars_df
6+
import warnings
7+
import string
8+
import bpy
9+
from unittest.mock import patch
710

811
@pytest.fixture
912
def test_df():
@@ -57,3 +60,50 @@ def test_polars_df_to_bob_with_datatypes():
5760
# # Validate that "StringVal" does not exist as a numerical attribute
5861
# with pytest.raises(AttributeError):
5962
# bob.named_attribute("StringVal")
63+
64+
65+
66+
def test_string_limit_functionality():
67+
"""Test string limit functionality without using mocks."""
68+
# Create test data with specific string values and a null
69+
test_strings = ["apple", None, "banana", "apple", "cherry"]
70+
test_data = {
71+
"strings": test_strings,
72+
"numbers": np.arange(len(test_strings))
73+
}
74+
df = pl.DataFrame(test_data)
75+
76+
# Test with normal string limit (should process all strings)
77+
bob = polars_df_to_bob(df, name="TestStringLimit", string_limit=10)
78+
79+
# Verify the numeric column was processed
80+
numbers_attr = bob.named_attribute("numbers")
81+
assert np.array_equal(numbers_attr, np.arange(len(test_strings)))
82+
83+
# Verify the string column was processed and encoded correctly
84+
# The encoding should map: "" (for None) -> 0, "apple" -> 1, "banana" -> 2, "cherry" -> 3
85+
# So the expected encoding is [1, 0, 2, 1, 3]
86+
strings_attr = bob.named_attribute("strings")
87+
expected_encoding = np.array([1, 0, 2, 1, 3])
88+
assert np.array_equal(strings_attr, expected_encoding)
89+
90+
# Test with very low string limit (should skip string column)
91+
# Use a mocked warning instead of popup_menu which causes segfault
92+
with warnings.catch_warnings(record=True) as w:
93+
warnings.simplefilter("always")
94+
95+
# Use patch to avoid the read-only attribute error
96+
with patch.object(bpy.context.window_manager.__class__, 'popup_menu', create=True, new=lambda *args, **kwargs: None):
97+
# Create a new bob with a low string limit
98+
limited_bob = polars_df_to_bob(df, name="LimitedStringTest", string_limit=2)
99+
100+
# Check that a warning was raised
101+
assert any("exceeds the limit" in str(warning.message) for warning in w)
102+
103+
# Verify the numeric column was processed
104+
numbers_attr = limited_bob.named_attribute("numbers")
105+
assert np.array_equal(numbers_attr, np.arange(len(test_strings)))
106+
107+
# Verify the string column was skipped (should raise an AttributeError)
108+
with pytest.raises(AttributeError):
109+
limited_bob.named_attribute("strings")

0 commit comments

Comments
 (0)