Skip to content

Commit 102060a

Browse files
authored
Merge pull request #23 from kolibril13/string_limit
String limit
2 parents a78612b + 6d47d33 commit 102060a

File tree

4 files changed

+157
-13
lines changed

4 files changed

+157
-13
lines changed

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,23 @@ uv run -m pytest
6363

6464
# Changelog
6565

66+
## Version 0.1.8
67+
68+
- For performance reasons, the number of unique strings is limited to 3000 by default (processing this many strings takes about 7 seconds on a Mac M3)
69+
- Note: This limit applies to the number of unique strings, not the total count. For example, a CSV with 100,000 strings but only 100 unique values won't be affected by the limit.
70+
- You can override this limit when using the API directly:
71+
```py
72+
import string
73+
import numpy as np
74+
import polars as pl
75+
from csv_importer.parsers import polars_df_to_bob
76+
77+
n = 4000
78+
79+
random_strings = [''.join(np.random.choice(list(string.ascii_lowercase), size=10)) for _ in range(n)]
80+
df = pl.DataFrame({"strings": random_strings, "numbers": np.arange(n)})
81+
bob = polars_df_to_bob(df, name="TestBob", string_limit=4001)
82+
```
6683

6784
## Version 0.1.7
6885

csv_importer/parsers.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,32 +2,41 @@
22
import polars as pl
33
import numpy as np
44
import bpy
5+
import warnings
56

67

7-
def polars_df_to_bob(df: pl.DataFrame, name: str) -> db.BlenderObject:
8+
def polars_df_to_bob(df: pl.DataFrame, name: str, string_limit: int = 3000) -> db.BlenderObject:
89
vertices = np.zeros((len(df), 3), dtype=np.float32)
910
bob = db.create_bob(vertices, name=name)
1011

11-
update_bob_from_polars_df(bob, df)
12+
update_bob_from_polars_df(bob, df, string_limit=string_limit)
1213
return bob
1314

14-
15-
def update_obj_from_csv(obj: bpy.types.Object, csv_file: str) -> None:
15+
16+
def update_obj_from_csv(obj: bpy.types.Object, csv_file: str, string_limit: int = 3000) -> None:
1617
bob = db.BlenderObject(obj)
1718
df = pl.read_csv(csv_file)
1819
if len(df) != len(bob):
1920
bob.new_from_pydata(np.zeros((len(df), 3), dtype=np.float32))
20-
update_bob_from_polars_df(bob, df)
21+
update_bob_from_polars_df(bob, df, string_limit=string_limit)
2122

2223

23-
def update_bob_from_polars_df(bob: db.BlenderObject, df: pl.DataFrame) -> None:
24+
def update_bob_from_polars_df(bob: db.BlenderObject, df: pl.DataFrame, string_limit: int = 3000) -> None:
2425
for col in df.columns:
2526
col_dtype = df[col].dtype
26-
if col_dtype in [pl.Utf8]: # skip strings
27-
data = np.vstack(df[col].fill_null("").to_numpy())
27+
if col_dtype in [pl.Utf8]: # handle strings
28+
# Convert to numpy array and fill nulls with empty string
29+
data = df[col].fill_null("").to_numpy()
2830
unique, encoding = np.unique(data, return_inverse=True)
29-
bob.store_named_attribute(encoding, col)
30-
db.nodes.custom_string_iswitch("{}: {}".format(bob.name, col), unique, col)
31+
# Only add strings when there are less than the string limit
32+
if len(unique) <= string_limit:
33+
bob.store_named_attribute(encoding, col)
34+
db.nodes.custom_string_iswitch("{}: {}".format(bob.name, col), unique, col)
35+
else:
36+
warning_message = f"Column '{col}' has {len(unique)} unique strings, which exceeds the limit of {string_limit}. This column will be skipped. You can increase the limit with the string_limit parameter."
37+
warnings.warn(warning_message)
38+
self = bpy.context.window_manager
39+
self.popup_menu(lambda self, context: self.layout.label(text=warning_message), title="Warning", icon='ERROR')
3140
else:
3241
data = np.vstack(df[col].to_numpy())
3342
bob.store_named_attribute(data, col)

debugging_notebook.ipynb

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -400,12 +400,80 @@
400400
" bob.store_named_attribute(data, col)"
401401
]
402402
},
403+
{
404+
"cell_type": "markdown",
405+
"metadata": {},
406+
"source": [
407+
"# self-contained: setting string attributes"
408+
]
409+
},
403410
{
404411
"cell_type": "code",
405412
"execution_count": null,
406413
"metadata": {},
407414
"outputs": [],
408-
"source": []
415+
"source": [
416+
"\n",
417+
"import numpy as np\n",
418+
"import polars as pl\n",
419+
"import databpy as db\n",
420+
"\n",
421+
"# Minimal data setup\n",
422+
"df = pl.DataFrame({\n",
423+
" \"strings\": [\"apple\", \"banana\", \"apple\", None, \"orange\", \"banana\", \"kiwi\", \"apple\", \"kiwi\", \"banana\"],\n",
424+
" \"numbers\": np.arange(10),\n",
425+
"})\n",
426+
"\n",
427+
"# random_verts dependent on df length\n",
428+
"random_verts = np.random.rand(len(df), 3)\n",
429+
"bob = db.create_bob(random_verts)\n",
430+
"\n",
431+
"for col in df.columns:\n",
432+
" if df[col].dtype == pl.Utf8:\n",
433+
" data = df[col].fill_null(\"\").to_numpy()\n",
434+
" unique, encoding = np.unique(data, return_inverse=True)\n",
435+
" bob.store_named_attribute(encoding, col)\n",
436+
" db.nodes.custom_string_iswitch(f\"{bob.name}: {col}\", unique, col)\n",
437+
" else:\n",
438+
" bob.store_named_attribute(df[col].to_numpy(), col)"
439+
]
440+
},
441+
{
442+
"cell_type": "code",
443+
"execution_count": null,
444+
"metadata": {},
445+
"outputs": [],
446+
"source": [
447+
"# increased string limit\n",
448+
"import string\n",
449+
"import numpy as np\n",
450+
"import polars as pl\n",
451+
"import databpy as db\n",
452+
"from csv_importer.parsers import update_bob_from_polars_df\n",
453+
"\n",
454+
"n = 4000\n",
455+
"random_strings = [''.join(np.random.choice(list(string.ascii_lowercase), size=10)) for _ in range(n)]\n",
456+
"df = pl.DataFrame({\"strings\": random_strings, \"numbers\": np.arange(n)})\n",
457+
"bob = db.create_bob(np.random.rand(n, 3))\n",
458+
"update_bob_from_polars_df(bob, df, string_limit=4001)"
459+
]
460+
},
461+
{
462+
"cell_type": "code",
463+
"execution_count": null,
464+
"metadata": {},
465+
"outputs": [],
466+
"source": [
467+
"import string\n",
468+
"import numpy as np\n",
469+
"import polars as pl\n",
470+
"from csv_importer.parsers import polars_df_to_bob\n",
471+
"\n",
472+
"n = 1000\n",
473+
"random_strings = [''.join(np.random.choice(list(string.ascii_lowercase), size=10)) for _ in range(n)]\n",
474+
"df = pl.DataFrame({\"strings\": random_strings, \"numbers\": np.arange(n)})\n",
475+
"bob = polars_df_to_bob(df, name=\"TestBob\")\n"
476+
]
409477
}
410478
],
411479
"metadata": {

tests/test_parsers.py

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22
import polars as pl
33
import numpy as np
44
from io import StringIO
5-
from csv_importer.parsers import polars_df_to_bob
6-
5+
from csv_importer.parsers import polars_df_to_bob, update_bob_from_polars_df
6+
import warnings
7+
import string
8+
import bpy
9+
from unittest.mock import patch
710

811
@pytest.fixture
912
def test_df():
@@ -57,3 +60,50 @@ def test_polars_df_to_bob_with_datatypes():
5760
# # Validate that "StringVal" does not exist as a numerical attribute
5861
# with pytest.raises(AttributeError):
5962
# bob.named_attribute("StringVal")
63+
64+
65+
66+
def test_string_limit_functionality():
67+
"""Test string limit functionality without using mocks."""
68+
# Create test data with specific string values and a null
69+
test_strings = ["apple", None, "banana", "apple", "cherry"]
70+
test_data = {
71+
"strings": test_strings,
72+
"numbers": np.arange(len(test_strings))
73+
}
74+
df = pl.DataFrame(test_data)
75+
76+
# Test with normal string limit (should process all strings)
77+
bob = polars_df_to_bob(df, name="TestStringLimit", string_limit=10)
78+
79+
# Verify the numeric column was processed
80+
numbers_attr = bob.named_attribute("numbers")
81+
assert np.array_equal(numbers_attr, np.arange(len(test_strings)))
82+
83+
# Verify the string column was processed and encoded correctly
84+
# The encoding should map: "" (for None) -> 0, "apple" -> 1, "banana" -> 2, "cherry" -> 3
85+
# So the expected encoding is [1, 0, 2, 1, 3]
86+
strings_attr = bob.named_attribute("strings")
87+
expected_encoding = np.array([1, 0, 2, 1, 3])
88+
assert np.array_equal(strings_attr, expected_encoding)
89+
90+
# Test with very low string limit (should skip string column)
91+
# Use a mocked warning instead of popup_menu which causes segfault
92+
with warnings.catch_warnings(record=True) as w:
93+
warnings.simplefilter("always")
94+
95+
# Use patch to avoid the read-only attribute error
96+
with patch.object(bpy.context.window_manager.__class__, 'popup_menu', create=True, new=lambda *args, **kwargs: None):
97+
# Create a new bob with a low string limit
98+
limited_bob = polars_df_to_bob(df, name="LimitedStringTest", string_limit=2)
99+
100+
# Check that a warning was raised
101+
assert any("exceeds the limit" in str(warning.message) for warning in w)
102+
103+
# Verify the numeric column was processed
104+
numbers_attr = limited_bob.named_attribute("numbers")
105+
assert np.array_equal(numbers_attr, np.arange(len(test_strings)))
106+
107+
# Verify the string column was skipped (should raise an AttributeError)
108+
with pytest.raises(AttributeError):
109+
limited_bob.named_attribute("strings")

0 commit comments

Comments
 (0)