Skip to content

Commit 09cf0b6

Browse files
authored
Merge pull request #2964 from mabel-dev/#2963
Iceberg Fixes #2963, #2962, #2961
2 parents b8c903d + f1f131c commit 09cf0b6

File tree

8 files changed

+658
-15
lines changed

8 files changed

+658
-15
lines changed

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
22
# DO NOT EDIT THIS FILE DIRECTLY
33

4-
__build__ = 1965
4+
__build__ = 1966
55
__author__ = "@joocer"
6-
__version__ = "0.26.2-beta.1965"
6+
__version__ = "0.26.2-beta.1966"
77

88
# Store the version here so:
99
# 1) we don't load dependencies by storing it in __init__.py

opteryx/compiled/joins/inner_join.pyx

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ from libc.stddef cimport size_t
1515
from libcpp.vector cimport vector
1616

1717
from time import perf_counter_ns
18+
cimport cython
1819

1920
from opteryx.third_party.abseil.containers cimport (
2021
FlatHashMap,
@@ -75,15 +76,16 @@ cpdef tuple inner_join(object right_relation, list join_columns, FlatHashMap lef
7576
last_hash_time_ns = t_after_hash - t_start
7677

7778
with nogil:
78-
inner_join_probe(
79-
&left_hash_table._map,
80-
&non_null_indices[0],
81-
<size_t>candidate_count,
82-
&row_hashes[0],
83-
<size_t>num_rows,
84-
left_indexes.c_buffer,
85-
right_indexes.c_buffer,
86-
)
79+
with cython.boundscheck(False):
80+
inner_join_probe(
81+
&left_hash_table._map,
82+
&non_null_indices[0],
83+
<size_t>candidate_count,
84+
&row_hashes[0],
85+
<size_t>num_rows,
86+
left_indexes.c_buffer,
87+
right_indexes.c_buffer,
88+
)
8789
cdef long long t_after_probe = perf_counter_ns()
8890
last_probe_time_ns = t_after_probe - t_after_hash
8991
last_rows_hashed = num_rows

opteryx/connectors/capabilities/statistics.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,18 @@ def prune_blobs(self, blob_names: list[str], query_statistics, selection) -> lis
6868
and cond.right.schema_column.type
6969
not in (OrsoTypes.DATE, OrsoTypes.TIME, OrsoTypes.TIMESTAMP)
7070
]
71+
# Handle AnyOp* comparisons where left is a literal and right is an identifier
72+
valid_conditions_any = [
73+
cond
74+
for cond in selection
75+
if cond.value == "AnyOpEq"
76+
and cond.left.node_type == NodeType.LITERAL
77+
and cond.right.node_type == NodeType.IDENTIFIER
78+
and cond.right.schema_column.type
79+
not in (OrsoTypes.DATE, OrsoTypes.TIME, OrsoTypes.TIMESTAMP)
80+
and cond.left.schema_column.type
81+
not in (OrsoTypes.DATE, OrsoTypes.TIME, OrsoTypes.TIMESTAMP)
82+
]
7183

7284
for condition in valid_conditions:
7385
column_name = condition.left.source_column.encode()
@@ -87,6 +99,35 @@ def prune_blobs(self, blob_names: list[str], query_statistics, selection) -> lis
8799
skip_blob = True
88100
break
89101

102+
# Evaluate AnyOp* conditions (literal = ANY(column)) safely using element min/max
103+
for condition in valid_conditions_any:
104+
column_name = condition.right.source_column.encode()
105+
literal_value = condition.left.value
106+
# Skip NULL literals — unsafe to prune
107+
if literal_value is None:
108+
continue
109+
if type(literal_value) is numpy.datetime64:
110+
# convert to python datetime for consistent to_int conversion
111+
literal_value = literal_value.astype("M8[ms]").astype("O")
112+
if hasattr(literal_value, "item"):
113+
literal_value = literal_value.item()
114+
literal_value = to_int(literal_value)
115+
# Skip NULL_FLAG values (NaN/unconvertible) that appear as the NULL sentinel
116+
NULL_FLAG = -(1 << 63)
117+
if literal_value == NULL_FLAG:
118+
continue
119+
max_value = cached_stats.upper_bounds.get(column_name)
120+
min_value = cached_stats.lower_bounds.get(column_name)
121+
122+
if max_value is not None and min_value is not None:
123+
# convert AnyOpEq -> Eq, AnyOpGt -> Gt, etc.
124+
op_name = condition.value[5:]
125+
prune = handlers.get(op_name)
126+
if prune and prune(literal_value, min_value, max_value):
127+
query_statistics.blobs_pruned += 1
128+
skip_blob = True
129+
break
130+
90131
if not skip_blob:
91132
new_blob_names.append(blob_name)
92133

opteryx/connectors/iceberg_connector.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,14 @@ def read_dataset(
276276
)
277277
)
278278

279+
# Short-cut COUNT(*) handling
280+
if selected_columns == []:
281+
table = pyarrow.Table.from_arrays(
282+
[[self.relation_statistics.record_count]], names=["$COUNT(*)"]
283+
)
284+
yield table
285+
return
286+
279287
reader = self.table.scan(
280288
row_filter=pushed_filters,
281289
selected_fields=selected_columns,
@@ -356,7 +364,7 @@ def decode_iceberg_value(
356364
elif data_type_class == pyiceberg.types.DoubleType:
357365
# IEEE 754 encoded floats are typically decoded directly
358366
return struct.unpack("<d", value)[0] # 8-byte IEEE 754 double
359-
elif data_type_class == pyiceberg.types.TimestampType:
367+
elif data_type_class in (pyiceberg.types.TimestampType, pyiceberg.types.TimestamptzType):
360368
# Iceberg stores timestamps as microseconds since epoch
361369
interval = int.from_bytes(value, "little", signed=True)
362370
if interval < 0:
@@ -378,5 +386,5 @@ def decode_iceberg_value(
378386
return Decimal(int_value) / (10**data_type.scale)
379387
elif data_type_class == pyiceberg.types.BooleanType:
380388
return bool(value)
381-
else:
382-
raise ValueError(f"Unsupported data type: {data_type}, {str(data_type)}")
389+
390+
ValueError(f"Unsupported data type: {data_type}, {str(data_type)}")

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "opteryx"
3-
version = "0.26.2-beta.1965"
3+
version = "0.26.2-beta.1966"
44
description = "Query your data, where it lives"
55
requires-python = '>=3.11'
66
readme = {file = "README.md", content-type = "text/markdown"}

tests/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,15 @@ def _ensure_two_snapshots(identifier):
794794
with contextlib.suppress(NamespaceAlreadyExistsError):
795795
catalog.create_namespace("opteryx")
796796

797+
_epoch_schema = pyarrow.schema([
798+
pyarrow.field("epoch", pyarrow.timestamp("ms", tz="UTC"))
799+
])
800+
_epoch_table = pyarrow.Table.from_arrays(
801+
[pyarrow.array([datetime.datetime.now(datetime.timezone.utc)], type=_epoch_schema.field("epoch").type)],
802+
schema=_epoch_schema
803+
)
804+
catalog.create_table("opteryx.epoch", schema=_epoch_schema).append(_epoch_table)
805+
797806
data = opteryx.query_to_arrow("SELECT tweet_id, text, timestamp, user_id, user_verified, user_name, hash_tags, followers, following, tweets_by_user, is_quoting, is_reply_to, is_retweeting FROM testdata.flat.formats.parquet")
798807
table = catalog.create_table("opteryx.tweets", schema=data.schema)
799808
table.append(data.slice(0, 50000))
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import pyarrow as pa

from opteryx.compiled.joins.join_definitions import (
    build_side_hash_map,
)
from opteryx.compiled.table_ops.hash_ops import compute_hashes
from opteryx.compiled.table_ops.null_avoidant_ops import non_null_indices


def test_build_side_hash_map_basic_values():
    """
    Verify that build_side_hash_map produces a FlatHashMap containing a mapping for
    every non-null row hash computed from a small VALUES table.
    """
    table = pa.table({"x": [1, 2, 3]})

    # Build the Cython-side hash map
    ht = build_side_hash_map(table, ["x"])

    # Compute the row hashes and non-null indices via the compiled helper
    row_hashes = compute_hashes(table, ["x"])  # returns array.array('Q')
    non_nulls = non_null_indices(table, ["x"])  # typed memoryview

    # For every non-null row, ensure the hash map returns a non-empty list
    # containing that row's index — i.e. the build side indexed every row.
    for i in range(non_nulls.shape[0]):
        row_idx = int(non_nulls[i])
        key = int(row_hashes[row_idx])
        found = ht.get(key)
        assert found is not None and len(found) >= 1, f"Missing mapping for hash {key} (row {row_idx})"
        assert row_idx in found, f"Row idx {row_idx} not present in mapping for hash {key}: {found}"
33+

0 commit comments

Comments
 (0)