apache · andygrove · Jul 18, 2024 · Jul 9, 2024 · Jul 9, 2024 · Jul 9, 2024
diff --git a/examples/substrait.py b/examples/substrait.py
@@ -18,16 +18,15 @@
 from datafusion import SessionContext
 from datafusion import substrait as ss
 
+# TODO: add a note to the PR about this user-facing interface change: datafusion.substrait.substrait is simplified to datafusion.substrait
 
 # Create a DataFusion context
 ctx = SessionContext()
 
 # Register table with context
 ctx.register_csv("aggregate_test_data", "./testing/data/csv/aggregate_test_100.csv")
 
-substrait_plan = ss.substrait.serde.serialize_to_plan(
-    "SELECT * FROM aggregate_test_data", ctx
-)
+substrait_plan = ss.serde.serialize_to_plan("SELECT * FROM aggregate_test_data", ctx)
 # type(substrait_plan) -> <class 'datafusion.substrait.plan'>
 
 # Encode it to bytes
@@ -38,17 +37,15 @@
 # Alternative serialization approaches
 # type(substrait_bytes) -> <class 'bytes'>, at this point the bytes can be distributed to file, network, etc safely
 # where they could subsequently be deserialized on the receiving end.
-substrait_bytes = ss.substrait.serde.serialize_bytes(
-    "SELECT * FROM aggregate_test_data", ctx
-)
+substrait_bytes = ss.serde.serialize_bytes("SELECT * FROM aggregate_test_data", ctx)
 
 # Imagine here bytes would be read from network, file, etc ... for example brevity this is omitted and variable is simply reused
 # type(substrait_plan) -> <class 'datafusion.substrait.plan'>
-substrait_plan = ss.substrait.serde.deserialize_bytes(substrait_bytes)
+substrait_plan = ss.serde.deserialize_bytes(substrait_bytes)
 
 # type(df_logical_plan) -> <class 'substrait.LogicalPlan'>
-df_logical_plan = ss.substrait.consumer.from_substrait_plan(ctx, substrait_plan)
+df_logical_plan = ss.consumer.from_substrait_plan(ctx, substrait_plan)
 
 # Back to Substrait Plan just for demonstration purposes
 # type(substrait_plan) -> <class 'datafusion.substrait.plan'>
-substrait_plan = ss.substrait.producer.to_substrait_plan(df_logical_plan)
+substrait_plan = ss.producer.to_substrait_plan(df_logical_plan)
diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py
@@ -25,64 +25,67 @@
 
 import pyarrow as pa
 
-from ._internal import (
-    AggregateUDF,
-    Config,
-    DataFrame,
+from .context import (
     SessionContext,
     SessionConfig,
     RuntimeConfig,
-    ScalarUDF,
     SQLOptions,
 )
 
+# The following imports are fine to leave opaque to the user.
+from ._internal import Config
+
+from .udf import ScalarUDF, AggregateUDF
+
 from .common import (
     DFSchema,
 )
 
+from .dataframe import DataFrame
+
 from .expr import (
-    Alias,
-    Analyze,
+    #     Alias,
+    #     Analyze,
     Expr,
-    Filter,
-    Limit,
-    Like,
-    ILike,
-    Projection,
-    SimilarTo,
-    ScalarVariable,
-    Sort,
-    TableScan,
-    Not,
-    IsNotNull,
-    IsTrue,
-    IsFalse,
-    IsUnknown,
-    IsNotTrue,
-    IsNotFalse,
-    IsNotUnknown,
-    Negative,
-    InList,
-    Exists,
-    Subquery,
-    InSubquery,
-    ScalarSubquery,
-    GroupingSet,
-    Placeholder,
-    Case,
-    Cast,
-    TryCast,
-    Between,
-    Explain,
-    CreateMemoryTable,
-    SubqueryAlias,
-    Extension,
-    CreateView,
-    Distinct,
-    DropTable,
-    Repartition,
-    Partitioning,
-    Window,
+    #     Filter,
+    #     Limit,
+    #     Like,
+    #     ILike,
+    #     Projection,
+    #     SimilarTo,
+    #     ScalarVariable,
+    #     Sort,
+    #     TableScan,
+    #     Not,
+    #     IsNotNull,
+    #     IsTrue,
+    #     IsFalse,
+    #     IsUnknown,
+    #     IsNotTrue,
+    #     IsNotFalse,
+    #     IsNotUnknown,
+    #     Negative,
+    #     InList,
+    #     Exists,
+    #     Subquery,
+    #     InSubquery,
+    #     ScalarSubquery,
+    #     GroupingSet,
+    #     Placeholder,
+    #     Case,
+    #     Cast,
+    #     TryCast,
+    #     Between,
+    #     Explain,
+    #     CreateMemoryTable,
+    #     SubqueryAlias,
+    #     Extension,
+    #     CreateView,
+    #     Distinct,
+    #     DropTable,
+    #     Repartition,
+    #     Partitioning,
+    #     Window,
     WindowFrame,
 )
 
@@ -96,56 +99,55 @@
     "SQLOptions",
     "RuntimeConfig",
     "Expr",
-    "AggregateUDF",
     "ScalarUDF",
-    "Window",
+    # "Window",
     "WindowFrame",
     "column",
     "literal",
-    "TableScan",
-    "Projection",
+    # "TableScan",
+    # "Projection",
     "DFSchema",
-    "DFField",
-    "Analyze",
-    "Sort",
-    "Limit",
-    "Filter",
-    "Like",
-    "ILike",
-    "SimilarTo",
-    "ScalarVariable",
-    "Alias",
-    "Not",
-    "IsNotNull",
-    "IsTrue",
-    "IsFalse",
-    "IsUnknown",
-    "IsNotTrue",
-    "IsNotFalse",
-    "IsNotUnknown",
-    "Negative",
-    "ScalarFunction",
-    "BuiltinScalarFunction",
-    "InList",
-    "Exists",
-    "Subquery",
-    "InSubquery",
-    "ScalarSubquery",
-    "GroupingSet",
-    "Placeholder",
-    "Case",
-    "Cast",
-    "TryCast",
-    "Between",
-    "Explain",
-    "SubqueryAlias",
-    "Extension",
-    "CreateMemoryTable",
-    "CreateView",
-    "Distinct",
-    "DropTable",
-    "Repartition",
-    "Partitioning",
+    # "DFField",
+    # "Analyze",
+    # "Sort",
+    # "Limit",
+    # "Filter",
+    # "Like",
+    # "ILike",
+    # "SimilarTo",
+    # "ScalarVariable",
+    # "Alias",
+    # "Not",
+    # "IsNotNull",
+    # "IsTrue",
+    # "IsFalse",
+    # "IsUnknown",
+    # "IsNotTrue",
+    # "IsNotFalse",
+    # "IsNotUnknown",
+    # "Negative",
+    # "ScalarFunction",
+    # "BuiltinScalarFunction",
+    # "InList",
+    # "Exists",
+    # "Subquery",
+    # "InSubquery",
+    # "ScalarSubquery",
+    # "GroupingSet",
+    # "Placeholder",
+    # "Case",
+    # "Cast",
+    # "TryCast",
+    # "Between",
+    # "Explain",
+    # "SubqueryAlias",
+    # "Extension",
+    # "CreateMemoryTable",
+    # "CreateView",
+    # "Distinct",
+    # "DropTable",
+    # "Repartition",
+    # "Partitioning",
 ]
 
 
@@ -175,8 +177,6 @@ def column(value):
 
 
 def literal(value):
-    if not isinstance(value, pa.Scalar):
-        value = pa.scalar(value)
+    """Return ``Expr.literal(value)``, forwarding ``value`` unchanged."""
     return Expr.literal(value)
 
 
@@ -200,20 +200,20 @@ def udf(func, input_types, return_type, volatility, name=None):
     )
 
 
-def udaf(accum, input_type, return_type, state_type, volatility, name=None):
+def udaf(accum, input_types, return_type, state_type, volatility, name=None):
     """
     Create a new User Defined Aggregate Function
     """
     if not issubclass(accum, Accumulator):
         raise TypeError("`accum` must implement the abstract base class Accumulator")
     if name is None:
         name = accum.__qualname__.lower()
-    if isinstance(input_type, pa.lib.DataType):
-        input_type = [input_type]
+    if isinstance(input_types, pa.lib.DataType):
+        input_types = [input_types]
     return AggregateUDF(
         name=name,
         accumulator=accum,
-        input_type=input_type,
+        input_types=input_types,
         return_type=return_type,
         state_type=state_type,
         volatility=volatility,

diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py
@@ -0,0 +1,59 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import annotations
+
+import datafusion._internal as df_internal
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import pyarrow
+
+
+class Catalog:
+    """Python-level wrapper that delegates to an internal ``df_internal.Catalog``."""
+
+    def __init__(self, catalog: df_internal.Catalog) -> None:
+        """Keep a reference to the internal catalog object being wrapped."""
+        self.catalog = catalog
+
+    def names(self) -> list[str]:
+        """Return the result of the wrapped catalog's ``names()`` call.
+
+        NOTE(review): presumably these are database names -- confirm against
+        the internal API.
+        """
+        return self.catalog.names()
+
+    def database(self, name: str = "public") -> Database:
+        """Fetch ``name`` (default ``"public"``) from the wrapped catalog as a ``Database``."""
+        # The raw internal object is wrapped so callers get the Python-level class.
+        return Database(self.catalog.database(name))
+
+
+class Database:
+    """Python-level wrapper that delegates to an internal ``df_internal.Database``."""
+
+    def __init__(self, db: df_internal.Database) -> None:
+        """Keep a reference to the internal database object being wrapped."""
+        self.db = db
+
+    def names(self) -> set[str]:
+        """Return the result of the wrapped database's ``names()`` call.
+
+        NOTE(review): presumably these are table names -- confirm against
+        the internal API.
+        """
+        return self.db.names()
+
+    def table(self, name: str) -> Table:
+        """Fetch ``name`` from the wrapped database and return it as a ``Table``."""
+        # The raw internal object is wrapped so callers get the Python-level class.
+        return Table(self.db.table(name))
+
+
+class Table:
+    """Python-level wrapper that delegates to an internal ``df_internal.Table``."""
+
+    def __init__(self, table: df_internal.Table) -> None:
+        """Keep a reference to the internal table object being wrapped."""
+        self.table = table
+
+    def schema(self) -> pyarrow.Schema:
+        """Return the result of the wrapped table's ``schema()`` call."""
+        return self.table.schema()
+
+    @property
+    def kind(self) -> str:
+        """Return the result of the wrapped table's ``kind()`` call.
+
+        Exposed as a property here, although the wrapped object provides
+        it as a method.
+        """
+        return self.table.kind()