
Commit 57eb959

chore: fix typos (#844)

- run [codespell](https://github.com/codespell-project/codespell) on the source code
- change name of parameter in db-benchmark.dockerfile based on spelling suggestion and the documentation: https://www.rdocumentation.org/packages/utils/versions/3.6.2/topics/install.packages

1 parent 90f5b5b commit 57eb959

23 files changed: +32 -32 lines changed

benchmarks/db-benchmark/db-benchmark.dockerfile
Lines changed: 1 addition & 1 deletion

@@ -58,7 +58,7 @@ RUN cd pandas && \
 RUN cd modin && \
     virtualenv py-modin --python=/usr/bin/python3.10

-RUN Rscript -e 'install.packages(c("jsonlite","bit64","devtools","rmarkdown"), dependecies=TRUE, repos="https://cloud.r-project.org")'
+RUN Rscript -e 'install.packages(c("jsonlite","bit64","devtools","rmarkdown"), dependencies=TRUE, repos="https://cloud.r-project.org")'

 SHELL ["/bin/bash", "-c"]

docs/mdbook/src/index.md
Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@

 DataFusion is a blazing fast query engine that lets you run data analyses quickly and reliably.

-DataFusion is written in Rust, but also exposes Python and SQL bindings, so you can easily query data in your langauge of choice. You don't need to know any Rust to be a happy and productive user of DataFusion.
+DataFusion is written in Rust, but also exposes Python and SQL bindings, so you can easily query data in your language of choice. You don't need to know any Rust to be a happy and productive user of DataFusion.

 DataFusion lets you run queries faster than pandas. Let's compare query runtimes for a 5GB CSV file with 100 million rows of data.

docs/source/_static/theme_overrides.css
Lines changed: 1 addition & 1 deletion

@@ -56,7 +56,7 @@ a.navbar-brand img {


 /* This is the bootstrap CSS style for "table-striped". Since the theme does
-   not yet provide an easy way to configure this globaly, it easier to simply
+   not yet provide an easy way to configure this globally, it easier to simply
    include this snippet here than updating each table in all rst files to
    add ":class: table-striped" */

docs/source/conf.py
Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.

-"""Documenation generation."""
+"""Documentation generation."""

 # Configuration file for the Sphinx documentation builder.
 #

docs/source/user-guide/common-operations/expressions.rst
Lines changed: 1 addition & 1 deletion

@@ -21,7 +21,7 @@ Expressions
 ===========

 In DataFusion an expression is an abstraction that represents a computation.
-Expressions are used as the primary inputs and ouputs for most functions within
+Expressions are used as the primary inputs and outputs for most functions within
 DataFusion. As such, expressions can be combined to create expression trees, a
 concept shared across most compilers and databases.

examples/export.py
Lines changed: 1 addition & 1 deletion

@@ -48,6 +48,6 @@
 pylist = df.to_pylist()
 assert pylist == [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}]

-# export to Pyton dictionary of columns
+# export to Python dictionary of columns
 pydict = df.to_pydict()
 assert pydict == {"a": [1, 2, 3], "b": [4, 5, 6]}
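The hunk above converts a DataFrame to rows (`to_pylist`) and to columns (`to_pydict`). The two layouts are mechanical transposes of one another; a minimal pure-Python sketch of that relationship, using the same sample data as the hunk:

```python
# Row-oriented data, as to_pylist would return it.
rows = [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}]

# Transpose to column-oriented data, as to_pydict would return it.
cols = {key: [row[key] for row in rows] for key in rows[0]}

print(cols)  # {'a': [1, 2, 3], 'b': [4, 5, 6]}
```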

examples/python-udf-comparisons.py
Lines changed: 1 addition & 1 deletion

@@ -28,7 +28,7 @@
 # question "return all of the rows that have a specific combination of these
 # values". We have the combinations we care about provided as a python
 # list of tuples. There is no built in function that supports this operation,
-# but it can be explicilty specified via a single expression or we can
+# but it can be explicitly specified via a single expression or we can
 # use a user defined function.

 ctx = SessionContext()

examples/tpch/q02_minimum_cost_supplier.py
Lines changed: 1 addition & 1 deletion

@@ -96,7 +96,7 @@
 # create a column of that value. We can then filter down any rows for which the cost and
 # minimum do not match.

-# The default window frame as of 5/6/2024 is from unbounded preceeding to the current row.
+# The default window frame as of 5/6/2024 is from unbounded preceding to the current row.
 # We want to evaluate the entire data frame, so we specify this.
 window_frame = datafusion.WindowFrame("rows", None, None)
 df = df.with_column(
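The window-frame distinction in the hunk above can be illustrated without DataFusion at all. A pure-Python sketch (with made-up cost values) of the default frame (unbounded preceding to current row) versus the full frame that `WindowFrame("rows", None, None)` requests:

```python
costs = [7.0, 3.0, 5.0, 2.0, 9.0]

# Default frame: row i sees rows 0..i, so min() is a running minimum.
running_min = [min(costs[: i + 1]) for i in range(len(costs))]

# Full frame over the whole partition: every row sees the global minimum.
full_min = [min(costs)] * len(costs)

print(running_min)  # [7.0, 3.0, 3.0, 2.0, 2.0]
print(full_min)     # [2.0, 2.0, 2.0, 2.0, 2.0]
```

Filtering on "cost equals minimum" only behaves as the query intends with the full frame, which is why the example specifies it explicitly.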

examples/tpch/q04_order_priority_checking.py
Lines changed: 2 additions & 2 deletions

@@ -53,9 +53,9 @@

 # Limit results to cases where commitment date before receipt date
 # Aggregate the results so we only get one row to join with the order table.
-# Alterately, and likely more idomatic is instead of `.aggregate` you could
+# Alternately, and likely more idiomatic is instead of `.aggregate` you could
 # do `.select_columns("l_orderkey").distinct()`. The goal here is to show
-# mulitple examples of how to use Data Fusion.
+# multiple examples of how to use Data Fusion.
 df_lineitem = df_lineitem.filter(col("l_commitdate") < col("l_receiptdate")).aggregate(
     [col("l_orderkey")], []
 )
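The hunk above notes that aggregating on a key with no aggregate expressions and selecting the column then calling distinct produce the same deduplicated keys. A pure-Python sketch of that equivalence (sample order keys are made up):

```python
l_orderkeys = [1, 2, 2, 3, 3, 3]

# "aggregate by key, no aggregates": one entry per distinct key
via_aggregate = sorted({k for k in l_orderkeys})

# "select the column, then distinct": same result
via_distinct = sorted(set(l_orderkeys))

print(via_aggregate)  # [1, 2, 3]
```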

examples/tpch/q06_forecasting_revenue_change.py
Lines changed: 1 addition & 1 deletion

@@ -82,5 +82,5 @@

 revenue = df.collect()[0]["revenue"][0].as_py()

-# Note: the output value from this query may be dependant on the size of the database generated
+# Note: the output value from this query may be dependent on the size of the database generated
 print(f"Potential lost revenue: {revenue:.2f}")

examples/tpch/q07_volume_shipping.py
Lines changed: 1 addition & 1 deletion

@@ -77,7 +77,7 @@
 # the two nations of interest. Since there is no `otherwise()` statement, any values that do
 # not match these will result in a null value and then get filtered out.
 #
-# To do the same using a simle filter would be:
+# To do the same using a simple filter would be:
 # df_nation = df_nation.filter((F.col("n_name") == nation_1) | (F.col("n_name") == nation_2))
 df_nation = df_nation.with_column(
     "n_name",

examples/tpch/q11_important_stock_identification.py
Lines changed: 1 addition & 1 deletion

@@ -63,7 +63,7 @@
 # Compute total value of specific parts
 df = df.aggregate([col("ps_partkey")], [F.sum(col("value")).alias("value")])

-# By default window functions go from unbounded preceeding to current row, but we want
+# By default window functions go from unbounded preceding to current row, but we want
 # to compute this sum across all rows
 window_frame = WindowFrame("rows", None, None)

examples/tpch/q15_top_supplier.py
Lines changed: 1 addition & 1 deletion

@@ -78,7 +78,7 @@
 # from the supplier table
 df = df.join(df_supplier, (["l_suppkey"], ["s_suppkey"]), "inner")

-# Return only the colums requested
+# Return only the columns requested
 df = df.select_columns("s_suppkey", "s_name", "s_address", "s_phone", "total_revenue")

 # If we have more than one, sort by supplier number (suppkey)

examples/tpch/q20_potential_part_promotion.py
Lines changed: 1 addition & 1 deletion

@@ -74,7 +74,7 @@
 # This will filter down the line items to the parts of interest
 df = df.join(df_part, (["l_partkey"], ["p_partkey"]), "inner")

-# Compute the total sold and limit ourselves to indivdual supplier/part combinations
+# Compute the total sold and limit ourselves to individual supplier/part combinations
 df = df.aggregate(
     [col("l_partkey"), col("l_suppkey")], [F.sum(col("l_quantity")).alias("total_sold")]
 )

examples/tpch/q21_suppliers_kept_orders_waiting.py
Lines changed: 1 addition & 1 deletion

@@ -74,7 +74,7 @@
 # only orders where this array is larger than one for multiple supplier orders. The second column
 # is all of the suppliers who failed to make their commitment. We can filter the second column for
 # arrays with size one. That combination will give us orders that had multiple suppliers where only
-# one failed. Use distinct=True in the blow aggregation so we don't get multipe line items from the
+# one failed. Use distinct=True in the blow aggregation so we don't get multiple line items from the
 # same supplier reported in either array.
 df = df.aggregate(
     [col("o_orderkey")],

examples/tpch/q22_global_sales_opportunity.py
Lines changed: 2 additions & 2 deletions

@@ -45,14 +45,14 @@
 # The nation code is a two digit number, but we need to convert it to a string literal
 nation_codes = F.make_array(*[lit(str(n)) for n in NATION_CODES])

-# Use the substring operation to extract the first two charaters of the phone number
+# Use the substring operation to extract the first two characters of the phone number
 df = df_customer.with_column("cntrycode", F.substring(col("c_phone"), lit(0), lit(3)))

 # Limit our search to customers with some balance and in the country code above
 df = df.filter(col("c_acctbal") > lit(0.0))
 df = df.filter(~F.array_position(nation_codes, col("cntrycode")).is_null())

-# Compute the average balance. By default, the window frame is from unbounded preceeding to the
+# Compute the average balance. By default, the window frame is from unbounded preceding to the
 # current row. We want our frame to cover the entire data frame.
 window_frame = WindowFrame("rows", None, None)
 df = df.with_column(
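The extraction-and-filter steps in the hunk above reduce to plain string slicing and membership tests. A pure-Python sketch with made-up phone numbers and balances:

```python
NATION_CODES = ["13", "31"]

customers = [
    {"c_phone": "13-702-555-0101", "c_acctbal": 120.0},
    {"c_phone": "27-918-555-0102", "c_acctbal": 300.0},
    {"c_phone": "31-101-555-0103", "c_acctbal": -5.0},
]

# Equivalent of the substring step: take the first two characters.
for c in customers:
    c["cntrycode"] = c["c_phone"][:2]

# Keep customers with a positive balance whose code is in the list.
selected = [
    c for c in customers
    if c["c_acctbal"] > 0.0 and c["cntrycode"] in NATION_CODES
]

print([c["cntrycode"] for c in selected])  # ['13']
```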

python/datafusion/context.py
Lines changed: 2 additions & 2 deletions

@@ -436,7 +436,7 @@ def __init__(

         Example usage:

-        The following example demostrates how to use the context to execute
+        The following example demonstrates how to use the context to execute
         a query against a CSV data source using the :py:class:`DataFrame` API::

             from datafusion import SessionContext
@@ -853,7 +853,7 @@ def empty_table(self) -> DataFrame:
         return DataFrame(self.ctx.empty_table())

     def session_id(self) -> str:
-        """Retrun an id that uniquely identifies this :py:class:`SessionContext`."""
+        """Return an id that uniquely identifies this :py:class:`SessionContext`."""
         return self.ctx.session_id()

     def read_json(

python/datafusion/expr.py
Lines changed: 1 addition & 1 deletion

@@ -515,7 +515,7 @@ def __init__(

         Args:
             units: Should be one of ``rows``, ``range``, or ``groups``.
-            start_bound: Sets the preceeding bound. Must be >= 0. If none, this
+            start_bound: Sets the preceding bound. Must be >= 0. If none, this
                 will be set to unbounded. If unit type is ``groups``, this
                 parameter must be set.
             end_bound: Sets the following bound. Must be >= 0. If none, this

python/datafusion/functions.py
Lines changed: 5 additions & 5 deletions

@@ -342,7 +342,7 @@ def concat(*args: Expr) -> Expr:
 def concat_ws(separator: str, *args: Expr) -> Expr:
     """Concatenates the list ``args`` with the separator.

-    ``NULL`` arugments are ignored. ``separator`` should not be ``NULL``.
+    ``NULL`` arguments are ignored. ``separator`` should not be ``NULL``.
     """
     args = [arg.expr for arg in args]
     return Expr(f.concat_ws(separator, args))
@@ -541,7 +541,7 @@ def ends_with(arg: Expr, suffix: Expr) -> Expr:


 def exp(arg: Expr) -> Expr:
-    """Returns the exponential of the arugment."""
+    """Returns the exponential of the argument."""
     return Expr(f.exp(arg.expr))


@@ -1593,7 +1593,7 @@ def grouping(arg: Expr, distinct: bool = False) -> Expr:


 def max(arg: Expr, distinct: bool = False) -> Expr:
-    """Returns the maximum value of the arugment."""
+    """Returns the maximum value of the argument."""
     return Expr(f.max(arg.expr, distinct=distinct))


@@ -1769,12 +1769,12 @@ def bit_xor(arg: Expr, distinct: bool = False) -> Expr:


 def bool_and(arg: Expr, distinct: bool = False) -> Expr:
-    """Computes the boolean AND of the arugment."""
+    """Computes the boolean AND of the argument."""
     return Expr(f.bool_and(arg.expr, distinct=distinct))


 def bool_or(arg: Expr, distinct: bool = False) -> Expr:
-    """Computes the boolean OR of the arguement."""
+    """Computes the boolean OR of the argument."""
     return Expr(f.bool_or(arg.expr, distinct=distinct))

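The `concat_ws` docstring corrected in the first hunk above documents a specific semantic: ``NULL`` arguments are skipped rather than nulling the result. A hypothetical pure-Python model of that behavior (not the library's implementation, which delegates to the Rust engine):

```python
def concat_ws_model(separator, *args):
    """Join non-None arguments with separator, mimicking SQL concat_ws."""
    return separator.join(str(a) for a in args if a is not None)

print(concat_ws_model("-", "a", None, "b"))  # a-b
```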

python/datafusion/input/location.py
Lines changed: 2 additions & 2 deletions

@@ -66,7 +66,7 @@ def build_table(
             # Consume header row and count number of rows for statistics.
             # TODO: Possibly makes sense to have the eager number of rows
             # calculated as a configuration since you must read the entire file
-            # to get that information. However, this should only be occuring
+            # to get that information. However, this should only be occurring
             # at table creation time and therefore shouldn't
             # slow down query performance.
             with open(input_file, "r") as file:
@@ -75,7 +75,7 @@ def build_table(
                 print(header_row)
                 for _ in reader:
                     num_rows += 1
-            # TODO: Need to actually consume this row into resonable columns
+            # TODO: Need to actually consume this row into reasonable columns
             raise RuntimeError("TODO: Currently unable to support CSV input files.")
         else:
             raise RuntimeError(

python/datafusion/udf.py
Lines changed: 2 additions & 2 deletions

@@ -153,7 +153,7 @@ def state(self) -> List[pyarrow.Scalar]:

     @abstractmethod
     def update(self, values: pyarrow.Array) -> None:
-        """Evalute an array of values and update state."""
+        """Evaluate an array of values and update state."""
         pass

     @abstractmethod
@@ -189,7 +189,7 @@ def __init__(
     ) -> None:
         """Instantiate a user defined aggregate function (UDAF).

-        See :py:func:`udaf` for a convenience function and arugment
+        See :py:func:`udaf` for a convenience function and argument
         descriptions.
         """
         self._udf = df_internal.AggregateUDF(

src/common/data_type.rs
Lines changed: 1 addition & 1 deletion

@@ -40,7 +40,7 @@ pub enum RexType {
 /// Arrow types which represents the underlying arrow format
 /// Python types which represent the type in Python
 /// It is important to keep all of those types in a single
-/// and managable location. Therefore this structure exists
+/// and manageable location. Therefore this structure exists
 /// to map those types and provide a simple place for developers
 /// to map types from one system to another.
 #[derive(Debug, Clone)]

src/expr/table_scan.rs
Lines changed: 1 addition & 1 deletion

@@ -94,7 +94,7 @@ impl PyTableScan {

     /// The column indexes that should be. Note if this is empty then
     /// all columns should be read by the `TableProvider`. This function
-    /// provides a Tuple of the (index, column_name) to make things simplier
+    /// provides a Tuple of the (index, column_name) to make things simpler
     /// for the calling code since often times the name is preferred to
     /// the index which is a lower level abstraction.
     #[pyo3(name = "projection")]
