@@ -796,10 +796,34 @@ pub async fn fetch_statistics(
796
796
statistics_from_parquet_meta_calc ( & metadata, table_schema)
797
797
}
798
798
799
- /// Convert statistics in [`ParquetMetaData`] into [`Statistics`] using [' StatisticsConverter`]
799
+ /// Convert statistics in [`ParquetMetaData`] into [`Statistics`] using [`StatisticsConverter`]
800
800
///
801
801
/// The statistics are calculated for each column in the table schema
802
802
/// using the row group statistics in the parquet metadata.
803
+ ///
804
+ /// # Key behaviors:
805
+ ///
806
+ /// 1. Extracts row counts and byte sizes from all row groups
807
+ /// 2. Applies schema type coercions to align file schema with table schema
808
+ /// 3. Collects and aggregates statistics across row groups when available
809
+ ///
810
+ /// # When there are no statistics:
811
+ ///
812
+ /// If the Parquet file doesn't contain any statistics (has_statistics is false), the function returns a Statistics object with:
813
+ /// - Exact row count
814
+ /// - Exact byte size
815
+ /// - All column statistics marked as unknown via Statistics::unknown_column(&table_schema)
816
+ /// # When only some columns have statistics:
817
+ ///
818
+ /// For columns with statistics:
819
+ /// - Min/max values are properly extracted and represented as Precision::Exact
820
+ /// - Null counts are calculated by summing across row groups
821
+ ///
822
+ /// For columns without statistics:
823
+ /// - For min/max, there are two situations:
824
+ /// 1. If the column isn't in the Arrow schema, min/max values are set to Precision::Absent
825
+ /// 2. If the column is in the Arrow schema but not in the Parquet schema (due to schema evolution), min/max values are set to Precision::Exact(null)
826
+ /// - Null counts are set to Precision::Exact(num_rows) (conservatively assuming all values could be null)
803
827
pub fn statistics_from_parquet_meta_calc (
804
828
metadata : & ParquetMetaData ,
805
829
table_schema : SchemaRef ,
0 commit comments