-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathconvertCsvToParquet
More file actions
34 lines (25 loc) · 1.25 KB
/
convertCsvToParquet
File metadata and controls
34 lines (25 loc) · 1.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
def convert_csv_to_parquet(input_file_path, output_file_path, drop_option):
    """Convert a CSV file to Parquet, optionally dropping NaN rows or columns.

    The CSV is loaded with pandas, filtered according to ``drop_option``,
    written to ``output_file_path`` through PyArrow, and then read back from
    disk so that the first 100 rows of what was actually stored are printed.

    Args:
        input_file_path: Path of the CSV file to read.
        output_file_path: Path of the Parquet file to write.
        drop_option: ``'row'`` drops every row containing a NaN;
            ``'column'`` drops every column containing a NaN. ``None``
            leaves the data unfiltered. Any other value also leaves the
            data unfiltered, but a warning is emitted so that a typo such
            as ``'rows'`` does not go unnoticed.

    Returns:
        None. The result is the Parquet file on disk plus the printed
        preview.
    """
    import warnings

    # Read CSV file into a Pandas DataFrame
    df = pd.read_csv(input_file_path)

    # Remove rows or columns with NaN fields based on the drop_option argument
    if drop_option == 'row':
        df = df.dropna()
    elif drop_option == 'column':
        df = df.dropna(axis=1)
    elif drop_option is not None:
        # Keep the historical "no filtering" behaviour instead of raising,
        # but make the ignored option visible to the caller.
        warnings.warn(
            "Unrecognized drop_option {!r}; expected 'row' or 'column'. "
            "No NaN filtering was applied.".format(drop_option),
            stacklevel=2,
        )

    # Convert Pandas DataFrame to PyArrow Table
    # NOTE(review): once rows have been dropped the index is presumably no
    # longer a plain RangeIndex, in which case from_pandas stores it as an
    # extra column in the file - confirm whether preserve_index=False is
    # wanted here.
    table = pa.Table.from_pandas(df)

    # Write PyArrow Table to Parquet file
    pq.write_table(table, output_file_path)

    # Re-open the file that was just written so the preview reflects the
    # data as stored on disk rather than the in-memory DataFrame.
    table = pq.read_table(output_file_path)

    # Convert the table to a Pandas DataFrame
    df = table.to_pandas()

    # Print the first 100 rows as a preview
    print(df.head(100))
# --- Script configuration -------------------------------------------------
# How NaN values are handled before writing: 'row' removes rows that contain
# a NaN, 'column' removes columns that contain a NaN.
drop_option = 'column'

# Source CSV and destination Parquet file (same folder, same base name).
input_file_path = "C:/Users/Idelson Mindo/Documents/GitHub/codema/dublin-energy-app/energytool/dublin-energy-app/data/dublin_census_2016_filled_with_ber_public_14_05_2021.csv"
output_file_path = "C:/Users/Idelson Mindo/Documents/GitHub/codema/dublin-energy-app/energytool/dublin-energy-app/data/dublin_census_2016_filled_with_ber_public_14_05_2021.parquet"

# Run the conversion with the settings above.
convert_csv_to_parquet(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
    drop_option=drop_option,
)