dublin-energy-app/convertCsvToParquet at master · codema-dev/dublin-energy-app · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

def convert_csv_to_parquet(input_file_path, output_file_path, drop_option):
    """Convert a CSV file to Parquet, optionally dropping incomplete data.

    The CSV is loaded with pandas, optionally stripped of rows or columns
    that contain NaN, written out as a Parquet file through PyArrow, and
    then read back from disk so a preview (up to the first 100 rows) can be
    printed to stdout.

    Args:
        input_file_path: Location of the CSV file to read.
        output_file_path: Location the Parquet file is written to.
        drop_option: ``'row'`` drops every row containing a NaN;
            ``'column'`` drops every column containing a NaN. Any other
            value leaves the data untouched.

    Returns:
        None. The results are the Parquet file on disk and the printed
        preview.
    """
    df = pd.read_csv(input_file_path)

    # Remove rows or columns with NaN fields based on drop_option; values
    # other than 'row' / 'column' intentionally fall through unchanged.
    if drop_option == 'row':
        df = df.dropna()
    elif drop_option == 'column':
        df = df.dropna(axis=1)

    # Dropping rows leaves gaps in the positional index. With the default
    # preserve_index=None, PyArrow would persist such a non-range index as
    # an extra "__index_level_0__" column in the Parquet file. The index
    # here is only the CSV row position, so keep it out of the output.
    table = pa.Table.from_pandas(df, preserve_index=False)

    pq.write_table(table, output_file_path)

    # Read back what was just written so the preview reflects the file on
    # disk rather than the in-memory frame.
    written = pq.read_table(output_file_path).to_pandas()
    print(written.head(100))

# NaN handling applied before writing. Options: 'row' or 'column'.
drop_option = 'column'

# Source CSV and destination Parquet file (absolute, machine-specific paths).
input_file_path = "C:/Users/Idelson Mindo/Documents/GitHub/codema/dublin-energy-app/energytool/dublin-energy-app/data/dublin_census_2016_filled_with_ber_public_14_05_2021.csv"
output_file_path = "C:/Users/Idelson Mindo/Documents/GitHub/codema/dublin-energy-app/energytool/dublin-energy-app/data/dublin_census_2016_filled_with_ber_public_14_05_2021.parquet"

# Run the conversion with the settings above.
convert_csv_to_parquet(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
    drop_option=drop_option,
)