-
Notifications
You must be signed in to change notification settings - Fork 86
Expand file tree
/
Copy pathData cleaning using Pandas
More file actions
66 lines (46 loc) · 2 KB
/
Data cleaning using Pandas
File metadata and controls
66 lines (46 loc) · 2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import pandas as pd
import numpy as np
# Demo pipeline: build a small DataFrame that deliberately contains common
# data-quality problems, then clean it step by step, printing the state of
# the frame after every stage.

# --- 1. Initial data ------------------------------------------------------
# The sample contains: one missing Age (np.nan), one exact duplicate row
# (the second 'Bob'), Sales_Amount stored as text with thousands separators
# and a ' USD' suffix, Category values with stray whitespace / mixed case,
# and Join_Date stored as plain strings.
print("/// 1. Initial Data Setup (with Issues)///")
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Bob', 'Eve', 'David'],
    'Age': [25, 30, np.nan, 30, 45, 22],
    'Sales_Amount': ['1,500 USD', '800 USD', '1200 USD', '800 USD', '3,200 USD', '450 USD'],
    'Category': [' Fruit ', 'Vegetable', 'Fruit', 'Vegetable', 'VEGETABLE', ' Dairy '],
    'Join_Date': ['2023-01-15', '2022-11-01', '2023-05-20', '2022-11-01', '2023-03-10', '2023-07-25'],
}
df = pd.DataFrame(data)
print("\nOriginal DataFrame (Before Cleaning):")
print(df)
print("\nData Types (Before Cleaning):")
print(df.dtypes)
print("-" * 50)

# --- 2. Missing values ----------------------------------------------------
# Replace missing ages with the median of the ages that are present
# (Series.median skips NaN by default).
print("/// Handling Missing Values ('Age' Column)")
median_age = df['Age'].median()
df['Age'] = df['Age'].fillna(median_age)
print(f"\nMissing values in 'Age' filled with Median Age: {median_age}")
print("\nDataFrame after Missing Value Handling:")
print(df)
print("-" * 50)

# --- 3. Duplicate rows ----------------------------------------------------
# Only rows that are identical in every column are dropped; the first
# occurrence is kept and the original index labels are preserved.
print("/// Removing Duplicate Rows")
print(f"\nNumber of duplicate rows found: {df.duplicated().sum()}")
df = df.drop_duplicates()
print("\nDataFrame after Removing Duplicates:")
print(df)
print(f"\nNumber of rows after removing duplicates: {len(df)}")
print("-" * 50)

# --- 4. Data types --------------------------------------------------------
print("///Correcting Data Types ('Sales_Amount', 'Join_Date')")
print("\nOriginal 'Sales_Amount' data type:", df['Sales_Amount'].dtype)
# Strip the thousands separator and the currency suffix, then convert to
# float. regex=False makes both replacements literal regardless of the
# pandas version's default for `regex`.
df['Sales_Amount'] = (
    df['Sales_Amount']
    .str.replace(',', '', regex=False)
    .str.replace(' USD', '', regex=False)
    .astype(float)
)
print("New 'Sales_Amount' data type:", df['Sales_Amount'].dtype)
df['Join_Date'] = pd.to_datetime(df['Join_Date'])
print("New 'Join_Date' data type:", df['Join_Date'].dtype)
print("\nDataFrame after Data Type Corrections:")
print(df)
print("-" * 50)

# --- 5. Text standardization ----------------------------------------------
print("//// Standardizing Text ('Category' Column)")
# Capture and report the raw values BEFORE modifying the column, so the
# "before" line actually shows the un-standardized data.
category_values_before = df['Category'].unique()
print(f"\nUnique values in 'Category' before standardization: {category_values_before}")
# Trim surrounding whitespace, then lowercase, so that e.g. ' Fruit ' and
# 'Fruit' (or 'Vegetable' and 'VEGETABLE') collapse to a single category.
df['Category'] = df['Category'].str.strip().str.lower()
print(f"\nUnique values in 'Category' after standardization: {df['Category'].unique()}")
print("