# new-york-city-taxi-fare-prediction.py
# Forked from WecoAI/aideml. 116 lines (99 loc), 3.49 KB.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
# Load only the first 500,000 rows of the training data.
train_df = pd.read_csv("./input/train.csv", nrows=500000)

# Remove missing values and outliers.
train_df = train_df.dropna(how="any", axis="rows")

# Fares are kept when they lie in [2.5, 500]; passenger counts in (0, 6].
valid_fare = (train_df["fare_amount"] >= 2.5) & (train_df["fare_amount"] <= 500)
valid_passengers = (train_df["passenger_count"] > 0) & (
    train_df["passenger_count"] <= 6
)
# A location is rejected only when BOTH of its coordinates are exactly zero.
has_pickup = (train_df["pickup_latitude"] != 0) | (train_df["pickup_longitude"] != 0)
has_dropoff = (train_df["dropoff_latitude"] != 0) | (
    train_df["dropoff_longitude"] != 0
)

# Apply every filter in one pass and take an explicit copy: columns are added
# to this frame later on, and assigning into a filtered slice makes pandas
# emit SettingWithCopyWarning.
train_df = train_df[valid_fare & valid_passengers & has_pickup & has_dropoff].copy()
# Feature engineering
def haversine_distance(lat1, lon1, lat2, lon2, radius=6371.0):
    """Return the great-circle distance between two points (haversine formula).

    All coordinate arguments are in decimal degrees and may be scalars,
    NumPy arrays, or pandas Series; array-like inputs are combined
    element-wise.

    Args:
        lat1: Latitude of the first point(s).
        lon1: Longitude of the first point(s).
        lat2: Latitude of the second point(s).
        lon2: Longitude of the second point(s).
        radius: Sphere radius; the result is expressed in the same unit.
            Defaults to the Earth's mean radius in kilometers.

    Returns:
        The distance(s) along the sphere's surface, in the unit of ``radius``.
        NaN inputs propagate to NaN outputs.
    """
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    a = (
        np.sin(delta_phi / 2) ** 2
        + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    )
    # Mathematically 0 <= a <= 1, but rounding can push it a hair outside that
    # range (e.g. for antipodal points), which would make sqrt() return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * c
# Parse the pickup timestamp once, then spread its calendar parts into
# separate columns.
train_df["pickup_datetime"] = pd.to_datetime(train_df["pickup_datetime"])
pickup_parts = train_df["pickup_datetime"].dt
for part in ("year", "month", "day", "hour", "weekday"):
    train_df[part] = getattr(pickup_parts, part)

# Distance between the pickup and drop-off points of each trip.
train_df["distance"] = haversine_distance(
    lat1=train_df["pickup_latitude"],
    lon1=train_df["pickup_longitude"],
    lat2=train_df["dropoff_latitude"],
    lon2=train_df["dropoff_longitude"],
)
# Model inputs, grouped by origin; the concatenation below fixes the column
# order the model is fitted with.
time_features = ["year", "month", "day", "hour", "weekday"]
coordinate_features = [
    "pickup_latitude",
    "pickup_longitude",
    "dropoff_latitude",
    "dropoff_longitude",
]
features = [*time_features, "passenger_count", *coordinate_features, "distance"]
target = "fare_amount"
X = train_df[features]
y = train_df[target]

# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Fit the random forest on the training portion.
forest_options = {"n_estimators": 50, "max_depth": 25, "random_state": 42}
rf = RandomForestRegressor(**forest_options).fit(X_train, y_train)

# Score the held-out rows and report the root-mean-squared error.
y_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse}")
# Prepare the test set with the same engineered columns as the training data.
test_df = pd.read_csv("./input/test.csv")
test_df["pickup_datetime"] = pd.to_datetime(test_df["pickup_datetime"])
test_df["year"] = test_df["pickup_datetime"].dt.year
test_df["month"] = test_df["pickup_datetime"].dt.month
test_df["day"] = test_df["pickup_datetime"].dt.day
test_df["hour"] = test_df["pickup_datetime"].dt.hour
test_df["weekday"] = test_df["pickup_datetime"].dt.weekday

# Impute NaN coordinates in the test set using the median from the training set.
for feature in [
    "pickup_latitude",
    "pickup_longitude",
    "dropoff_latitude",
    "dropoff_longitude",
]:
    median_value = train_df[feature].median()
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selection: the in-place form acts on an intermediate object and is
    # not guaranteed to update test_df (it never does under Copy-on-Write).
    test_df[feature] = test_df[feature].fillna(median_value)

# Distance must be computed after imputation so it never sees NaN coordinates.
test_df["distance"] = haversine_distance(
    test_df["pickup_latitude"],
    test_df["pickup_longitude"],
    test_df["dropoff_latitude"],
    test_df["dropoff_longitude"],
)

# Predict on test set
X_test = test_df[features]
test_df["fare_amount"] = rf.predict(X_test)

# Save predictions: one row per test key with its predicted fare.
submission = test_df[["key", "fare_amount"]]
submission.to_csv("./working/submission.csv", index=False)