-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbatch_lab.py
More file actions
41 lines (31 loc) · 1.19 KB
/
batch_lab.py
File metadata and controls
41 lines (31 loc) · 1.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pandas as pd
import duckdb
from pathlib import Path
DATA = Path("data")
DATA.mkdir(exist_ok=True)
#1. Simulate "raw daily CSV"(bronze)
raw = pd.DataFrame({
"customer": ["Alice", "Bob", "Charlie", "David"],
"amount": [100, 200, 50, 300],
"date":["2024-01-01", "2024-01-01", "2024-01-02", "2024-01-02"]
})
# Save the raw DataFrame as CSV
raw.to_csv(DATA/"transactions_raw.csv", index=False) #does not include an index column
#2. Transform → Silver(clean, standardize types)
df = pd.read_csv(DATA/"transactions_raw.csv")
df["date"] = pd.to_datetime(df["date"])
df.to_parquet(DATA/"transactions_clean.parquet", index=False)
#3. Aggregate → Gold (daily totals)
summary = df.groupby(["customer", "date"], as_index = False).agg(total=("amount", "sum"))
summary.to_parquet(DATA/"transactions_summary.parquet", index=False)
#4. Query with DuckDB
con = duckdb.connect()
result = con.execute("""
SELECT customer, SUM(total) as total_spent
FROM read_parquet('data/transactions_summary.parquet')
GROUP BY customer
ORDER BY total_spent DESC
""").df()
print("\n[Batch ETL result]\n", result)
# Save batch result to a Parquet file
result.to_parquet(DATA/"batch_customer_totals.parquet", index=False)