Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ For each PR milestone in TODO.md:
3. Update the TODO.md file to mark completed tasks
4. Create a pull request with a clear description of what was implemented
5. Wait for review and approval before moving to the next milestone
6. **PR titles must be written in English**

## Error Handling

Expand Down
20 changes: 10 additions & 10 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,16 @@ After completing a milestone, create a pull request with your changes for review

## PR2: Data Import & Management

- [ ] Implement file upload functionality (CSV/Excel)
- [ ] Create data validation and error handling
- [ ] Implement data preview with pagination
- [ ] Add data type detection and conversion
- [ ] Set up session state management for data persistence
- [ ] Create data summary functionality
- [ ] Implement sidebar navigation for data options
- [ ] Add sample data loader option
- [ ] Create unit tests for data loading and validation
- [ ] Implement integration tests for data import workflow
- [x] Implement file upload functionality (CSV/Excel)
- [x] Create data validation and error handling
- [x] Implement data preview with pagination
- [x] Add data type detection and conversion
- [x] Set up session state management for data persistence
- [x] Create data summary functionality
- [x] Implement sidebar navigation for data options
- [x] Add sample data loader option
- [x] Create unit tests for data loading and validation
- [x] Implement integration tests for data import workflow

## PR3: Exploratory Data Analysis

Expand Down
57 changes: 45 additions & 12 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,61 @@

import streamlit as st
from utils import config
from utils import data as data_utils

# Page-wide Streamlit settings: the page title and the "wide" layout.
st.set_page_config(page_title="PredictStream", layout="wide")


def main() -> None:
"""Render the main page."""
# NOTE(review): this span reads like a flattened diff (indentation and +/-
# markers are gone), so removed and added lines appear side by side.
# Confirm against the real app.py before relying on the statement order.
st.title("PredictStream")
st.write("Upload a dataset to get started or explore sample datasets.")

# NOTE(review): this uploader and the one inside the sidebar block below use
# the same key ("file_uploader"); presumably this first one is the removed
# (pre-change) version — confirm.
uploaded_file = st.file_uploader(
"Upload CSV or Excel file", type=["csv", "xlsx", "xls"], key="file_uploader"
)
# Sidebar: the upload widget plus one "Load <name>" button per entry in
# config.SAMPLE_DATASETS.
with st.sidebar:
st.header("Data Options")
uploaded_file = st.file_uploader(
"Upload CSV or Excel file",
type=["csv", "xlsx", "xls"],
key="file_uploader",
)

st.subheader("Sample Datasets")
for name, path in config.SAMPLE_DATASETS.items():
if st.button(f"Load {name}"):
# Load the sample from its path, run dtype conversion on it, and keep the
# result in session state under "data".
st.session_state["data"] = data_utils.load_data(path)
st.session_state["data"] = data_utils.convert_dtypes(
st.session_state["data"]
)
st.success(f"{name} loaded!")

if uploaded_file is not None:
st.success("File uploaded successfully!")

# NOTE(review): second sample-dataset loop; it stores raw bytes under
# "uploaded_file" instead of a DataFrame under "data". Presumably these are
# the removed (pre-change) lines — confirm.
st.subheader("Sample Datasets")
for name, path in config.SAMPLE_DATASETS.items():
if st.button(f"Load {name}"):
st.session_state["uploaded_file"] = path.read_bytes()
st.success(f"{name} loaded!")
# Load and convert the uploaded file; a ValueError raised while doing so is
# shown to the user via st.error instead of propagating.
try:
df = data_utils.load_data(uploaded_file)
df = data_utils.convert_dtypes(df)
st.session_state["data"] = df
st.success("File loaded successfully!")
except ValueError as exc:
st.error(str(exc))

# Whatever is stored under "data" is previewed one page at a time.
data = st.session_state.get("data")
if data is not None:
st.subheader("Data Preview")
page_size = 100
# Ceiling division of len(data) by page_size; max(1, ...) keeps one page
# available even when the frame has zero rows.
total_pages = max(1, (len(data) - 1) // page_size + 1)
page = st.number_input(
"Page",
min_value=1,
max_value=total_pages,
value=1,
step=1,
)
# Pages are 1-based, so page 1 maps to rows [0, page_size).
start = (page - 1) * page_size
end = start + page_size
st.dataframe(data.iloc[start:end])

# Summary table produced by data_utils.data_summary for the whole frame.
st.subheader("Data Summary")
summary = data_utils.data_summary(data)
st.dataframe(summary)


# Run the app when this file is executed as a script.
# NOTE(review): main() appears twice below — likely the old and new line of a
# flattened diff; confirm the real file calls it only once.
if __name__ == "__main__":
main()
main()
48 changes: 48 additions & 0 deletions tests/test_data_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import pandas as pd
import pytest

from utils import config
from utils import data


def test_load_data_csv(tmp_path):
    """A frame written to CSV loads back equal to the original."""
    expected = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
    target = tmp_path / 'test.csv'
    expected.to_csv(target, index=False)
    loaded = data.load_data(target)
    pd.testing.assert_frame_equal(loaded, expected)


def test_load_data_excel(tmp_path):
    """A frame written to .xlsx loads back equal to the original."""
    expected = pd.DataFrame({'a': [1, 2]})
    target = tmp_path / 'test.xlsx'
    expected.to_excel(target, index=False)
    loaded = data.load_data(target)
    pd.testing.assert_frame_equal(loaded, expected)


def test_load_data_invalid(tmp_path):
    """Loading a file with an unsupported extension raises ValueError."""
    bad_path = tmp_path / 'bad.txt'
    bad_path.write_text('x')
    with pytest.raises(ValueError):
        data.load_data(bad_path)


def test_convert_dtypes():
    """Numeric-looking and date-looking string columns get real dtypes."""
    raw = pd.DataFrame(
        {'num': ['1', '2'], 'date': ['2020-01-01', '2020-01-02']}
    )
    converted = data.convert_dtypes(raw)
    num_kind = converted['num'].dtype.kind
    assert num_kind in {'i', 'f'}
    assert pd.api.types.is_datetime64_any_dtype(converted['date'])


def test_data_summary():
    """The summary keeps the input frame's column as one of its columns."""
    frame = pd.DataFrame({'a': [1, 2, 3]})
    described = data.data_summary(frame)
    assert 'a' in described.columns


def test_sample_dataset_loads():
    """Every path in config.SAMPLE_DATASETS loads to a non-empty frame."""
    for sample_path in config.SAMPLE_DATASETS.values():
        loaded = data.load_data(sample_path)
        assert not loaded.empty

6 changes: 6 additions & 0 deletions utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Utility modules for PredictStream."""

from . import config
from . import data

# Names exported by ``from utils import *``: the two submodules imported above.
__all__ = ["config", "data"]
38 changes: 38 additions & 0 deletions utils/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from __future__ import annotations

from pathlib import Path
from typing import Any

import pandas as pd


def load_data(file: Any) -> pd.DataFrame:
    """Read a CSV or Excel source into a DataFrame.

    The format is picked from the file extension, which is taken from
    ``file.name`` when the object has that attribute and from ``str(file)``
    otherwise.

    Raises:
        ValueError: If the extension is not ``.csv``, ``.xls`` or ``.xlsx``,
            or if pandas fails while reading the file.
    """
    source_name = file.name if hasattr(file, "name") else str(file)
    ext = Path(source_name).suffix.lower()

    # Choose the reader up front so only the actual read sits inside ``try``.
    if ext == ".csv":
        reader = pd.read_csv
    elif ext in {".xls", ".xlsx"}:
        reader = pd.read_excel
    else:
        raise ValueError(f"Unsupported file type: {ext}")

    try:
        return reader(file)
    except Exception as exc:  # pragma: no cover - pass through
        raise ValueError(f"Failed to read file: {exc}") from exc


def convert_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with object columns parsed where possible.

    Each object-dtype column is first tried as numeric; if it is still
    object-dtype afterwards it is tried as datetime. A column that parses as
    neither is left exactly as it was.

    Args:
        df: Input frame. It is not modified.

    Returns:
        A new DataFrame with the converted columns.
    """
    # Work on a copy so the caller's frame is never changed behind its back.
    converted = df.copy()
    for column in converted.columns:
        for parser in (pd.to_numeric, pd.to_datetime):
            if converted[column].dtype != object:
                break  # already has a concrete dtype; nothing left to try
            try:
                # Default errors="raise" replaces the deprecated
                # errors="ignore"; a failed parse is handled just below.
                converted[column] = parser(converted[column])
            except (ValueError, TypeError, OverflowError):
                # Not parseable by this parser: keep the column untouched and
                # fall through to the next parser (if any).
                continue
    return converted


def data_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Describe every column of *df* via ``DataFrame.describe(include="all")``."""
    summary = df.describe(include="all")
    return summary