diff --git a/AGENTS.md b/AGENTS.md index 368e147..eb041ab 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -78,6 +78,7 @@ For each PR milestone in TODO.md: 3. Update the TODO.md file to mark completed tasks 4. Create a pull request with a clear description of what was implemented 5. Wait for review and approval before moving to the next milestone +6. **PR titles must be written in English** ## Error Handling diff --git a/TODO.md b/TODO.md index 75d5802..960c23b 100644 --- a/TODO.md +++ b/TODO.md @@ -19,16 +19,16 @@ After completing a milestone, create a pull request with your changes for review ## PR2: Data Import & Management -- [ ] Implement file upload functionality (CSV/Excel) -- [ ] Create data validation and error handling -- [ ] Implement data preview with pagination -- [ ] Add data type detection and conversion -- [ ] Set up session state management for data persistence -- [ ] Create data summary functionality -- [ ] Implement sidebar navigation for data options -- [ ] Add sample data loader option -- [ ] Create unit tests for data loading and validation -- [ ] Implement integration tests for data import workflow +- [x] Implement file upload functionality (CSV/Excel) +- [x] Create data validation and error handling +- [x] Implement data preview with pagination +- [x] Add data type detection and conversion +- [x] Set up session state management for data persistence +- [x] Create data summary functionality +- [x] Implement sidebar navigation for data options +- [x] Add sample data loader option +- [x] Create unit tests for data loading and validation +- [x] Implement integration tests for data import workflow ## PR3: Exploratory Data Analysis diff --git a/app.py b/app.py index 00b1da9..8249047 100644 --- a/app.py +++ b/app.py @@ -2,6 +2,7 @@ import streamlit as st from utils import config +from utils import data as data_utils st.set_page_config(page_title="PredictStream", layout="wide") @@ -9,21 +10,53 @@ def main() -> None: """Render the main page.""" 
st.title("PredictStream") - st.write("Upload a dataset to get started or explore sample datasets.") - uploaded_file = st.file_uploader( - "Upload CSV or Excel file", type=["csv", "xlsx", "xls"], key="file_uploader" - ) + with st.sidebar: + st.header("Data Options") + uploaded_file = st.file_uploader( + "Upload CSV or Excel file", + type=["csv", "xlsx", "xls"], + key="file_uploader", + ) + + st.subheader("Sample Datasets") + for name, path in config.SAMPLE_DATASETS.items(): + if st.button(f"Load {name}"): + st.session_state["data"] = data_utils.load_data(path) + st.session_state["data"] = data_utils.convert_dtypes( + st.session_state["data"] + ) + st.success(f"{name} loaded!") if uploaded_file is not None: - st.success("File uploaded successfully!") - - st.subheader("Sample Datasets") - for name, path in config.SAMPLE_DATASETS.items(): - if st.button(f"Load {name}"): - st.session_state["uploaded_file"] = path.read_bytes() - st.success(f"{name} loaded!") + try: + df = data_utils.load_data(uploaded_file) + df = data_utils.convert_dtypes(df) + st.session_state["data"] = df + st.success("File loaded successfully!") + except ValueError as exc: + st.error(str(exc)) + + data = st.session_state.get("data") + if data is not None: + st.subheader("Data Preview") + page_size = 100 + total_pages = max(1, (len(data) - 1) // page_size + 1) + page = st.number_input( + "Page", + min_value=1, + max_value=total_pages, + value=1, + step=1, + ) + start = (page - 1) * page_size + end = start + page_size + st.dataframe(data.iloc[start:end]) + + st.subheader("Data Summary") + summary = data_utils.data_summary(data) + st.dataframe(summary) if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/tests/test_data_utils.py b/tests/test_data_utils.py new file mode 100644 index 0000000..d3dcff2 --- /dev/null +++ b/tests/test_data_utils.py @@ -0,0 +1,48 @@ +import pandas as pd +import pytest + +from utils import config +from utils import data + + +def 
test_load_data_csv(tmp_path):
+    expected = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    csv_path = tmp_path / "test.csv"
+    expected.to_csv(csv_path, index=False)
+    loaded = data.load_data(csv_path)
+    pd.testing.assert_frame_equal(loaded, expected)
+
+
+def test_load_data_excel(tmp_path):
+    expected = pd.DataFrame({"a": [1, 2]})
+    workbook_path = tmp_path / "test.xlsx"
+    expected.to_excel(workbook_path, index=False)
+    loaded = data.load_data(workbook_path)
+    pd.testing.assert_frame_equal(loaded, expected)
+
+
+def test_load_data_invalid(tmp_path):
+    unsupported = tmp_path / "bad.txt"
+    unsupported.write_text("x")
+    with pytest.raises(ValueError):
+        data.load_data(unsupported)
+
+
+def test_convert_dtypes():
+    raw = pd.DataFrame({"num": ["1", "2"], "date": ["2020-01-01", "2020-01-02"]})
+    converted = data.convert_dtypes(raw)
+    assert converted["num"].dtype.kind in ("i", "f")
+    assert pd.api.types.is_datetime64_any_dtype(converted["date"])
+
+
+def test_data_summary():
+    frame = pd.DataFrame({"a": [1, 2, 3]})
+    described = data.data_summary(frame)
+    assert "a" in described.columns
+
+
+def test_sample_dataset_loads():
+    for sample_path in config.SAMPLE_DATASETS.values():
+        sample = data.load_data(sample_path)
+        assert not sample.empty
+
diff --git a/utils/__init__.py b/utils/__init__.py
index e69de29..832078c 100644
--- a/utils/__init__.py
+++ b/utils/__init__.py
@@ -0,0 +1,6 @@
+"""Utility modules for PredictStream."""
+
+from . import config
+from . 
import data + +__all__ = ["config", "data"] diff --git a/utils/data.py b/utils/data.py new file mode 100644 index 0000000..56a3abc --- /dev/null +++ b/utils/data.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import pandas as pd + + +def load_data(file: Any) -> pd.DataFrame: + """Load a CSV or Excel file into a DataFrame.""" + if hasattr(file, "name"): + ext = Path(file.name).suffix.lower() + else: + ext = Path(str(file)).suffix.lower() + + try: + if ext == ".csv": + return pd.read_csv(file) + if ext in {".xls", ".xlsx"}: + return pd.read_excel(file) + except Exception as exc: # pragma: no cover - pass through + raise ValueError(f"Failed to read file: {exc}") from exc + raise ValueError(f"Unsupported file type: {ext}") + + +def convert_dtypes(df: pd.DataFrame) -> pd.DataFrame: + """Attempt to convert object columns to numeric or datetime.""" + for column in df.columns: + if df[column].dtype == object: + df[column] = pd.to_numeric(df[column], errors="ignore") + if df[column].dtype == object: + df[column] = pd.to_datetime(df[column], errors="ignore") + return df + + +def data_summary(df: pd.DataFrame) -> pd.DataFrame: + """Return a statistical summary of the dataframe.""" + return df.describe(include="all")