diff --git a/Makefile b/Makefile
index e025fa6..b955e2a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,18 @@
+ifeq (,$(wildcard .env))
+$(error .env file is missing. Please create one based on .env.example)
+endif
+
+include .env
+
 export PYTHONPATH = .
-check_dirs := src
+
+# --- Default Values ---
+
+CHECK_DIRS := src
+LOCAL_DATA_PATH := data
+
+
+# --- Utilities ---
 
 help:
 	@grep -E '^[a-zA-Z0-9 -]+:.*#' Makefile | sort | while read -r l; do printf "\033[1;32m$$(echo $$l | cut -f 1 -d':')\033[00m:$$(echo $$l | cut -f 2- -d'#')\n"; done
@@ -26,6 +39,16 @@ local-infrastructure-up: local-docker-infrastructure-up local-zenml-server-down
 
 local-infrastructure-down: local-docker-infrastructure-down local-zenml-server-down
 
+# --- AWS ---
+
+s3-upload: # Upload a local folder to S3
+	@echo "Uploading to S3 bucket: $(AWS_S3_BUCKET_NAME)"
+	uv run python -m tools.use_s3 upload $(LOCAL_DATA_PATH) $(AWS_S3_BUCKET_NAME) --s3-prefix "second_brain_course/notion"
+
+s3-download: # Download a folder from S3 to local storage
+	@echo "Downloading from S3 bucket: $(AWS_S3_BUCKET_NAME)"
+	uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) "second_brain_course/notion/data.zip" $(LOCAL_DATA_PATH)
+
 # --- Pipelines ---
 
 collect-notion-pipeline:
@@ -39,16 +62,16 @@ test:
 # --- QA ---
 
 format-fix:
-	uv run ruff format $(check_dirs)
+	uv run ruff format $(CHECK_DIRS)
 	uv run ruff check --select I --fix
 
 lint-fix:
 	uv run ruff check --fix
 
 format-check:
-	uv run ruff format --check $(check_dirs)
+	uv run ruff format --check $(CHECK_DIRS)
 	uv run ruff check -e
 	uv run ruff check --select I -e
 
 lint-check:
-	uv run ruff check $(check_dirs)
+	uv run ruff check $(CHECK_DIRS)
diff --git a/src/second_brain/config.py b/src/second_brain/config.py
index 0171129..15eed47 100644
--- a/src/second_brain/config.py
+++ b/src/second_brain/config.py
@@ -9,8 +9,8 @@ class Settings(BaseSettings):
     NOTION_SECRET_KEY: str | None = None
 
     # --- Required settings even when working locally. ---
-
-    AWS_S3_BUCKET_NAME: str = "notion-second-brain-data"
+    AWS_DEFAULT_REGION: str = "eu-central-1"
+    AWS_S3_BUCKET_NAME: str = "decodingml-public-data"
 
     # OpenAI API
     OPENAI_MODEL_ID: str = "gpt-4o-mini"
@@ -34,7 +34,6 @@ class Settings(BaseSettings):
     DATABASE_NAME: str = "twin"
 
     # AWS Authentication
-    AWS_REGION: str = "eu-central-1"
     AWS_ACCESS_KEY: str | None = None
     AWS_SECRET_KEY: str | None = None
     AWS_ARN_ROLE: str | None = None
diff --git a/src/second_brain/infrastructure/aws/s3.py b/src/second_brain/infrastructure/aws/s3.py
index 1622ad8..6fce0a3 100644
--- a/src/second_brain/infrastructure/aws/s3.py
+++ b/src/second_brain/infrastructure/aws/s3.py
@@ -6,11 +6,21 @@
 
 import boto3
 
+from second_brain.config import settings
+
 
 class S3Client:
-    def __init__(self, bucket_name: str) -> None:
-        """Initialize S3 client and bucket name."""
-        self.s3_client = boto3.client("s3")
+    def __init__(
+        self, bucket_name: str, region: str = settings.AWS_DEFAULT_REGION
+    ) -> None:
+        """Initialize S3 client and bucket name.
+
+        Args:
+            bucket_name: Name of the S3 bucket
+            region: AWS region (defaults to settings.AWS_DEFAULT_REGION)
+        """
+        self.region = region
+        self.s3_client = boto3.client("s3", region_name=self.region)
         self.bucket_name = bucket_name
 
     def upload_folder(self, local_path: Union[str, Path], s3_prefix: str = "") -> None:
@@ -21,6 +31,9 @@ def upload_folder(self, local_path: Union[str, Path], s3_prefix: str = "") -> No
             local_path: Path to the local folder
             s3_prefix: Optional prefix (folder path) in S3 bucket
         """
+        # Ensure the bucket exists before proceeding
+        self.__create_bucket_if_doesnt_exist()
+
         local_path = Path(local_path)
 
         if not local_path.exists():
@@ -49,6 +62,38 @@ def upload_folder(self, local_path: Union[str, Path], s3_prefix: str = "") -> No
         # Clean up temporary zip file
         os.unlink(temp_zip.name)
 
+    def __create_bucket_if_doesnt_exist(self) -> None:
+        """
+        Check if the bucket exists and create it if it doesn't.
+        Raises permission-related exceptions if the user lacks necessary permissions.
+        """
+        try:
+            self.s3_client.head_bucket(Bucket=self.bucket_name)
+        except self.s3_client.exceptions.ClientError as e:
+            error_code = e.response["Error"]["Code"]
+            if error_code == "404":
+                try:
+                    if self.region == "us-east-1":
+                        # us-east-1 rejects an explicit LocationConstraint
+                        self.s3_client.create_bucket(Bucket=self.bucket_name)
+                    else:
+                        self.s3_client.create_bucket(
+                            Bucket=self.bucket_name,
+                            CreateBucketConfiguration={
+                                "LocationConstraint": self.region
+                            },
+                        )
+                except self.s3_client.exceptions.ClientError as create_error:
+                    raise Exception(
+                        f"Failed to create bucket {self.bucket_name}: {str(create_error)}"
+                    ) from create_error
+            elif error_code == "403":
+                raise Exception(
+                    f"No permission to access bucket {self.bucket_name}"
+                ) from e
+            else:
+                raise
+
     def download_folder(self, s3_prefix: str, local_path: Union[str, Path]) -> None:
         """
         Download a zipped folder from S3 and extract it to local storage.
diff --git a/tools/use_s3.py b/tools/use_s3.py
new file mode 100644
index 0000000..5ef138e
--- /dev/null
+++ b/tools/use_s3.py
@@ -0,0 +1,65 @@
+import click
+
+from second_brain.infrastructure.aws.s3 import S3Client
+
+
+@click.group()
+def cli() -> None:
+    """CLI tool for uploading and downloading folders to/from S3."""
+    pass
+
+
+@cli.command()
+@click.argument("local_path")
+@click.argument("bucket_name")
+@click.option("--s3-prefix", default="", help="Optional S3 prefix (folder path)")
+def upload(local_path: str, bucket_name: str, s3_prefix: str) -> None:
+    """Upload a local folder to an S3 bucket.
+
+    Args:
+        local_path: Path to the local folder to upload
+        bucket_name: Name of the S3 bucket
+        s3_prefix: Optional S3 prefix (folder path)
+
+    Raises:
+        click.Abort: If the upload fails or the path is invalid
+    """
+    try:
+        s3_client = S3Client(bucket_name)
+        s3_client.upload_folder(local_path, s3_prefix)
+        click.echo(
+            f"Successfully uploaded {local_path} to s3://{bucket_name}/{s3_prefix}"
+        )
+    except Exception as e:
+        click.echo(f"Error: {str(e)}", err=True)
+        raise click.Abort()
+
+
+@cli.command()
+@click.argument("bucket_name")
+@click.argument("s3_path")
+@click.argument("local_path")
+def download(bucket_name: str, s3_path: str, local_path: str) -> None:
+    """Download a zipped folder from S3 and extract it to local storage.
+
+    Args:
+        bucket_name: Name of the S3 bucket
+        s3_path: Path to the zip file in the S3 bucket
+        local_path: Local path where files should be extracted
+
+    Raises:
+        click.Abort: If the download fails or the path is invalid
+    """
+    try:
+        s3_client = S3Client(bucket_name)
+        s3_client.download_folder(s3_path, local_path)
+        click.echo(
+            f"Successfully downloaded s3://{bucket_name}/{s3_path} to {local_path}"
+        )
+    except Exception as e:
+        click.echo(f"Error: {str(e)}", err=True)
+        raise click.Abort()
+
+
+if __name__ == "__main__":
+    cli()
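
Usage, for reference: the new Make targets wrap the click CLI. "make s3-upload" zips $(LOCAL_DATA_PATH) and uploads it under the second_brain_course/notion prefix; "make s3-download" pulls second_brain_course/notion/data.zip and extracts it locally. The same flow can also be driven programmatically. A minimal sketch, assuming AWS credentials that boto3 can resolve and the default bucket from Settings (both assumptions, not part of the patch):

    from second_brain.infrastructure.aws.s3 import S3Client

    # Region falls back to settings.AWS_DEFAULT_REGION ("eu-central-1").
    client = S3Client("decodingml-public-data")

    # Zips ./data into a single archive and uploads it under the course prefix.
    client.upload_folder("data", s3_prefix="second_brain_course/notion")

    # Downloads the zipped archive and extracts it into ./data.
    client.download_folder("second_brain_course/notion/data.zip", "data")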