5 changes: 4 additions & 1 deletion .gitattributes
@@ -1,2 +1,5 @@
assets/model_weights.pth filter=lfs diff=lfs merge=lfs -text
assets/aircraft_model_weights.pth filter=lfs diff=lfs merge=lfs -text
assets/ship_model_weights.pth filter=lfs diff=lfs merge=lfs -text
assets/images/2_planes.tiff filter=lfs diff=lfs merge=lfs -text
assets/*.pth filter=lfs diff=lfs merge=lfs -text
assets/images/*.tiff filter=lfs diff=lfs merge=lfs -text
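Since the per-file rules above collapse into two globs, it is worth confirming that the renamed weight files still hit the LFS filter. A minimal sketch using stock git, with paths taken from this diff:

# Run from the repo root; every attribute should resolve to "lfs".
git check-attr filter diff merge -- \
  assets/aircraft_model_weights.pth \
  assets/ship_model_weights.pth \
  assets/images/2_planes.tiff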
File renamed without changes.
3 changes: 3 additions & 0 deletions assets/ship_model_weights.pth
Git LFS file not shown
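The ship weights are committed as a Git LFS pointer, which is why the blob cannot be rendered here. A hedged sketch for materializing the binaries after cloning, assuming git-lfs is installed locally:

git lfs install                        # one-time smudge/clean hook setup
git lfs pull --include "assets/*.pth"  # fetch just the model weights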
12 changes: 2 additions & 10 deletions conda/environment-py310.yml
@@ -1,4 +1,4 @@
# Copyright 2023-2024 Amazon.com, Inc. or its affiliates.
# Copyright 2023-2025 Amazon.com, Inc. or its affiliates.

name: osml_models
channels:
@@ -7,12 +7,4 @@ dependencies:
- conda-forge::python=3.10.12
- conda-forge::gdal=3.7.2
- conda-forge::proj=9.3.0
- pip:
- json-logging==1.3.0
- boto3==1.34.104
- setuptools==68.0.0
- argparse==1.4.0
- flask==2.3.3
- waitress==2.1.2
- shapely==2.0.1
- matplotlib==3.7.2
- conda-forge::numpy=1.26.4
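With the pip block removed (those packages now live in setup.cfg's install_requires), the YAML only pins the interpreter and the native GDAL/PROJ/numpy stack. A sketch of rebuilding the environment outside Docker, mirroring the conda tos and conda env create calls the Dockerfile makes below; the env name osml_models comes from the YAML's name: field, and a conda new enough to have the tos subcommand is assumed:

conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
conda env create -f conda/environment-py310.yml   # creates "osml_models"
conda activate osml_models
python -c "from osgeo import gdal; print(gdal.__version__)"   # expect 3.7.2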
200 changes: 112 additions & 88 deletions docker/Dockerfile
@@ -1,62 +1,118 @@
# Copyright 2023-2025 Amazon.com, Inc. or its affiliates.

# Use NVIDIA's CUDA base image
FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu18.04 AS build-env
# =============================================================================
# Base image: Ubuntu 18.04 + CUDA 11.6.2 (devel)
# =============================================================================
FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu18.04 as osml_model

# Set AWS to the maintainer
# Set maintainer label
LABEL maintainer="Amazon Web Services"

# Enable sudo access for the build session
# Advertise SageMaker multi-container capability
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true

# Use root for setup
USER root

# Update and install core build dependencies
# =============================================================================
# Install core build dependencies (incl. TIFF); clean apt lists in same layer
# =============================================================================
RUN apt-get update -y \
&& apt-get upgrade -y \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --fix-missing --no-install-recommends \
&& DEBIAN_FRONTEND=noninteractive \
apt-get install -y --fix-missing --no-install-recommends \
software-properties-common build-essential ca-certificates \
git make cmake wget unzip libtool automake \
zlib1g-dev libsqlite3-dev pkg-config sqlite3 libcurl4-gnutls-dev \
libtiff5-dev
libtiff5-dev \
&& rm -rf /var/lib/apt/lists/*

# Install Miniconda
# =============================================================================
# Miniconda
# =============================================================================
ARG MINICONDA_VERSION=Miniconda3-latest-Linux-x86_64
ARG MINICONDA_URL=https://repo.anaconda.com/miniconda/${MINICONDA_VERSION}.sh
ENV CONDA_TARGET_ENV=osml_model
RUN wget -c ${MINICONDA_URL} \
&& chmod +x ${MINICONDA_VERSION}.sh \
&& ./${MINICONDA_VERSION}.sh -b -f -p /opt/conda \
&& rm ${MINICONDA_VERSION}.sh \
&& ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh

# Set our new conda target lib dirs
ENV PATH=$PATH:/opt/conda/bin
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib/
ENV PROJ_LIB=$PROJ_LIB:/opt/conda/share/proj
ENV CONDA_DIR=/opt/conda

# Copy the conda environment file and create the environment
COPY conda/environment-py310.yml environment.yml

# Accept Conda TOS before creating the environment
RUN wget -c ${MINICONDA_URL} \
&& chmod +x ${MINICONDA_VERSION}.sh \
&& ./${MINICONDA_VERSION}.sh -b -f -p ${CONDA_DIR} \
&& rm ${MINICONDA_VERSION}.sh \
&& ln -s ${CONDA_DIR}/etc/profile.d/conda.sh /etc/profile.d/conda.sh

# Configure environment variables used by common geospatial stacks
ENV CONDA_TARGET_ENV=osml_models
ENV PATH=/opt/conda/envs/${CONDA_TARGET_ENV}/bin:/opt/conda/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/conda/envs/${CONDA_TARGET_ENV}/lib:/opt/conda/envs/${CONDA_TARGET_ENV}/lib/gdal:${LD_LIBRARY_PATH}
ENV PROJ_LIB=/opt/conda/share/proj:$PROJ_LIB

# =============================================================================
# Conda environment (py310 + GDAL/PROJ + Detectron2 deps)
# =============================================================================
COPY conda/environment-py310.yml /tmp/environment.yml

# Create env and minimize image size
RUN conda config --set always_yes true && \
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main && \
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r

RUN conda env create -n ${CONDA_TARGET_ENV} --file environment.yml && \
conda clean -afy && \
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r && \
conda env create -f /tmp/environment.yml && \
find /opt/conda/ -follow -type f -name '*.a' -delete && \
find /opt/conda/ -follow -type f -name '*.pyc' -delete && \
find /opt/conda/ -follow -type f -name '*.js.map' -delete && \
rm -rf /opt/conda/pkgs
rm -rf /opt/conda/pkgs && \
conda clean -afy

# =============================================================================
# Entry shim
# - Ensure conda env is active for RUN/CMD/ENTRYPOINT
# =============================================================================
RUN cat >/entry.sh <<'BASH'
#!/usr/bin/env bash
set -eo pipefail

# Activate conda env if available
if [ -f /opt/conda/etc/profile.d/conda.sh ]; then
. /opt/conda/etc/profile.d/conda.sh
conda activate "${CONDA_TARGET_ENV:-base}" >/dev/null 2>&1 || true
fi

# If a command was passed, exec it; otherwise start bash
if [ "$#" -gt 0 ]; then
exec "$@"
else
exec /bin/bash
fi
BASH
RUN chmod +x /entry.sh

# Make subsequent RUN use the activated env
SHELL ["/entry.sh", "/bin/bash", "-c"]


# Configure .bashrc to drop into a conda env and immediately activate our TARGET env
# Note this makes python3 default to our conda managed python version
RUN conda init && echo 'conda activate "${CONDA_TARGET_ENV:-base}"' >> ~/.bashrc

# =============================================================================
# PyTorch 1.12.0 (CUDA 11.6 / cu116 wheels)
# =============================================================================
RUN python3 -m pip install --no-cache-dir \
torch==1.12.0+cu116 \
torchvision==0.13.0+cu116 \
-f https://download.pytorch.org/whl/torch_stable.html

# Activate the conda environment and install Python dependencies
RUN . /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_TARGET_ENV} && \
python3 -m pip install --no-cache-dir \
# =============================================================================
# Detectron2 (built against the target Torch/CUDA for SageMaker endpoints)
# - Set arch list for common AWS GPUs
# =============================================================================
ENV FORCE_CUDA=1
ARG TORCH_CUDA_ARCH_LIST="Pascal;Volta;Turing"
RUN python3 -m pip install --no-cache-dir \
"fvcore>=0.1.5,<0.1.6" \
iopath==0.1.8 \
pycocotools \
omegaconf==2.1.1 \
hydra-core==1.1.1 \
black==21.4b2 \
termcolor==1.1.0 \
matplotlib==3.5.2 \
yacs==0.1.8 \
@@ -65,69 +121,37 @@ RUN . /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_TARGET_ENV} &&
tqdm==4.62.3 \
tensorboard==2.8.0 \
opencv-contrib-python-headless==4.8.0.76 \
setuptools==69.5.1
setuptools==69.5.1 \
'git+https://github.com/facebookresearch/detectron2.git'

# Install Torch with GPU support
RUN . /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_TARGET_ENV} && \
python3 -m pip install --no-cache-dir \
torch==1.12.0+cu116 \
torchvision==0.13.0+cu116 \
-f https://download.pytorch.org/whl/torch_stable.html
# Final pip/conda cleanups
RUN conda clean -afy && python -m pip cache purge

# Install Detectron2
ENV FORCE_CUDA="1"
ARG TORCH_CUDA_ARCH_LIST="Pascal;Volta;Turing"
ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"
RUN . /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_TARGET_ENV} && \
python3 -m pip install --no-cache-dir --no-deps 'git+https://github.com/facebookresearch/detectron2.git'

# Clean up unnecessary files
RUN apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
conda clean -afy && \
python -m pip cache purge

# Stage 2: Build the final image
FROM nvidia/cuda:11.6.2-cudnn8-runtime-ubuntu18.04 AS osml_model

LABEL maintainer="Amazon Web Services"
# Support multi-container SageMaker endpoints
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
USER root

# Copy only the necessary files from the build environment
COPY --from=build-env /opt/conda /opt/conda

# Set environment variables
ENV CONDA_TARGET_ENV="osml_model"
ENV PATH=$PATH:/opt/conda/bin
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib/
ENV PROJ_LIB=$PROJ_LIB:/opt/conda/share/proj
ENV PYTHONUNBUFFERED=1

# Set up the conda environment
SHELL ["/opt/conda/bin/conda", "run", "--no-capture-output", "-n", "osml_model", "/bin/bash", "-c"]
RUN echo 'conda activate "${CONDA_TARGET_ENV:-base}"' >> ~/.bashrc

# Copy model source and install it
RUN mkdir /home/osml-models
COPY . /home/osml-models

# Install the application dependencies
# =============================================================================
# Application code
# =============================================================================
WORKDIR /home/osml-models
RUN chmod 777 --recursive .
RUN python3 -m pip install --no-cache-dir .
RUN mkdir -p /home/osml-models
COPY . /home/osml-models
RUN chmod -R 0777 . \
&& python3 -m pip install --no-cache-dir .

# Expose the necessary ports
# =============================================================================
# Runtime
# =============================================================================
EXPOSE 8080

# Disable health check
# Disable healthcheck (external orchestrator/SageMaker handles health)
HEALTHCHECK NONE

# Set up a user to run the container
RUN adduser --system --no-create-home --group model
RUN chown -R model:model ./
# Drop privileges for runtime
RUN adduser --system --no-create-home --group model \
&& chown -R model:model /home/osml-models \
&& mkdir -p /tmp/iopath_cache && chown model:model /tmp/iopath_cache
USER model

# Set the entry point
ENTRYPOINT python3 src/aws/osml/models/$MODEL_SELECTION/app.py
# Set iopath cache directory to avoid permission warnings
ENV IOPATH_CACHE_DIR=/tmp/iopath_cache

# Expand MODEL_SELECTION and launch the selected model's app
ENTRYPOINT /entry.sh python /home/osml-models/src/aws/osml/models/${MODEL_SELECTION}/app.py
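Taken together, the Dockerfile is now single-stage: the /entry.sh shim activates the conda env for every subsequent RUN and for the ENTRYPOINT, and the shell-form ENTRYPOINT expands MODEL_SELECTION at container start. A smoke-test sketch; the image tag is an assumption, and "aircraft" must name a directory under src/aws/osml/models:

docker build -t osml-models -f docker/Dockerfile .
docker run --rm -p 8080:8080 -e MODEL_SELECTION=aircraft osml-models
# From another shell, hit the Flask health route defined in app.py:
curl -s -o /dev/null -w "%{http_code}\n" http://localhost:8080/ping   # expect 200
# Add --gpus all to the run command to exercise the CUDA path instead of CPU.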
35 changes: 13 additions & 22 deletions setup.cfg
@@ -27,38 +27,29 @@ project_urls =
Source = https://github.com/awslabs/osml-models
Tracker = https://github.com/awslabs/osml-models/issues
classifiers =
Programming Language :: Python :: 3
Programming Language :: Python :: 3.13
Operating System :: OS Independent
License :: OSI Approved :: MIT License

[options]
zip_safe = False
package_dir=
package_dir =
=src
packages=find_namespace:
python_requires = >=3.9
packages = find_namespace:
python_requires = >=3.10
include_package_data = True

install_requires =
json-logging==1.3.0
boto3==1.34.104
setuptools==68.0.0
argparse==1.4.0
flask==2.3.3
waitress==2.1.2
shapely==2.0.1
matplotlib==3.7.2
json-logging==1.5.1
boto3==1.40.36
Flask==3.1.2
Waitress==3.0.2
Shapely==2.1.1
matplotlib==3.10.6
tabulate==0.9.0
yacs==0.1.8

[options.packages.find]
where = src
exclude =
test

[options.package_data]
package_data =
= ["py.typed"]

[options.extras_require]
gdal =
gdal>=3.7.0
test =
tox
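Because the runtime pins moved out of the conda YAML and into install_requires, a plain pip install of the package now pulls Flask/Waitress/Shapely and friends, while gdal and test stay optional extras. A hedged sketch, assuming the native GDAL libraries are already present for the gdal extra:

python -m pip install -e ".[gdal,test]"   # package plus both optional extras
python -m pip check                       # confirm the pinned set resolves cleanly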
35 changes: 30 additions & 5 deletions src/aws/osml/models/aircraft/app.py
@@ -1,4 +1,4 @@
# Copyright 2023-2024 Amazon.com, Inc. or its affiliates.
# Copyright 2023-2025 Amazon.com, Inc. or its affiliates.

import json
import os
@@ -30,6 +30,9 @@
# Create our default flask app
app = build_flask_app(logger)

# Log application startup
app.logger.info("Starting aircraft model application...")


def build_predictor() -> DefaultPredictor:
"""
@@ -41,7 +44,7 @@ def build_predictor() -> DefaultPredictor:
# If we can't find a gpu
if not torch.cuda.is_available():
cfg.MODEL.DEVICE = "cpu"
app.logger.warning("GPU not found, running in CPU mode!")
app.logger.info("GPU not found, running in CPU mode!")
# Set to only expect one class (aircraft)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1
# Set the detection threshold to 90%
@@ -50,11 +53,32 @@ def build_predictor() -> DefaultPredictor:
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
# Path to the model weights
cfg.MODEL.WEIGHTS = os.getenv(
os.path.join("MODEL_WEIGHTS"), os.path.join("/home/osml-models/assets/", "model_weights.pth")
os.path.join("MODEL_WEIGHTS"), os.path.join("/home/osml-models/assets/", "aircraft_model_weights.pth")
)

# Build the detectron2 default predictor
return DefaultPredictor(cfg)
# Build the detectron2 default predictor with error handling for CPU mode
try:
# Suppress checkpoint loading warnings for expected shape mismatches
import logging

checkpoint_logger = logging.getLogger("fvcore.common.checkpoint")
original_level = checkpoint_logger.level
checkpoint_logger.setLevel(logging.ERROR)

predictor = DefaultPredictor(cfg)

# Restore original logging level
checkpoint_logger.setLevel(original_level)

return predictor
except RuntimeError as e:
if "NVIDIA driver" in str(e) or "CUDA" in str(e):
app.logger.warning(f"CUDA error detected, forcing CPU mode: {e}")
# Force CPU mode and try again
cfg.MODEL.DEVICE = "cpu"
return DefaultPredictor(cfg)
else:
raise e


def mask_to_polygon(mask: torch.Tensor) -> List[List[float]]:
@@ -202,6 +226,7 @@ def request_to_instances(req: Request) -> Union[Instances, None]:

# Build our aircraft predictor
aircraft_predictor = build_predictor()
app.logger.info("Aircraft model predictor initialized successfully!")


@app.route("/ping", methods=["GET"])
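The net effect of the app.py changes: fvcore's checkpoint-loader warnings about expected shape mismatches are muted while DefaultPredictor is constructed, and a CUDA/NVIDIA-driver RuntimeError now flips the config to CPU and retries instead of crashing the container. One hedged way to exercise the CPU path deliberately, reusing the osml-models tag from the Docker sketch above (CUDA_VISIBLE_DEVICES="" makes torch.cuda.is_available() return False):

docker run --rm -p 8080:8080 \
  -e MODEL_SELECTION=aircraft \
  -e CUDA_VISIBLE_DEVICES="" \
  osml-models
# The startup log should show "GPU not found, running in CPU mode!" and
# /ping should still return 200 once the predictor finishes loading.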