5 changes: 4 additions & 1 deletion .gitattributes
@@ -1,2 +1,5 @@
assets/model_weights.pth filter=lfs diff=lfs merge=lfs -text
assets/aircraft_model_weights.pth filter=lfs diff=lfs merge=lfs -text
assets/ship_model_weights.pth filter=lfs diff=lfs merge=lfs -text
assets/images/2_planes.tiff filter=lfs diff=lfs merge=lfs -text
assets/*.pth filter=lfs diff=lfs merge=lfs -text
assets/images/*.tiff filter=lfs diff=lfs merge=lfs -text
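Since the per-file rules above collapse into two globs, it is worth confirming that the renamed weight files still hit the LFS filter. A minimal sketch using stock git, with paths taken from this diff:

# Run from the repo root; every attribute should resolve to "lfs".
git check-attr filter diff merge -- \
  assets/aircraft_model_weights.pth \
  assets/ship_model_weights.pth \
  assets/images/2_planes.tiff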
File renamed without changes.
3 changes: 3 additions & 0 deletions assets/ship_model_weights.pth
Git LFS file not shown
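The ship weights are committed as a Git LFS pointer, which is why the blob cannot be rendered here. A hedged sketch for materializing the binaries after cloning, assuming git-lfs is installed locally:

git lfs install                        # one-time smudge/clean hook setup
git lfs pull --include "assets/*.pth"  # fetch just the model weights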
12 changes: 2 additions & 10 deletions conda/environment-py310.yml
@@ -1,4 +1,4 @@
# Copyright 2023-2024 Amazon.com, Inc. or its affiliates.
# Copyright 2023-2025 Amazon.com, Inc. or its affiliates.

name: osml_models
channels:
@@ -7,12 +7,4 @@ dependencies:
- conda-forge::python=3.10.12
- conda-forge::gdal=3.7.2
- conda-forge::proj=9.3.0
- pip:
- json-logging==1.3.0
- boto3==1.34.104
- setuptools==68.0.0
- argparse==1.4.0
- flask==2.3.3
- waitress==2.1.2
- shapely==2.0.1
- matplotlib==3.7.2
- conda-forge::numpy=1.26.4
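With the pip block removed (those packages now live in setup.cfg's install_requires), the YAML only pins the interpreter and the native GDAL/PROJ/numpy stack. A sketch of rebuilding the environment outside Docker, mirroring the conda tos and conda env create calls the Dockerfile makes below; the env name osml_models comes from the YAML's name: field, and a conda new enough to have the tos subcommand is assumed:

conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
conda env create -f conda/environment-py310.yml   # creates "osml_models"
conda activate osml_models
python -c "from osgeo import gdal; print(gdal.__version__)"   # expect 3.7.2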
200 changes: 112 additions & 88 deletions docker/Dockerfile
@@ -1,62 +1,118 @@
# Copyright 2023-2025 Amazon.com, Inc. or its affiliates.

# Use NVIDIA's CUDA base image
FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu18.04 AS build-env
# =============================================================================
# Base image: Ubuntu 18.04 + CUDA 11.6.2 (devel)
# =============================================================================
FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu18.04 as osml_model

# Set AWS to the maintainer
# Set maintainer label
LABEL maintainer="Amazon Web Services"

# Enable sudo access for the build session
# Advertise SageMaker multi-container capability
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true

# Use root for setup
USER root

# Update and install core build dependencies
# =============================================================================
# Install core build dependencies (incl. TIFF); clean apt lists in same layer
# =============================================================================
RUN apt-get update -y \
&& apt-get upgrade -y \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --fix-missing --no-install-recommends \
&& DEBIAN_FRONTEND=noninteractive \
apt-get install -y --fix-missing --no-install-recommends \
software-properties-common build-essential ca-certificates \
git make cmake wget unzip libtool automake \
zlib1g-dev libsqlite3-dev pkg-config sqlite3 libcurl4-gnutls-dev \
libtiff5-dev
libtiff5-dev \
&& rm -rf /var/lib/apt/lists/*

# Install Miniconda
# =============================================================================
# Miniconda
# =============================================================================
ARG MINICONDA_VERSION=Miniconda3-latest-Linux-x86_64
ARG MINICONDA_URL=https://repo.anaconda.com/miniconda/${MINICONDA_VERSION}.sh
ENV CONDA_TARGET_ENV=osml_model
RUN wget -c ${MINICONDA_URL} \
&& chmod +x ${MINICONDA_VERSION}.sh \
&& ./${MINICONDA_VERSION}.sh -b -f -p /opt/conda \
&& rm ${MINICONDA_VERSION}.sh \
&& ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh

# Set our new conda target lib dirs
ENV PATH=$PATH:/opt/conda/bin
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib/
ENV PROJ_LIB=$PROJ_LIB:/opt/conda/share/proj
ENV CONDA_DIR=/opt/conda

# Copy the conda environment file and create the environment
COPY conda/environment-py310.yml environment.yml

# Accept Conda TOS before creating the environment
RUN wget -c ${MINICONDA_URL} \
&& chmod +x ${MINICONDA_VERSION}.sh \
&& ./${MINICONDA_VERSION}.sh -b -f -p ${CONDA_DIR} \
&& rm ${MINICONDA_VERSION}.sh \
&& ln -s ${CONDA_DIR}/etc/profile.d/conda.sh /etc/profile.d/conda.sh

# Configure environment variables used by common geospatial stacks
ENV CONDA_TARGET_ENV=osml_models
ENV PATH=/opt/conda/envs/${CONDA_TARGET_ENV}/bin:/opt/conda/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/conda/envs/${CONDA_TARGET_ENV}/lib:/opt/conda/envs/${CONDA_TARGET_ENV}/lib/gdal:${LD_LIBRARY_PATH}
ENV PROJ_LIB=/opt/conda/share/proj:$PROJ_LIB

# =============================================================================
# Conda environment (py310 + GDAL/PROJ + Detectron2 deps)
# =============================================================================
COPY conda/environment-py310.yml /tmp/environment.yml

# Create env and minimize image size
RUN conda config --set always_yes true && \
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main && \
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r

RUN conda env create -n ${CONDA_TARGET_ENV} --file environment.yml && \
conda clean -afy && \
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r && \
conda env create -f /tmp/environment.yml && \
find /opt/conda/ -follow -type f -name '*.a' -delete && \
find /opt/conda/ -follow -type f -name '*.pyc' -delete && \
find /opt/conda/ -follow -type f -name '*.js.map' -delete && \
rm -rf /opt/conda/pkgs
rm -rf /opt/conda/pkgs && \
conda clean -afy

# =============================================================================
# Entry shim
# - Ensure conda env is active for RUN/CMD/ENTRYPOINT
# =============================================================================
RUN cat >/entry.sh <<'BASH'
#!/usr/bin/env bash
set -eo pipefail

# Activate conda env if available
if [ -f /opt/conda/etc/profile.d/conda.sh ]; then
. /opt/conda/etc/profile.d/conda.sh
conda activate "${CONDA_TARGET_ENV:-base}" >/dev/null 2>&1 || true
fi

# If a command was passed, exec it; otherwise start bash
if [ "$#" -gt 0 ]; then
exec "$@"
else
exec /bin/bash
fi
BASH
RUN chmod +x /entry.sh

# Make subsequent RUN use the activated env
SHELL ["/entry.sh", "/bin/bash", "-c"]


# Configure .bashrc to drop into a conda env and immediately activate our TARGET env
# Note this makes python3 default to our conda managed python version
RUN conda init && echo 'conda activate "${CONDA_TARGET_ENV:-base}"' >> ~/.bashrc

# =============================================================================
# PyTorch 1.12.0 (CUDA 11.6 / cu116 wheels)
# =============================================================================
RUN python3 -m pip install --no-cache-dir \
torch==1.12.0+cu116 \
torchvision==0.13.0+cu116 \
-f https://download.pytorch.org/whl/torch_stable.html

# Activate the conda environment and install Python dependencies
RUN . /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_TARGET_ENV} && \
python3 -m pip install --no-cache-dir \
# =============================================================================
# Detectron2 (built against the target Torch/CUDA for SageMaker endpoints)
# - Set arch list for common AWS GPUs
# =============================================================================
ENV FORCE_CUDA=1
ARG TORCH_CUDA_ARCH_LIST="Pascal;Volta;Turing"
RUN python3 -m pip install --no-cache-dir \
"fvcore>=0.1.5,<0.1.6" \
iopath==0.1.8 \
pycocotools \
omegaconf==2.1.1 \
hydra-core==1.1.1 \
black==21.4b2 \
termcolor==1.1.0 \
matplotlib==3.5.2 \
yacs==0.1.8 \
@@ -65,69 +121,37 @@ RUN . /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_TARGET_ENV} &&
tqdm==4.62.3 \
tensorboard==2.8.0 \
opencv-contrib-python-headless==4.8.0.76 \
setuptools==69.5.1
setuptools==69.5.1 \
'git+https://github.com/facebookresearch/detectron2.git'

# Install Torch with GPU support
RUN . /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_TARGET_ENV} && \
python3 -m pip install --no-cache-dir \
torch==1.12.0+cu116 \
torchvision==0.13.0+cu116 \
-f https://download.pytorch.org/whl/torch_stable.html
# Final pip/conda cleanups
RUN conda clean -afy && python -m pip cache purge

# Install Detectron2
ENV FORCE_CUDA="1"
ARG TORCH_CUDA_ARCH_LIST="Pascal;Volta;Turing"
ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"
RUN . /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_TARGET_ENV} && \
python3 -m pip install --no-cache-dir --no-deps 'git+https://github.com/facebookresearch/detectron2.git'

# Clean up unnecessary files
RUN apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
conda clean -afy && \
python -m pip cache purge

# Stage 2: Build the final image
FROM nvidia/cuda:11.6.2-cudnn8-runtime-ubuntu18.04 AS osml_model

LABEL maintainer="Amazon Web Services"
# Support multi-container SageMaker endpoints
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
USER root

# Copy only the necessary files from the build environment
COPY --from=build-env /opt/conda /opt/conda

# Set environment variables
ENV CONDA_TARGET_ENV="osml_model"
ENV PATH=$PATH:/opt/conda/bin
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib/
ENV PROJ_LIB=$PROJ_LIB:/opt/conda/share/proj
ENV PYTHONUNBUFFERED=1

# Set up the conda environment
SHELL ["/opt/conda/bin/conda", "run", "--no-capture-output", "-n", "osml_model", "/bin/bash", "-c"]
RUN echo 'conda activate "${CONDA_TARGET_ENV:-base}"' >> ~/.bashrc

# Copy model source and install it
RUN mkdir /home/osml-models
COPY . /home/osml-models

# Install the application dependencies
# =============================================================================
# Application code
# =============================================================================
WORKDIR /home/osml-models
RUN chmod 777 --recursive .
RUN python3 -m pip install --no-cache-dir .
RUN mkdir -p /home/osml-models
COPY . /home/osml-models
RUN chmod -R 0777 . \
&& python3 -m pip install --no-cache-dir .

# Expose the necessary ports
# =============================================================================
# Runtime
# =============================================================================
EXPOSE 8080

# Disable health check
# Disable healthcheck (external orchestrator/SageMaker handles health)
HEALTHCHECK NONE

# Set up a user to run the container
RUN adduser --system --no-create-home --group model
RUN chown -R model:model ./
# Drop privileges for runtime
RUN adduser --system --no-create-home --group model \
&& chown -R model:model /home/osml-models \
&& mkdir -p /tmp/iopath_cache && chown model:model /tmp/iopath_cache
USER model

# Set the entry point
ENTRYPOINT python3 src/aws/osml/models/$MODEL_SELECTION/app.py
# Set iopath cache directory to avoid permission warnings
ENV IOPATH_CACHE_DIR=/tmp/iopath_cache

# Expand MODEL_SELECTION and launch the selected model's app
ENTRYPOINT /entry.sh python /home/osml-models/src/aws/osml/models/${MODEL_SELECTION}/app.py
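Taken together, the Dockerfile is now single-stage: the /entry.sh shim activates the conda env for every subsequent RUN and for the ENTRYPOINT, and the shell-form ENTRYPOINT expands MODEL_SELECTION at container start. A smoke-test sketch; the image tag is an assumption, and "aircraft" must name a directory under src/aws/osml/models:

docker build -t osml-models -f docker/Dockerfile .
docker run --rm -p 8080:8080 -e MODEL_SELECTION=aircraft osml-models
# From another shell, hit the Flask health route defined in app.py:
curl -s -o /dev/null -w "%{http_code}\n" http://localhost:8080/ping   # expect 200
# Add --gpus all to the run command to exercise the CUDA path instead of CPU.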
35 changes: 13 additions & 22 deletions setup.cfg
@@ -27,38 +27,29 @@ project_urls =
Source = https://github.com/awslabs/osml-models
Tracker = https://github.com/awslabs/osml-models/issues
classifiers =
Programming Language :: Python :: 3
Programming Language :: Python :: 3.13
Operating System :: OS Independent
License :: OSI Approved :: MIT License

[options]
zip_safe = False
package_dir=
package_dir =
=src
packages=find_namespace:
python_requires = >=3.9
packages = find_namespace:
python_requires = >=3.10
include_package_data = True

install_requires =
json-logging==1.3.0
boto3==1.34.104
setuptools==68.0.0
argparse==1.4.0
flask==2.3.3
waitress==2.1.2
shapely==2.0.1
matplotlib==3.7.2
json-logging==1.5.1
boto3==1.40.36
Flask==3.1.2
Waitress==3.0.2
Shapely==2.1.1
matplotlib==3.10.6
tabulate==0.9.0
yacs==0.1.8

[options.packages.find]
where = src
exclude =
test

[options.package_data]
package_data =
= ["py.typed"]

[options.extras_require]
gdal =
gdal>=3.7.0
test =
tox
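Because the runtime pins moved out of the conda YAML and into install_requires, a plain pip install of the package now pulls Flask/Waitress/Shapely and friends, while gdal and test stay optional extras. A hedged sketch, assuming the native GDAL libraries are already present for the gdal extra:

python -m pip install -e ".[gdal,test]"   # package plus both optional extras
python -m pip check                       # confirm the pinned set resolves cleanly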
35 changes: 30 additions & 5 deletions src/aws/osml/models/aircraft/app.py
@@ -1,4 +1,4 @@
# Copyright 2023-2024 Amazon.com, Inc. or its affiliates.
# Copyright 2023-2025 Amazon.com, Inc. or its affiliates.

import json
import os
@@ -30,6 +30,9 @@
# Create our default flask app
app = build_flask_app(logger)

# Log application startup
app.logger.info("Starting aircraft model application...")


def build_predictor() -> DefaultPredictor:
"""
@@ -41,7 +44,7 @@ def build_predictor() -> DefaultPredictor:
# If we can't find a gpu
if not torch.cuda.is_available():
cfg.MODEL.DEVICE = "cpu"
app.logger.warning("GPU not found, running in CPU mode!")
app.logger.info("GPU not found, running in CPU mode!")
# Set to only expect one class (aircraft)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1
# Set the detection threshold to 90%
@@ -50,11 +53,32 @@ def build_predictor() -> DefaultPredictor:
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
# Path to the model weights
cfg.MODEL.WEIGHTS = os.getenv(
os.path.join("MODEL_WEIGHTS"), os.path.join("/home/osml-models/assets/", "model_weights.pth")
os.path.join("MODEL_WEIGHTS"), os.path.join("/home/osml-models/assets/", "aircraft_model_weights.pth")
)

# Build the detectron2 default predictor
return DefaultPredictor(cfg)
# Build the detectron2 default predictor with error handling for CPU mode
try:
# Suppress checkpoint loading warnings for expected shape mismatches
import logging

checkpoint_logger = logging.getLogger("fvcore.common.checkpoint")
original_level = checkpoint_logger.level
checkpoint_logger.setLevel(logging.ERROR)

predictor = DefaultPredictor(cfg)

# Restore original logging level
checkpoint_logger.setLevel(original_level)

return predictor
except RuntimeError as e:
if "NVIDIA driver" in str(e) or "CUDA" in str(e):
app.logger.warning(f"CUDA error detected, forcing CPU mode: {e}")
# Force CPU mode and try again
cfg.MODEL.DEVICE = "cpu"
return DefaultPredictor(cfg)
else:
raise e


def mask_to_polygon(mask: torch.Tensor) -> List[List[float]]:
@@ -202,6 +226,7 @@ def request_to_instances(req: Request) -> Union[Instances, None]:

# Build our aircraft predictor
aircraft_predictor = build_predictor()
app.logger.info("Aircraft model predictor initialized successfully!")


@app.route("/ping", methods=["GET"])
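The net effect of the app.py changes: fvcore's checkpoint-loader warnings about expected shape mismatches are muted while DefaultPredictor is constructed, and a CUDA/NVIDIA-driver RuntimeError now flips the config to CPU and retries instead of crashing the container. One hedged way to exercise the CPU path deliberately, reusing the osml-models tag from the Docker sketch above (CUDA_VISIBLE_DEVICES="" makes torch.cuda.is_available() return False):

docker run --rm -p 8080:8080 \
  -e MODEL_SELECTION=aircraft \
  -e CUDA_VISIBLE_DEVICES="" \
  osml-models
# The startup log should show "GPU not found, running in CPU mode!" and
# /ping should still return 200 once the predictor finishes loading.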