OLMO + RL #424

Closed
wants to merge 62 commits into from
Commits (62)
65aaaec
init
natolambert Sep 5, 2024
7da49a9
up
natolambert Sep 5, 2024
d929a89
branch dockerfile
natolambert Sep 9, 2024
6ef706c
update
natolambert Sep 9, 2024
a4797fb
debugging and minor fixes
natolambert Sep 9, 2024
56d6d86
nit and style
natolambert Sep 9, 2024
50500ea
fixes
natolambert Sep 9, 2024
5eb61cc
add weka mounting
natolambert Sep 9, 2024
b134410
up
natolambert Sep 10, 2024
d007da0
add hardcode flash_attn
natolambert Sep 10, 2024
5d82ea2
tweaks
natolambert Sep 10, 2024
11397bf
making it work nicely
natolambert Sep 10, 2024
ee5c7d2
clean
natolambert Sep 11, 2024
fd16aee
clean
natolambert Sep 11, 2024
fa447f8
clean
natolambert Sep 11, 2024
e02e985
up
natolambert Sep 11, 2024
a0a32bf
no longer install from branch
natolambert Sep 11, 2024
eed8e4f
Merge branch 'main' into olmo_again
natolambert Sep 16, 2024
503e61e
fixes
natolambert Sep 16, 2024
ec745b0
Merge branch 'main' of https://github.com/allenai/open-instruct into …
natolambert Sep 21, 2024
9827264
dpo config
natolambert Sep 24, 2024
b175717
temp olmo changes
natolambert Sep 24, 2024
b444e80
first pass
hamishivi Oct 1, 2024
f0569a3
fix spelling, ground truth stuff
Oct 2, 2024
8e0f517
fix misspelling
Oct 2, 2024
6eebf7f
count verifieds and intermediate saving
Oct 4, 2024
f9a0b3c
save intermediate steps
Oct 7, 2024
bad1933
small fix to logging
Oct 8, 2024
028315d
fix bug for forward rollout batching
Oct 9, 2024
faa7dc0
support gsm8k and math, more flexibility in future
Oct 12, 2024
5970243
add costas plo thing
Oct 16, 2024
f8fb8eb
add numina math
Oct 21, 2024
eda4849
remove plo, add value model rand init, first stab at rephrase model l…
Oct 23, 2024
b709f37
math strict verify
Oct 23, 2024
a328946
Merge branch 'main' into verifiable-rewards
Oct 23, 2024
51f0b2a
ifeval code
Oct 24, 2024
79ec960
ifeval debug
Oct 24, 2024
b1b47bf
incorporate val fixes
Oct 24, 2024
d61038e
data fixed, remove skips
Oct 24, 2024
77f619d
Prototype ppo + ray (#390)
vwxyzjn Oct 29, 2024
b9de634
Merge branch 'main' into verifiable-rewards
hamishivi Oct 29, 2024
f6a2b75
add weka save override
hamishivi Oct 30, 2024
b59659a
add multinode ray file
hamishivi Oct 30, 2024
36a2ed4
lint and fix
hamishivi Oct 31, 2024
527c51f
first stab at flan
hamishivi Nov 1, 2024
a5ed1c1
Merge branch 'main' into olmo_again
nouhadziri Nov 4, 2024
281e28e
add olmo training
nouhadziri Nov 4, 2024
e8ddd56
fix dir in config
nouhadziri Nov 4, 2024
abd25bf
rollback my changes
nouhadziri Nov 4, 2024
f037460
eval on intermediate checkpoints (#414)
vwxyzjn Nov 5, 2024
ed18615
Merge branch 'main' into verifiable-rewards
hamishivi Nov 5, 2024
63a4449
quick change
vwxyzjn Nov 5, 2024
3cfc9e2
Merge branch 'olmo_again' into rlolmo
vwxyzjn Nov 8, 2024
ebdf456
update OLMo code
vwxyzjn Nov 8, 2024
3422229
push changes
vwxyzjn Nov 11, 2024
7905e63
push changes
vwxyzjn Nov 15, 2024
918b701
Merge branch 'main' into rlolmo
vwxyzjn Nov 15, 2024
4d5c77a
Merge branch 'main' into rlolmo
vwxyzjn Nov 23, 2024
e857e36
quick change
vwxyzjn Nov 23, 2024
0bebd50
push
vwxyzjn Nov 26, 2024
b452f92
push changes
vwxyzjn Jan 8, 2025
f732a73
push changes
vwxyzjn Jan 8, 2025
81 changes: 81 additions & 0 deletions .github/workflows/push-image-olmo.yml
@@ -0,0 +1,81 @@
# This is an example workflow file.
#
# When you add a new image, copy this file and then change all mentions of "hello-world" with
# the name of your new image.
#
# Read through the rest of the comments in this file to figure out how it works, and what else
# you need to change.
name: build_open_instruct_olmo

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

on:
  push:
    # Run this workflow anytime a push updates one of the files in the image's directory
    # (other than the README), and anytime there's a new release tag for this image.
    paths:
      - 'open_instruct/**'
      - '!open_instruct/README.md'
      - 'requirements-olmo.txt'
      - 'Dockerfile.olmo'
      - '.github/workflows/push-image-olmo.yml'
      # Note, add .olmo dockerfile + requirements if adding auto build to those
    branches: [main]
  # pull_request: # note, comment this out for running on every push
  #   # Also run on PRs that update the files in the image's directory (other than README).
  #   branches: [main]
  #   paths:
  #     - 'open_instruct/**'
  #     - '!open_instruct/README.md'
  #     - 'requirements-olmo.txt'
  #     - 'Dockerfile.olmo'
  workflow_dispatch: # This allows us to manually trigger a build through the GitHub UI.

env:
  DOCKER_BUILDKIT: "1"

jobs:
  build:
    name: open_instruct
    runs-on: ubuntu-latest
    timeout-minutes: 60
    if: (github.event_name != 'workflow_run') || (github.event.workflow_run.conclusion == 'success')
    steps:
      - uses: actions/checkout@v3
        with:
          repository: allenai/oe-eval-internal
          path: './oe-eval-internal'
          ssh-key: ${{ secrets.OE_EVAL_GIT_CLONE_ACCESS_PRIVATE_SSH_DEPLOY_KEY }}

      - name: Setup environment
        uses: ./.github/actions/setup
        with:
          beaker_token: ${{ secrets.BEAKER_TOKEN }}
          # ghcr_token: ${{ secrets.GHCR_TOKEN }}
          # ghcr_user: ${{ secrets.GHCR_USER }}

      # big images fail, trying this
      - name: Delete huge unnecessary tools folder
        run: rm -rf /opt/hostedtoolcache /usr/share/dotnet "$AGENT_TOOLSDIRECTORY"

      - name: Build image
        run: |
          docker build \
            --build-arg BUILDKIT_INLINE_CACHE=1 \
            --build-arg CUDA=12.1.0 --build-arg \
            TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 \
            -f Dockerfile.olmo . \
            -t open_instruct_olmo

      - name: Check image
        run: |
          docker run --rm open_instruct_olmo

      - name: Push image
        # if: github.event_name != 'pull_request'
        uses: ./.github/actions/push
        with:
          image: open_instruct_olmo # this is the tag of the image we just built in the previous step
          beaker: open_instruct_olmo_auto # this is the name of the image on Beaker
          latest: true # this flag says we should also push this as the 'latest' version to GHCR
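The "Build image" step above can be reproduced locally. A minimal sketch, assuming Docker is installed and `Dockerfile.olmo` is at the repo root; the `build_cmd` helper is hypothetical (not part of the PR) and only assembles the command string, so nothing is built until you `eval` it:

```shell
# Hypothetical helper mirroring the workflow's "Build image" step.
# Build args (CUDA, TARGET, DIST) and the image tag are taken from the
# workflow above; the function only assembles the command string.
build_cmd() {
  local cuda="$1" target="$2" dist="$3" dockerfile="$4" tag="$5"
  printf 'docker build --build-arg BUILDKIT_INLINE_CACHE=1 --build-arg CUDA=%s --build-arg TARGET=%s --build-arg DIST=%s -f %s . -t %s' \
    "$cuda" "$target" "$dist" "$dockerfile" "$tag"
}

# Print (or `eval`) the command the CI job runs:
build_cmd 12.1.0 cudnn8-devel ubuntu20.04 Dockerfile.olmo open_instruct_olmo
```

Keeping the args in one place like this makes it easy to stay in sync with the `CUDA`/`TARGET`/`DIST` values the workflow passes to the Dockerfile.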
3 changes: 0 additions & 3 deletions .github/workflows/push-image.yml
@@ -44,8 +44,6 @@ jobs:
timeout-minutes: 60
if: (github.event_name != 'workflow_run') || (github.event.workflow_run.conclusion == 'success')
steps:
- uses: actions/checkout@v3

- uses: actions/checkout@v3
with:
repository: allenai/oe-eval-internal
@@ -69,7 +67,6 @@ jobs:
--build-arg BUILDKIT_INLINE_CACHE=1 \
--build-arg CUDA=12.1.0 --build-arg \
TARGET=cudnn8-devel --build-arg DIST=ubuntu20.04 \
--build-arg REQUIRE=requirements.txt . \
-t open_instruct


83 changes: 2 additions & 81 deletions Dockerfile
@@ -1,84 +1,4 @@
ARG CUDA
ARG DIST
ARG TARGET
FROM --platform=linux/amd64 nvidia/cuda:${CUDA}-${TARGET}-${DIST}

ARG DEBIAN_FRONTEND="noninteractive"
ENV TZ="America/Los_Angeles"

# Install base tools.
RUN apt-get update && apt-get install -y \
build-essential \
curl \
git \
jq \
language-pack-en \
make \
sudo \
unzip \
vim \
wget \
parallel \
iputils-ping \
tmux

ARG BEAKER_VERSION
RUN curl --silent \
--connect-timeout 5 \
--max-time 10 \
--retry 5 \
--retry-delay 0 \
--retry-max-time 40 \
--output beaker.tar.gz \
"https://beaker.org/api/v3/release/cli?os=linux&arch=amd64&version=${BEAKER_VERSION}" \
&& tar -zxf beaker.tar.gz -C /usr/local/bin/ ./beaker \
&& rm beaker.tar.gz

# This ensures the dynamic linker (or NVIDIA's container runtime, I'm not sure)
# puts the right NVIDIA things in the right place (that THOR requires).
ENV NVIDIA_DRIVER_CAPABILITIES=graphics,utility,compute

# Install conda. We give anyone in the users group the ability to run
# conda commands and install packages in the base (default) environment.
# Things installed into the default environment won't persist, but we prefer
# convenience in this case and try to make sure the user is aware of this
# with a message that's printed when the session starts.
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86_64.sh \
&& echo "32d73e1bc33fda089d7cd9ef4c1be542616bd8e437d1f77afeeaf7afdb019787 Miniconda3-py310_23.1.0-1-Linux-x86_64.sh" \
| sha256sum --check \
&& bash Miniconda3-py310_23.1.0-1-Linux-x86_64.sh -b -p /opt/miniconda3 \
&& rm Miniconda3-py310_23.1.0-1-Linux-x86_64.sh

ENV PATH=/opt/miniconda3/bin:/opt/miniconda3/condabin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# Install a few additional utilities via pip
RUN /opt/miniconda3/bin/pip install --no-cache-dir \
gpustat \
jupyter \
beaker-gantry \
oocmap

# Ensure users can modify their container environment.
RUN echo '%users ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

# Make the base image friendlier for interactive workloads. This makes things like the man command
# work.
RUN yes | unminimize

# Install MLNX OFED user-space drivers
# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile
ENV MOFED_VER 5.8-1.1.2.1
ENV OS_VER ubuntu20.04
ENV PLATFORM x86_64
RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \
rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \
rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz

# The -l flag makes bash act as a login shell and load /etc/profile, etc.
ENTRYPOINT ["bash", "-l"]
FROM ghcr.io/allenai/cuda:12.1-cudnn8-dev-ubuntu20.04-v1.2.116

WORKDIR /stage/

@@ -106,6 +26,7 @@ COPY configs configs
COPY scripts scripts
COPY mason.py mason.py
RUN chmod +x scripts/*
RUN pip cache purge

# for interactive session
RUN chmod -R 777 /stage/
121 changes: 121 additions & 0 deletions Dockerfile.olmo
@@ -0,0 +1,121 @@
ARG CUDA
ARG DIST
ARG TARGET
FROM --platform=linux/amd64 nvidia/cuda:${CUDA}-${TARGET}-${DIST}

ARG DEBIAN_FRONTEND="noninteractive"
ENV TZ="America/Los_Angeles"

# Install base tools.
RUN apt-get update && apt-get install -y \
build-essential \
curl \
git \
jq \
language-pack-en \
make \
sudo \
unzip \
vim \
wget \
parallel \
iputils-ping \
tmux

ARG BEAKER_VERSION
RUN curl --silent \
--connect-timeout 5 \
--max-time 10 \
--retry 5 \
--retry-delay 0 \
--retry-max-time 40 \
--output beaker.tar.gz \
"https://beaker.org/api/v3/release/cli?os=linux&arch=amd64&version=${BEAKER_VERSION}" \
&& tar -zxf beaker.tar.gz -C /usr/local/bin/ ./beaker \
&& rm beaker.tar.gz

# This ensures the dynamic linker (or NVIDIA's container runtime, I'm not sure)
# puts the right NVIDIA things in the right place (that THOR requires).
ENV NVIDIA_DRIVER_CAPABILITIES=graphics,utility,compute

# Install conda. We give anyone in the users group the ability to run
# conda commands and install packages in the base (default) environment.
# Things installed into the default environment won't persist, but we prefer
# convenience in this case and try to make sure the user is aware of this
# with a message that's printed when the session starts.
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86_64.sh \
&& echo "32d73e1bc33fda089d7cd9ef4c1be542616bd8e437d1f77afeeaf7afdb019787 Miniconda3-py310_23.1.0-1-Linux-x86_64.sh" \
| sha256sum --check \
&& bash Miniconda3-py310_23.1.0-1-Linux-x86_64.sh -b -p /opt/miniconda3 \
&& rm Miniconda3-py310_23.1.0-1-Linux-x86_64.sh

ENV PATH=/opt/miniconda3/bin:/opt/miniconda3/condabin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# Install a few additional utilities via pip
RUN /opt/miniconda3/bin/pip install --no-cache-dir \
gpustat \
jupyter \
beaker-gantry \
oocmap

# Ensure users can modify their container environment.
RUN echo '%users ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

# Make the base image friendlier for interactive workloads. This makes things like the man command
# work.
RUN yes | unminimize

# Install MLNX OFED user-space drivers
# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile
ENV MOFED_VER 5.8-1.1.2.1
ENV OS_VER ubuntu20.04
ENV PLATFORM x86_64
RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \
rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \
rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz

# The -l flag makes bash act as a login shell and load /etc/profile, etc.
ENTRYPOINT ["bash", "-l"]

WORKDIR /stage/

# TODO When updating flash-attn or torch in the future, make sure to update the version in the requirements.txt file.
ENV HF_HUB_ENABLE_HF_TRANSFER=1
COPY requirements-olmo.txt .
RUN pip install --upgrade pip "setuptools<70.0.0" wheel
# TODO, unpin setuptools when this issue in flash attention is resolved
RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu121
RUN pip install packaging
RUN pip install flash-attn==2.5.9.post1 --no-build-isolation
# for newest olmo's, move to requirements when ai2-olmo supports torch 2.4
# core is a dependency of ai2-olmo
RUN pip install ai2-olmo-core==0.1.0 omegaconf
# RUN pip install ai2-olmo>=0.5.0 --no-deps
# TODO Update Once this is merged https://github.com/allenai/OLMo/pull/719, then next release
RUN pip install git+https://github.com/allenai/OLMo.git@47f8f5abb40eb100c6623be12e1648c841b2ab99 --no-deps
RUN pip install -r requirements-olmo.txt

RUN pip install git+https://github.com/AkshitaB/vllm.git
RUN pip install vllm-flash-attn


# NLTK download
RUN python -m nltk.downloader punkt
COPY open_instruct open_instruct
COPY oe-eval-internal oe-eval-internal

# install the package in editable mode
COPY pyproject.toml .
RUN pip install -e .
COPY .git/ ./.git/
COPY eval eval
COPY configs configs
COPY scripts scripts
COPY mason.py mason.py
RUN chmod +x scripts/*

# for interactive session
RUN chmod -R 777 /stage/
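The Dockerfile above pins specific versions (torch 2.4.0 against cu121 wheels, flash-attn 2.5.9.post1, ai2-olmo-core 0.1.0) across several `RUN pip install` lines. A small hedged sketch of helpers for extracting and checking such `pkg==version` pins in maintenance scripts; these functions are hypothetical and not part of the PR:

```shell
# Hypothetical helpers for working with "pkg==version" pins like the ones in
# the RUN lines above (illustrative only; not part of the repo).
pin_name() { printf '%s\n' "${1%%==*}"; }      # everything before '=='
pin_version() { printf '%s\n' "${1##*==}"; }   # everything after '=='

# Verify a spec matches the version a Dockerfile expects.
check_pin() {
  local spec="$1" want="$2"
  [ "$(pin_version "$spec")" = "$want" ]
}
```

A check like `check_pin torch==2.4.0 2.4.0` could guard against the Dockerfile and `requirements-olmo.txt` drifting apart, which the TODO comments above flag as a risk.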
21 changes: 21 additions & 0 deletions configs/beaker_configs/beaker_configs/ray_node_setup.sh
@@ -0,0 +1,21 @@
export CURRENT_DATETIME=$(python -c "import datetime; import pytz; print(datetime.datetime.now(pytz.timezone('America/Los_Angeles')).strftime('%m%d%y_%H%M%S'))")
export PYTHONPATH=$REPO_PATH
export PATH="/root/.local/bin:$PATH"


echo CURRENT_DATETIME=$CURRENT_DATETIME
echo PYTHONPATH=$PYTHONPATH
echo PATH=$PATH

# python3 -c "import os, ray; print(os.path.dirname(ray.__file__))"

RAY_NODE_PORT=8888
ray stop --force

if [ "$BEAKER_REPLICA_RANK" == "0" ]; then
    echo "Starting Ray head node"
    ray start --head --port=$RAY_NODE_PORT
else
    echo "Starting Ray worker node $BEAKER_REPLICA_RANK"
    ray start --address="${BEAKER_LEADER_REPLICA_HOSTNAME}:${RAY_NODE_PORT}" --block
fi
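The rank check in the script above can be factored into a function for clarity. A minimal sketch, assuming the Beaker-provided `BEAKER_REPLICA_RANK` and `BEAKER_LEADER_REPLICA_HOSTNAME` variables; the function name is an assumption, and it prints the `ray start` command rather than executing it:

```shell
# Hypothetical refactor of the rank-based branching above: given a replica
# rank, the leader hostname, and a port, emit the `ray start` command that
# node would run. (Prints the command instead of executing it.)
ray_start_cmd() {
  local rank="$1" leader="$2" port="${3:-8888}"
  if [ "$rank" = "0" ]; then
    echo "ray start --head --port=$port"
  else
    echo "ray start --address=${leader}:${port} --block"
  fi
}

# Usage mirroring the script: rank 0 becomes the head, everyone else a worker.
ray_start_cmd "${BEAKER_REPLICA_RANK:-0}" "${BEAKER_LEADER_REPLICA_HOSTNAME:-localhost}"
```

Rank 0 starting the head and all other ranks joining via the leader's hostname is the standard multi-node Ray bring-up pattern the script relies on.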