From eb2321a97cc83834d75678f53abb4f12cc57b958 Mon Sep 17 00:00:00 2001 From: "Peter St. John" Date: Tue, 21 Jan 2025 11:25:05 -0700 Subject: [PATCH] remove the apex and TE build steps from our docker container (#611) The newer versions of the pytorch base image have up-to-date Apex and TransformerEngine libraries, so we may no longer need to rebuild these from source. --------- Signed-off-by: Peter St. John --- Dockerfile | 17 ----------------- .../tests/bionemo/esm2/model/test_model.py | 6 +++++- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/Dockerfile b/Dockerfile index 7a760a4dd..51f3aa09b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,23 +22,6 @@ WORKDIR /build ARG MAX_JOBS=4 ENV MAX_JOBS=${MAX_JOBS} -# See NeMo readme for the latest tested versions of these libraries -ARG APEX_COMMIT=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c -RUN git clone https://github.com/NVIDIA/apex.git && \ - cd apex && \ - git checkout ${APEX_COMMIT} && \ - pip install . -v --no-build-isolation --disable-pip-version-check --no-cache-dir \ - --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam --group_norm" - -# Transformer Engine pre-1.7.0. 1.7 standardizes the meaning of bits in the attention mask to match -ARG TE_COMMIT=2215fa5c7557b66034068816020f9f611019e457 -RUN git clone https://github.com/NVIDIA/TransformerEngine.git && \ - cd TransformerEngine && \ - git fetch origin ${TE_COMMIT} && \ - git checkout FETCH_HEAD && \ - git submodule init && git submodule update && \ - NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install . - # Install core apt packages. 
RUN --mount=type=cache,id=apt-cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,id=apt-lib,target=/var/lib/apt,sharing=locked \ diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py index 70991e098..7d0d20b46 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py @@ -164,7 +164,11 @@ def test_esm2_650m_checkpoint(esm2_model): extra_keys = new_keys.difference(old_keys) extra_non_null_keys = { - k for k in extra_keys if new_state_dict[k] is not None and not isinstance(new_state_dict[k], io.BytesIO) + k + for k in extra_keys + if not k.endswith("._extra_state") + and new_state_dict[k] is not None + and not isinstance(new_state_dict[k], io.BytesIO) } assert not extra_non_null_keys, "There are new keys that have state that is missing from the old checkpoint."