CI: on-the-fly data generation for regression test determinism (#260)
* CI on-the-fly data generation for regression tests

* delete old reg test data

* util_test.py option to create test data for specific git revision
skip instead of fail regression tests if data not found
register pytest markers in pytest.ini

* use SHA instead of branch name if on a detached HEAD (see the sketch below)
fixed typo

* CI: save model list in util_test to ensure no model names leak from test to reference

* CI: use marker instead of name filter to collect model names from pytest

* CI: use manual revision for checkout action

* util_test: avoid nested exceptions to make sure the working tree is restored on failure

* cache venv with minor version specific python
ensure durations file exists

* only pop stash if changes were stashed

* keep naming for env and durations cache the same

Co-authored-by: Romain Beaumont <[email protected]>
lopho and rom1504 authored Dec 9, 2022
1 parent db8a924 commit 5d68361
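As a side note on the detached-HEAD bullet above (an illustration, not part of the commit): a common shell idiom for resolving the current branch name, falling back to the commit SHA, looks like this.

```sh
# Illustrative only: print the current branch name, or the commit SHA
# when HEAD is detached (git symbolic-ref fails in that case).
git symbolic-ref -q --short HEAD || git rev-parse HEAD
```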
Showing 100 changed files with 198 additions and 59 deletions.
53 changes: 50 additions & 3 deletions .github/workflows/ci.yml
@@ -20,6 +20,16 @@ on:
       - '.gitignore'
       - 'docs/**'
   workflow_dispatch:
+    inputs:
+      manual_revision_reference:
+        required: false
+        type: string
+      manual_revision_test:
+        required: false
+        type: string
+
+env:
+  REVISION_REFERENCE: 9d31b2ec4df6d8228f370ff20c8267ec6ba39383 # v2.7.0 + pretrained_hf param
 
 jobs:
   Tests:
@@ -32,7 +42,11 @@ jobs:
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          ref: ${{ inputs.manual_revision_test }}
       - name: Set up Python ${{ matrix.python }}
+        id: pythonsetup
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python }}
@@ -41,12 +55,12 @@ jobs:
         uses: actions/cache@v3
         with:
           path: .env
-          key: venv-${{ matrix.os }}-${{ matrix.python }}-${{ hashFiles('requirements*') }}
-      - name: Pytest durations
+          key: venv-${{ matrix.os }}-${{ steps.pythonsetup.outputs.python-version }}-${{ hashFiles('requirements*') }}
+      - name: Pytest durations cache
         uses: actions/cache@v3
         with:
           path: .test_durations
-          key: test_durations-${{ matrix.os }}-${{ matrix.python }}-${{ matrix.job }}-${{ github.run_id }}
+          key: test_durations-${{ matrix.os }}-${{ steps.pythonsetup.outputs.python-version }}-${{ matrix.job }}-${{ github.run_id }}
           restore-keys: test_durations-0-
       - name: Setup
         if: steps.venv-cache.outputs.cache-hit != 'true'
@@ -56,16 +70,49 @@ jobs:
           make install
           make install-test
           make install-training
+      - name: Prepare test data
+        run: |
+          source .env/bin/activate
+          python -m pytest \
+            --quiet --co \
+            --splitting-algorithm least_duration \
+            --splits ${{ matrix.job_num }} \
+            --group ${{ matrix.job }} \
+            -m regression_test \
+            tests \
+            | head -n -2 | grep -Po 'test_inference_with_data\[\K[^]]*(?=])' \
+            > models_gh_runner.txt
+          if [ -n "${{ inputs.manual_revision_reference }}" ]; then
+            REVISION_REFERENCE=${{ inputs.manual_revision_reference }}
+          fi
+          python tests/util_test.py \
+            --save_model_list models_gh_runner.txt \
+            --model_list models_gh_runner.txt \
+            --git_revision $REVISION_REFERENCE
       - name: Unit tests
         run: |
           source .env/bin/activate
+          touch .test_durations
+          cp .test_durations durations_1
+          mv .test_durations durations_2
           python -m pytest \
             -x -s -v \
             --splitting-algorithm least_duration \
             --splits ${{ matrix.job_num }} \
             --group ${{ matrix.job }} \
             --store-durations \
+            --durations-path durations_1 \
+            --clean-durations \
+            -m "not regression_test" \
             tests
+          OPEN_CLIP_TEST_REG_MODELS=models_gh_runner.txt python -m pytest \
+            -x -s -v \
+            --store-durations \
+            --durations-path durations_2 \
+            --clean-durations \
+            -m "regression_test" \
+            tests
+          jq -s -S 'add' durations_* > .test_durations
       - name: Collect pytest durations
         uses: actions/upload-artifact@v3
         with:
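Two pipelines in the workflow above do the heavy lifting, and are worth unpacking (the snippets below are illustrative, not part of the commit). `pytest --quiet --co` prints one collected test id per line plus a short trailing summary, which the workflow drops with `head -n -2`; the `grep -Po` stage then keeps only the parametrized model name. Later, `jq -s -S 'add'` merges the two duration files into one JSON object with sorted keys.

```sh
# Extract the model name from a collected test id:
echo 'tests/test_inference.py::test_inference_with_data[RN50]' \
  | grep -Po 'test_inference_with_data\[\K[^]]*(?=])'
# prints: RN50

# Merge per-run pytest duration files; -s slurps both objects into an
# array, 'add' merges them, -S sorts keys for a stable cache file.
jq -s -S 'add' durations_1 durations_2 > .test_durations
```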
2 changes: 2 additions & 0 deletions .gitignore
@@ -3,6 +3,8 @@ wandb/
 models/
 features/
 results/
+
+tests/data/
 *.pt
 
 # Byte-compiled / optimized / DLL files
19 changes: 15 additions & 4 deletions README.md
@@ -111,15 +111,26 @@ If you want to make changes to contribute code, you can clone openclip then run
 
 Install pip PyTorch as per https://pytorch.org/get-started/locally/
 
+You may run `make install-training` to install training deps
+
 #### Testing
 
 Test can be run with `make install-test` then `make test`
 
 `python -m pytest -x -s -v tests -k "training"` to run a specific test
 
 When introducing new models, `python tests/util_test.py --model=xlm-roberta-large-ViT-H-14` can generate new output expected data.
 
-You may run `make install-training` to install training deps
-
+Running regression tests against a specific git revision or tag:
+1. Generate testing data
+```sh
+python tests/util_test.py --model RN50 RN101 --save_model_list models.txt --git_revision 9d31b2ec4df6d8228f370ff20c8267ec6ba39383
+```
+**_WARNING_: This will invoke git and modify your working tree, but will reset it to the current state after data has been generated! \
+Don't modify your working tree while test data is being generated this way.**
+2. Run regression tests
+```sh
+OPEN_CLIP_TEST_REG_MODELS=models.txt python -m pytest -x -s -v -m regression_test
+```
 
 ### Sample single-process running code:
@@ -513,7 +524,7 @@ quantifies robustness as accuracy beyond this baseline, i.e., how far a model li
 Even though the CLIP models trained with
 this codebase achieve much lower accuracy than those trained by OpenAI, our models still lie on the same
 trend of improved effective robustness (the purple line). Therefore, we can study what makes
-CLIP robust without requiring industrial-scale compute. 
+CLIP robust without requiring industrial-scale compute.
 
 For more information on effective robustness, please see:
 
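For reference, the model list passed via `OPEN_CLIP_TEST_REG_MODELS` (or written by `--save_model_list`) is a plain text file with one model name per line; `tests/test_inference.py` reads it with `splitlines()` and intersects it with the testable models. A minimal example:

```sh
# Write a two-model list and run only those regression tests:
printf 'RN50\nRN101\n' > models.txt
OPEN_CLIP_TEST_REG_MODELS=models.txt python -m pytest -x -s -v -m regression_test tests
```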
3 changes: 3 additions & 0 deletions pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+markers =
+    regression_test
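Registering the marker avoids pytest's unknown-mark warnings and lets the suite be split by marker expression, as the workflow above does:

```sh
python -m pytest -m "not regression_test" tests   # unit tests only
python -m pytest -m "regression_test" tests       # regression tests only
```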
Binary file removed tests/data/input/random_image_224_224.pt
Binary file removed tests/data/input/random_image_240_240.pt
Binary file removed tests/data/input/random_image_256_256.pt
Binary file removed tests/data/input/random_image_280_280.pt
Binary file removed tests/data/input/random_image_288_288.pt
Binary file removed tests/data/input/random_image_320_320.pt
Binary file removed tests/data/input/random_image_336_336.pt
Binary file removed tests/data/input/random_image_384_384.pt
Binary file removed tests/data/input/random_image_448_448.pt
Binary file removed tests/data/input/random_text.pt
Binary file removed tests/data/output/RN101_None_fp32_random_image.pt
Binary file removed tests/data/output/RN101_None_fp32_random_text.pt
Binary file removed tests/data/output/RN50_None_fp32_random_image.pt
Binary file removed tests/data/output/RN50_None_fp32_random_text.pt
Binary file removed tests/data/output/RN50x16_None_fp32_random_image.pt
Binary file removed tests/data/output/RN50x16_None_fp32_random_text.pt
Binary file removed tests/data/output/RN50x4_None_fp32_random_image.pt
Binary file removed tests/data/output/RN50x4_None_fp32_random_text.pt
Binary file removed tests/data/output/RN50x64_None_fp32_random_image.pt
Binary file removed tests/data/output/RN50x64_None_fp32_random_text.pt
Binary file removed tests/data/output/ViT-B-16_None_fp32_random_image.pt
Binary file removed tests/data/output/ViT-B-16_None_fp32_random_text.pt
Binary file removed tests/data/output/ViT-B-32_None_fp32_random_image.pt
Binary file removed tests/data/output/ViT-B-32_None_fp32_random_text.pt
Binary file removed tests/data/output/ViT-H-14_None_fp32_random_image.pt
Binary file removed tests/data/output/ViT-H-14_None_fp32_random_text.pt
Binary file removed tests/data/output/ViT-H-16_None_fp32_random_image.pt
Binary file removed tests/data/output/ViT-H-16_None_fp32_random_text.pt
Binary file removed tests/data/output/ViT-L-14_None_fp32_random_image.pt
Binary file removed tests/data/output/ViT-L-14_None_fp32_random_text.pt
Binary file removed tests/data/output/ViT-L-16_None_fp32_random_image.pt
Binary file removed tests/data/output/ViT-L-16_None_fp32_random_text.pt
Binary file removed tests/data/output/ViT-M-16_None_fp32_random_image.pt
Binary file removed tests/data/output/ViT-M-16_None_fp32_random_text.pt
Binary file removed tests/data/output/ViT-M-32_None_fp32_random_image.pt
Binary file removed tests/data/output/ViT-M-32_None_fp32_random_text.pt
Binary file removed tests/data/output/ViT-S-16_None_fp32_random_image.pt
Binary file removed tests/data/output/ViT-S-16_None_fp32_random_text.pt
Binary file removed tests/data/output/ViT-S-32_None_fp32_random_image.pt
Binary file removed tests/data/output/ViT-S-32_None_fp32_random_text.pt
Binary file removed tests/data/output/ViT-e-14_None_fp32_random_image.pt
Binary file removed tests/data/output/ViT-e-14_None_fp32_random_text.pt
Binary file removed tests/data/output/ViT-g-14_None_fp32_random_image.pt
Binary file removed tests/data/output/ViT-g-14_None_fp32_random_text.pt
(names of the remaining removed test-data binaries are not rendered on this page)
42 changes: 27 additions & 15 deletions tests/test_inference.py
@@ -1,17 +1,16 @@
-
 import os
-import random
 import pytest
-import numpy
 import torch
 from PIL import Image
 import open_clip
 import util_test
 
 os.environ['CUDA_VISIBLE_DEVICES'] = ''
 
-# test all model with some exceptions
-models_to_test = set(open_clip.list_models()).difference({
+models_to_test = set(open_clip.list_models())
+
+# testing exemptions
+models_to_test = models_to_test.difference({
     # not available with timm yet
     # see https://github.com/mlfoundations/open_clip/issues/219
     'timm-convnext_xlarge',
@@ -22,15 +21,24 @@
     'mt5-xl-ViT-H-14',
 })
 
+if 'OPEN_CLIP_TEST_REG_MODELS' in os.environ:
+    external_model_list = os.environ['OPEN_CLIP_TEST_REG_MODELS']
+    with open(external_model_list, 'r') as f:
+        models_to_test = set(f.read().splitlines()).intersection(models_to_test)
+    print(f"Selected models from {external_model_list}: {models_to_test}")
+
+models_to_test = list(models_to_test)
+models_to_test.sort()
+
+@pytest.mark.regression_test
 @pytest.mark.parametrize('model_name', models_to_test)
 def test_inference_with_data(
     model_name,
     pretrained = None,
+    pretrained_hf = False,
     precision = 'fp32',
     jit = False,
     force_quick_gelu = False,
-    # experimentally determined between author machine and GH runner
-    tolerance = torch.finfo(torch.float32).resolution * 4
 ):
     util_test.seed_all()
     model, _, preprocess_val = open_clip.create_model_and_transforms(
@@ -39,30 +47,34 @@ def test_inference_with_data(
         precision = precision,
         jit = jit,
         force_quick_gelu = force_quick_gelu,
-        pretrained_hf = False
+        pretrained_hf = pretrained_hf
     )
-    model_id = f'{model_name}_{pretrained}_{precision}'
+    model_id = f'{model_name}_{pretrained or pretrained_hf}_{precision}'
     input_dir, output_dir = util_test.get_data_dirs()
     # text
     input_text_path = os.path.join(input_dir, 'random_text.pt')
     gt_text_path = os.path.join(output_dir, f'{model_id}_random_text.pt')
-    assert os.path.isfile(input_text_path), f"missing test data, expected at {input_text_path}"
-    assert os.path.isfile(gt_text_path), f"missing test data, expected at {gt_text_path}"
+    if not os.path.isfile(input_text_path):
+        pytest.skip(reason = f"missing test data, expected at {input_text_path}")
+    if not os.path.isfile(gt_text_path):
+        pytest.skip(reason = f"missing test data, expected at {gt_text_path}")
     input_text = torch.load(input_text_path)
     gt_text = torch.load(gt_text_path)
     y_text = util_test.inference_text(model, model_name, input_text)
-    assert torch.allclose(y_text, gt_text, atol=tolerance), f"text output differs @ {input_text_path}"
+    assert (y_text == gt_text).all(), f"text output differs @ {input_text_path}"
     # image
     image_size = model.visual.image_size
     if not isinstance(image_size, tuple):
         image_size = (image_size, image_size)
     input_image_path = os.path.join(input_dir, f'random_image_{image_size[0]}_{image_size[1]}.pt')
     gt_image_path = os.path.join(output_dir, f'{model_id}_random_image.pt')
-    assert os.path.isfile(input_image_path), f"missing test data, expected at {input_image_path}"
-    assert os.path.isfile(gt_image_path), f"missing test data, expected at {gt_image_path}"
+    if not os.path.isfile(input_image_path):
+        pytest.skip(reason = f"missing test data, expected at {input_image_path}")
+    if not os.path.isfile(gt_image_path):
+        pytest.skip(reason = f"missing test data, expected at {gt_image_path}")
     input_image = torch.load(input_image_path)
     gt_image = torch.load(gt_image_path)
     y_image = util_test.inference_image(model, preprocess_val, input_image)
-    assert torch.allclose(y_image, gt_image, atol=tolerance), f"image output differs @ {input_image_path}"
+    assert (y_image == gt_image).all(), f"image output differs @ {input_image_path}"


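Net effect of the changes above: reference data is now generated on the runner itself, so outputs can be compared for exact equality rather than with a tolerance, and a missing data file skips the test instead of failing it. Running the regression marker locally without generated data should therefore just report skips (illustrative invocation):

```sh
# Without tests/data/ present, regression tests are skipped, not failed:
python -m pytest -q -m regression_test tests
```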
(diffs for the remaining changed files, including tests/util_test.py, are not shown)