diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index ff915e046946..696097fd5473 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -23,7 +23,7 @@ jobs: runs-on: group: aws-g6-4xlarge-plus container: - image: diffusers/diffusers-pytorch-compile-cuda + image: diffusers/diffusers-pytorch-cuda options: --shm-size "16gb" --ipc host --gpus 0 steps: - name: Checkout diffusers diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml index 340d8a19e17a..b73faea231dc 100644 --- a/.github/workflows/build_docker_images.yml +++ b/.github/workflows/build_docker_images.yml @@ -41,6 +41,12 @@ jobs: run: | CHANGED_FILES="${{ steps.file_changes.outputs.all }}" for FILE in $CHANGED_FILES; do + # skip anything that isn't still on disk + if [[ ! -f "$FILE" ]]; then + echo "Skipping removed file $FILE" + continue + fi + if [[ "$FILE" == docker/*Dockerfile ]]; then DOCKER_PATH="${FILE%/Dockerfile}" DOCKER_TAG=$(basename "$DOCKER_PATH") @@ -65,7 +71,6 @@ jobs: image-name: - diffusers-pytorch-cpu - diffusers-pytorch-cuda - - diffusers-pytorch-compile-cuda - diffusers-pytorch-xformers-cuda - diffusers-pytorch-minimum-cuda - diffusers-flax-cpu diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml index 4f92717df8b7..b4c973711e9d 100644 --- a/.github/workflows/nightly_tests.yml +++ b/.github/workflows/nightly_tests.yml @@ -188,7 +188,7 @@ jobs: group: aws-g4dn-2xlarge container: - image: diffusers/diffusers-pytorch-compile-cuda + image: diffusers/diffusers-pytorch-cuda options: --gpus 0 --shm-size "16gb" --ipc host steps: diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index abf825eaa7a0..7cab08b44fcd 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -262,7 +262,7 @@ jobs: group: aws-g4dn-2xlarge container: - image: diffusers/diffusers-pytorch-compile-cuda + image: diffusers/diffusers-pytorch-cuda options: --gpus 0 --shm-size "16gb" --ipc host steps: diff --git a/.github/workflows/release_tests_fast.yml b/.github/workflows/release_tests_fast.yml index 9d65db2f0dee..a464381ba48a 100644 --- a/.github/workflows/release_tests_fast.yml +++ b/.github/workflows/release_tests_fast.yml @@ -316,7 +316,7 @@ jobs: group: aws-g4dn-2xlarge container: - image: diffusers/diffusers-pytorch-compile-cuda + image: diffusers/diffusers-pytorch-cuda options: --gpus 0 --shm-size "16gb" --ipc host steps: diff --git a/docker/diffusers-pytorch-compile-cuda/Dockerfile b/docker/diffusers-pytorch-compile-cuda/Dockerfile deleted file mode 100644 index cb4a9c0f9896..000000000000 --- a/docker/diffusers-pytorch-compile-cuda/Dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04 -LABEL maintainer="Hugging Face" -LABEL repository="diffusers" - -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get -y update \ - && apt-get install -y software-properties-common \ - && add-apt-repository ppa:deadsnakes/ppa - -RUN apt install -y bash \ - build-essential \ - git \ - git-lfs \ - curl \ - ca-certificates \ - libsndfile1-dev \ - libgl1 \ - python3.10 \ - python3.10-dev \ - python3-pip \ - python3.10-venv && \ - rm -rf /var/lib/apt/lists - -# make sure to use venv -RUN python3.10 -m venv /opt/venv -ENV PATH="/opt/venv/bin:$PATH" - -# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py) -RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && 
\ - python3.10 -m uv pip install --no-cache-dir \ - torch \ - torchvision \ - torchaudio \ - invisible_watermark && \ - python3.10 -m pip install --no-cache-dir \ - accelerate \ - datasets \ - hf-doc-builder \ - huggingface-hub \ - hf_transfer \ - Jinja2 \ - librosa \ - numpy==1.26.4 \ - scipy \ - tensorboard \ - transformers \ - hf_transfer - -CMD ["/bin/bash"] diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index 0b17d7977a41..8de26212a247 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -1748,14 +1748,14 @@ class TorchCompileTesterMixin: def setUp(self): # clean up the VRAM before each test super().setUp() - torch._dynamo.reset() + torch.compiler.reset() gc.collect() backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test in case of CUDA runtime errors super().tearDown() - torch._dynamo.reset() + torch.compiler.reset() gc.collect() backend_empty_cache(torch_device) @@ -1764,13 +1764,17 @@ def tearDown(self): @is_torch_compile @slow def test_torch_compile_recompilation_and_graph_break(self): - torch._dynamo.reset() + torch.compiler.reset() init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() model = self.model_class(**init_dict).to(torch_device) model = torch.compile(model, fullgraph=True) - with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): + with ( + torch._inductor.utils.fresh_inductor_cache(), + torch._dynamo.config.patch(error_on_recompile=True), + torch.no_grad(), + ): _ = model(**inputs_dict) _ = model(**inputs_dict) @@ -1798,7 +1802,7 @@ def tearDown(self): # It is critical that the dynamo cache is reset for each test. Otherwise, if the test re-uses the same model, # there will be recompilation errors, as torch caches the model when run in the same process. 
super().tearDown() - torch._dynamo.reset() + torch.compiler.reset() gc.collect() backend_empty_cache(torch_device) @@ -1915,7 +1919,7 @@ def test_hotswapping_model(self, rank0, rank1): def test_hotswapping_compiled_model_linear(self, rank0, rank1): # It's important to add this context to raise an error on recompilation target_modules = ["to_q", "to_k", "to_v", "to_out.0"] - with torch._dynamo.config.patch(error_on_recompile=True): + with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache(): self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules) @parameterized.expand([(11, 11), (7, 13), (13, 7)]) # important to test small to large and vice versa @@ -1925,7 +1929,7 @@ def test_hotswapping_compiled_model_conv2d(self, rank0, rank1): # It's important to add this context to raise an error on recompilation target_modules = ["conv", "conv1", "conv2"] - with torch._dynamo.config.patch(error_on_recompile=True): + with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache(): self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules) @parameterized.expand([(11, 11), (7, 13), (13, 7)]) # important to test small to large and vice versa @@ -1935,7 +1939,7 @@ def test_hotswapping_compiled_model_both_linear_and_conv2d(self, rank0, rank1): # It's important to add this context to raise an error on recompilation target_modules = ["to_q", "conv"] - with torch._dynamo.config.patch(error_on_recompile=True): + with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache(): self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules) @parameterized.expand([(11, 11), (7, 13), (13, 7)]) # important to test small to large and vice versa diff --git a/tests/models/transformers/test_models_transformer_hunyuan_video.py b/tests/models/transformers/test_models_transformer_hunyuan_video.py index 0a917352164c..5c83d22ab6aa 100644 --- a/tests/models/transformers/test_models_transformer_hunyuan_video.py +++ b/tests/models/transformers/test_models_transformer_hunyuan_video.py @@ -19,20 +19,16 @@ from diffusers import HunyuanVideoTransformer3DModel from diffusers.utils.testing_utils import ( enable_full_determinism, - is_torch_compile, - require_torch_2, - require_torch_gpu, - slow, torch_device, ) -from ..test_modeling_common import ModelTesterMixin +from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin enable_full_determinism() -class HunyuanVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): +class HunyuanVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase): model_class = HunyuanVideoTransformer3DModel main_input_name = "hidden_states" uses_custom_attn_processor = True @@ -96,23 +92,8 @@ def test_gradient_checkpointing_is_applied(self): expected_set = {"HunyuanVideoTransformer3DModel"} super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - @require_torch_gpu - @require_torch_2 - @is_torch_compile - @slow - def test_torch_compile_recompilation_and_graph_break(self): - torch._dynamo.reset() - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict).to(torch_device) - model = torch.compile(model, fullgraph=True) - - with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): - _ = model(**inputs_dict) - _ = model(**inputs_dict) 
- - -class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): +class HunyuanSkyreelsImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase): model_class = HunyuanVideoTransformer3DModel main_input_name = "hidden_states" uses_custom_attn_processor = True @@ -179,23 +160,8 @@ def test_gradient_checkpointing_is_applied(self): expected_set = {"HunyuanVideoTransformer3DModel"} super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - @require_torch_gpu - @require_torch_2 - @is_torch_compile - @slow - def test_torch_compile_recompilation_and_graph_break(self): - torch._dynamo.reset() - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict).to(torch_device) - model = torch.compile(model, fullgraph=True) - - with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): - _ = model(**inputs_dict) - _ = model(**inputs_dict) - -class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): +class HunyuanVideoImageToVideoTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase): model_class = HunyuanVideoTransformer3DModel main_input_name = "hidden_states" uses_custom_attn_processor = True @@ -260,23 +226,10 @@ def test_gradient_checkpointing_is_applied(self): expected_set = {"HunyuanVideoTransformer3DModel"} super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - @require_torch_gpu - @require_torch_2 - @is_torch_compile - @slow - def test_torch_compile_recompilation_and_graph_break(self): - torch._dynamo.reset() - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict).to(torch_device) - model = torch.compile(model, fullgraph=True) - - with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): - _ = model(**inputs_dict) - _ = model(**inputs_dict) - - -class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests(ModelTesterMixin, unittest.TestCase): +class HunyuanVideoTokenReplaceImageToVideoTransformer3DTests( + ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase +): model_class = HunyuanVideoTransformer3DModel main_input_name = "hidden_states" uses_custom_attn_processor = True @@ -342,18 +295,3 @@ def test_output(self): def test_gradient_checkpointing_is_applied(self): expected_set = {"HunyuanVideoTransformer3DModel"} super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - - @require_torch_gpu - @require_torch_2 - @is_torch_compile - @slow - def test_torch_compile_recompilation_and_graph_break(self): - torch._dynamo.reset() - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict).to(torch_device) - model = torch.compile(model, fullgraph=True) - - with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): - _ = model(**inputs_dict) - _ = model(**inputs_dict) diff --git a/tests/models/transformers/test_models_transformer_wan.py b/tests/models/transformers/test_models_transformer_wan.py index 8270c2ee21b0..4eadb892364a 100644 --- a/tests/models/transformers/test_models_transformer_wan.py +++ b/tests/models/transformers/test_models_transformer_wan.py @@ -19,20 +19,16 @@ from diffusers import WanTransformer3DModel from diffusers.utils.testing_utils import ( enable_full_determinism, - is_torch_compile, - require_torch_2, - require_torch_gpu, - slow, torch_device, ) -from ..test_modeling_common import 
ModelTesterMixin +from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin enable_full_determinism() -class WanTransformer3DTests(ModelTesterMixin, unittest.TestCase): +class WanTransformer3DTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase): model_class = WanTransformer3DModel main_input_name = "hidden_states" uses_custom_attn_processor = True @@ -86,18 +82,3 @@ def prepare_init_args_and_inputs_for_common(self): def test_gradient_checkpointing_is_applied(self): expected_set = {"WanTransformer3DModel"} super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - - @require_torch_gpu - @require_torch_2 - @is_torch_compile - @slow - def test_torch_compile_recompilation_and_graph_break(self): - torch._dynamo.reset() - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict).to(torch_device) - model = torch.compile(model, fullgraph=True) - - with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad(): - _ = model(**inputs_dict) - _ = model(**inputs_dict) diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index bb21c9ac8dcb..a2951a8b4673 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -15,7 +15,6 @@ import gc import tempfile -import traceback import unittest import numpy as np @@ -39,13 +38,9 @@ backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, enable_full_determinism, - get_python_version, - is_torch_compile, load_image, load_numpy, - require_torch_2, require_torch_accelerator, - run_test_in_subprocess, slow, torch_device, ) @@ -68,52 +63,6 @@ enable_full_determinism() -# Will be run via run_test_in_subprocess -def _test_stable_diffusion_compile(in_queue, out_queue, timeout): - error = None - try: - _ = in_queue.get(timeout=timeout) - - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - pipe.unet.to(memory_format=torch.channels_last) - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - pipe.controlnet.to(memory_format=torch.channels_last) - pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "bird" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ).resize((512, 512)) - - output = pipe(prompt, image, num_inference_steps=10, generator=generator, output_type="np") - image = output.images[0] - - assert image.shape == (512, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy" - ) - expected_image = np.resize(expected_image, (512, 512, 3)) - - assert np.abs(expected_image - image).max() < 1.0 - - except Exception: - error = f"{traceback.format_exc()}" - - results = {"error": error} - out_queue.put(results, timeout=timeout) - out_queue.join() - - class ControlNetPipelineFastTests( IPAdapterTesterMixin, PipelineLatentTesterMixin, @@ -1053,15 +1002,6 @@ def test_canny_guess_mode_euler(self): expected_slice = np.array([0.1655, 0.1721, 0.1623, 0.1685, 
0.1711, 0.1646, 0.1651, 0.1631, 0.1494]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - @is_torch_compile - @require_torch_2 - @unittest.skipIf( - get_python_version == (3, 12), - reason="Torch Dynamo isn't yet supported for Python 3.12.", - ) - def test_stable_diffusion_compile(self): - run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None) - def test_v11_shuffle_global_pool_conditions(self): controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle") diff --git a/tests/pipelines/controlnet_xs/test_controlnetxs.py b/tests/pipelines/controlnet_xs/test_controlnetxs.py index 74af4b6775cc..6f8422797cce 100644 --- a/tests/pipelines/controlnet_xs/test_controlnetxs.py +++ b/tests/pipelines/controlnet_xs/test_controlnetxs.py @@ -14,7 +14,6 @@ # limitations under the License. import gc -import traceback import unittest import numpy as np @@ -36,13 +35,9 @@ from diffusers.utils.testing_utils import ( backend_empty_cache, enable_full_determinism, - is_torch_compile, load_image, - load_numpy, require_accelerator, - require_torch_2, require_torch_accelerator, - run_test_in_subprocess, slow, torch_device, ) @@ -78,53 +73,6 @@ def to_np(tensor): return tensor -# Will be run via run_test_in_subprocess -def _test_stable_diffusion_compile(in_queue, out_queue, timeout): - error = None - try: - _ = in_queue.get(timeout=timeout) - - controlnet = ControlNetXSAdapter.from_pretrained( - "UmerHA/Testing-ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16 - ) - pipe = StableDiffusionControlNetXSPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1-base", - controlnet=controlnet, - safety_checker=None, - torch_dtype=torch.float16, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - pipe.unet.to(memory_format=torch.channels_last) - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "bird" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ).resize((512, 512)) - - output = pipe(prompt, image, num_inference_steps=10, generator=generator, output_type="np") - image = output.images[0] - - assert image.shape == (512, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy" - ) - expected_image = np.resize(expected_image, (512, 512, 3)) - - assert np.abs(expected_image - image).max() < 1.0 - - except Exception: - error = f"{traceback.format_exc()}" - - results = {"error": error} - out_queue.put(results, timeout=timeout) - out_queue.join() - - class ControlNetXSPipelineFastTests( PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, @@ -402,8 +350,3 @@ def test_depth(self): original_image = image[-3:, -3:, -1].flatten() expected_image = np.array([0.4844, 0.4937, 0.4956, 0.4663, 0.5039, 0.5044, 0.4565, 0.4883, 0.4941]) assert np.allclose(original_image, expected_image, atol=1e-04) - - @is_torch_compile - @require_torch_2 - def test_stable_diffusion_compile(self): - run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 3b5c7a24b4ca..2c6739c8ef9f 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ 
b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -17,7 +17,6 @@ import gc import tempfile import time -import traceback import unittest import numpy as np @@ -49,16 +48,12 @@ backend_reset_max_memory_allocated, backend_reset_peak_memory_stats, enable_full_determinism, - is_torch_compile, - load_image, load_numpy, nightly, numpy_cosine_similarity_distance, require_accelerate_version_greater, - require_torch_2, require_torch_accelerator, require_torch_multi_accelerator, - run_test_in_subprocess, skip_mps, slow, torch_device, @@ -81,39 +76,6 @@ enable_full_determinism() -# Will be run via run_test_in_subprocess -def _test_stable_diffusion_compile(in_queue, out_queue, timeout): - error = None - try: - inputs = in_queue.get(timeout=timeout) - torch_device = inputs.pop("torch_device") - seed = inputs.pop("seed") - inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed) - - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - - sd_pipe.unet.to(memory_format=torch.channels_last) - sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True) - - sd_pipe.set_progress_bar_config(disable=None) - - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239]) - - assert np.abs(image_slice - expected_slice).max() < 5e-3 - except Exception: - error = f"{traceback.format_exc()}" - - results = {"error": error} - out_queue.put(results, timeout=timeout) - out_queue.join() - - class StableDiffusionPipelineFastTests( IPAdapterTesterMixin, PipelineLatentTesterMixin, @@ -1224,40 +1186,6 @@ def test_stable_diffusion_textual_inversion_with_sequential_cpu_offload(self): max_diff = np.abs(expected_image - image).max() assert max_diff < 8e-1 - @is_torch_compile - @require_torch_2 - def test_stable_diffusion_compile(self): - seed = 0 - inputs = self.get_inputs(torch_device, seed=seed) - # Can't pickle a Generator object - del inputs["generator"] - inputs["torch_device"] = torch_device - inputs["seed"] = seed - run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=inputs) - - def test_stable_diffusion_lcm(self): - unet = UNet2DConditionModel.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", subfolder="unet") - sd_pipe = StableDiffusionPipeline.from_pretrained("Lykon/dreamshaper-7", unet=unet).to(torch_device) - sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 6 - inputs["output_type"] = "pil" - - image = sd_pipe(**inputs).images[0] - - expected_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/lcm_full/stable_diffusion_lcm.png" - ) - - image = sd_pipe.image_processor.pil_to_numpy(image) - expected_image = sd_pipe.image_processor.pil_to_numpy(expected_image) - - max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten()) - - assert max_diff < 1e-2 - @slow @require_torch_accelerator diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 82b01a74869a..094e98d09ef9 100644 --- 
a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -15,7 +15,6 @@ import gc import random -import traceback import unittest import numpy as np @@ -41,13 +40,10 @@ backend_reset_peak_memory_stats, enable_full_determinism, floats_tensor, - is_torch_compile, load_image, load_numpy, nightly, - require_torch_2, require_torch_accelerator, - run_test_in_subprocess, skip_mps, slow, torch_device, @@ -70,38 +66,6 @@ enable_full_determinism() -# Will be run via run_test_in_subprocess -def _test_img2img_compile(in_queue, out_queue, timeout): - error = None - try: - inputs = in_queue.get(timeout=timeout) - torch_device = inputs.pop("torch_device") - seed = inputs.pop("seed") - inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed) - - pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.unet.set_default_attn_processor() - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.unet.to(memory_format=torch.channels_last) - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([0.0606, 0.0570, 0.0805, 0.0579, 0.0628, 0.0623, 0.0843, 0.1115, 0.0806]) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 - except Exception: - error = f"{traceback.format_exc()}" - - results = {"error": error} - out_queue.put(results, timeout=timeout) - out_queue.join() - - class StableDiffusionImg2ImgPipelineFastTests( IPAdapterTesterMixin, PipelineLatentTesterMixin, @@ -654,17 +618,6 @@ def test_img2img_safety_checker_works(self): assert out.nsfw_content_detected[0], f"Safety checker should work for prompt: {inputs['prompt']}" assert np.abs(out.images[0]).sum() < 1e-5 # should be all zeros - @is_torch_compile - @require_torch_2 - def test_img2img_compile(self): - seed = 0 - inputs = self.get_inputs(torch_device, seed=seed) - # Can't pickle a Generator object - del inputs["generator"] - inputs["torch_device"] = torch_device - inputs["seed"] = seed - run_test_in_subprocess(test_case=self, target_func=_test_img2img_compile, inputs=inputs) - @nightly @require_torch_accelerator diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index e028b4017860..8456994d6f81 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -15,7 +15,6 @@ import gc import random -import traceback import unittest import numpy as np @@ -44,13 +43,10 @@ backend_reset_peak_memory_stats, enable_full_determinism, floats_tensor, - is_torch_compile, load_image, load_numpy, nightly, - require_torch_2, require_torch_accelerator, - run_test_in_subprocess, slow, torch_device, ) @@ -71,40 +67,6 @@ enable_full_determinism() -# Will be run via run_test_in_subprocess -def _test_inpaint_compile(in_queue, out_queue, timeout): - error = None - try: - inputs = in_queue.get(timeout=timeout) - torch_device = inputs.pop("torch_device") - seed = inputs.pop("seed") - inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed) - - pipe = StableDiffusionInpaintPipeline.from_pretrained( - "botp/stable-diffusion-v1-5-inpainting", 
safety_checker=None - ) - pipe.unet.set_default_attn_processor() - pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - pipe.unet.to(memory_format=torch.channels_last) - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0689, 0.0699, 0.0790, 0.0536, 0.0470, 0.0488, 0.041, 0.0508, 0.04179]) - assert np.abs(expected_slice - image_slice).max() < 3e-3 - except Exception: - error = f"{traceback.format_exc()}" - - results = {"error": error} - out_queue.put(results, timeout=timeout) - out_queue.join() - - class StableDiffusionInpaintPipelineFastTests( IPAdapterTesterMixin, PipelineLatentTesterMixin, @@ -727,17 +689,6 @@ def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self): # make sure that less than 2.2 GB is allocated assert mem_bytes < 2.2 * 10**9 - @is_torch_compile - @require_torch_2 - def test_inpaint_compile(self): - seed = 0 - inputs = self.get_inputs(torch_device, seed=seed) - # Can't pickle a Generator object - del inputs["generator"] - inputs["torch_device"] = torch_device - inputs["seed"] = seed - run_test_in_subprocess(test_case=self, target_func=_test_inpaint_compile, inputs=inputs) - def test_stable_diffusion_inpaint_pil_input_resolution_test(self): pipe = StableDiffusionInpaintPipeline.from_pretrained( "botp/stable-diffusion-v1-5-inpainting", safety_checker=None ) @@ -964,11 +915,6 @@ def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self): # make sure that less than 2.45 GB is allocated assert mem_bytes < 2.45 * 10**9 - @is_torch_compile - @require_torch_2 - def test_inpaint_compile(self): - pass - def test_stable_diffusion_inpaint_pil_input_resolution_test(self): vae = AsymmetricAutoencoderKL.from_pretrained( "cross-attention/asymmetric-autoencoder-kl-x-1-5", diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index a2241236da20..58b86fda3090 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -2006,7 +2006,9 @@ def test_from_save_pretrained(self): reason="Torch Dynamo isn't yet supported for Python 3.12.", ) def test_from_save_pretrained_dynamo(self): - run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=None) + torch.compiler.reset() + with torch._inductor.utils.fresh_inductor_cache(): + run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=None) def test_from_pretrained_hub(self): model_path = "google/ddpm-cifar10-32" @@ -2218,7 +2220,7 @@ def tearDown(self): # It is critical that the dynamo cache is reset for each test. Otherwise, if the test re-uses the same model, # there will be recompilation errors, as torch caches the model when run in the same process. 
super().tearDown() - torch._dynamo.reset() + torch.compiler.reset() gc.collect() backend_empty_cache(torch_device) @@ -2343,21 +2345,21 @@ def test_hotswapping_pipeline(self, rank0, rank1): def test_hotswapping_compiled_pipline_linear(self, rank0, rank1): # It's important to add this context to raise an error on recompilation target_modules = ["to_q", "to_k", "to_v", "to_out.0"] - with torch._dynamo.config.patch(error_on_recompile=True): + with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache(): self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules) @parameterized.expand([(11, 11), (7, 13), (13, 7)]) # important to test small to large and vice versa def test_hotswapping_compiled_pipline_conv2d(self, rank0, rank1): # It's important to add this context to raise an error on recompilation target_modules = ["conv", "conv1", "conv2"] - with torch._dynamo.config.patch(error_on_recompile=True): + with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache(): self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules) @parameterized.expand([(11, 11), (7, 13), (13, 7)]) # important to test small to large and vice versa def test_hotswapping_compiled_pipline_both_linear_and_conv2d(self, rank0, rank1): # It's important to add this context to raise an error on recompilation target_modules = ["to_q", "conv"] - with torch._dynamo.config.patch(error_on_recompile=True): + with torch._dynamo.config.patch(error_on_recompile=True), torch._inductor.utils.fresh_inductor_cache(): self.check_pipeline_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules) def test_enable_lora_hotswap_called_after_adapter_added_raises(self): diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index af3a832d31a6..2b915b9ebba5 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -1111,14 +1111,14 @@ def callback_cfg_params(self) -> frozenset: def setUp(self): # clean up the VRAM before each test super().setUp() - torch._dynamo.reset() + torch.compiler.reset() gc.collect() backend_empty_cache(torch_device) def tearDown(self): # clean up the VRAM after each test in case of CUDA runtime errors super().tearDown() - torch._dynamo.reset() + torch.compiler.reset() gc.collect() backend_empty_cache(torch_device) diff --git a/tests/pipelines/unidiffuser/test_unidiffuser.py b/tests/pipelines/unidiffuser/test_unidiffuser.py index b1216a091c8b..dccb1a85008b 100644 --- a/tests/pipelines/unidiffuser/test_unidiffuser.py +++ b/tests/pipelines/unidiffuser/test_unidiffuser.py @@ -1,6 +1,5 @@ import gc import random -import traceback import unittest import numpy as np @@ -27,9 +26,7 @@ floats_tensor, load_image, nightly, - require_torch_2, require_torch_accelerator, - run_test_in_subprocess, torch_device, ) from diffusers.utils.torch_utils import randn_tensor @@ -45,38 +42,6 @@ enable_full_determinism() -# Will be run via run_test_in_subprocess -def _test_unidiffuser_compile(in_queue, out_queue, timeout): - error = None - try: - inputs = in_queue.get(timeout=timeout) - torch_device = inputs.pop("torch_device") - seed = inputs.pop("seed") - inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed) - - pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1") - # pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - 
pipe = pipe.to(torch_device) - - pipe.unet.to(memory_format=torch.channels_last) - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - pipe.set_progress_bar_config(disable=None) - - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520]) - assert np.abs(image_slice - expected_slice).max() < 1e-1 - except Exception: - error = f"{traceback.format_exc()}" - - results = {"error": error} - out_queue.put(results, timeout=timeout) - out_queue.join() - - class UniDiffuserPipelineFastTests( PipelineTesterMixin, PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase ): @@ -690,19 +655,6 @@ def test_unidiffuser_default_img2text_v1(self): expected_text_prefix = "An astronaut" assert text[0][: len(expected_text_prefix)] == expected_text_prefix - @unittest.skip(reason="Skip torch.compile test to speed up the slow test suite.") - @require_torch_2 - def test_unidiffuser_compile(self, seed=0): - inputs = self.get_inputs(torch_device, seed=seed, generate_latents=True) - # Delete prompt and image for joint inference. - del inputs["prompt"] - del inputs["image"] - # Can't pickle a Generator object - del inputs["generator"] - inputs["torch_device"] = torch_device - inputs["seed"] = seed - run_test_in_subprocess(test_case=self, target_func=_test_unidiffuser_compile, inputs=inputs) - @nightly @require_torch_accelerator
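
The pattern this diff converges on everywhere — torch.compiler.reset() in setUp/tearDown, then fresh_inductor_cache() and error_on_recompile=True around two identical forward passes — is what TorchCompileTesterMixin centralizes. Below is a minimal, self-contained sketch of that pattern; it is illustrative only and not part of the patch, and the toy nn.Sequential stands in for an actual diffusers model class.

    import torch
    import torch.nn as nn

    # Public replacement for the private torch._dynamo.reset() used before this patch.
    torch.compiler.reset()

    model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 8))
    model = torch.compile(model, fullgraph=True)  # fullgraph=True: any graph break is an error

    with (
        torch._inductor.utils.fresh_inductor_cache(),         # keep compiled artifacts isolated per test
        torch._dynamo.config.patch(error_on_recompile=True),  # turn any recompilation into a hard error
        torch.no_grad(),
    ):
        x = torch.randn(2, 8)
        _ = model(x)  # first call traces and compiles
        _ = model(x)  # identical shapes/dtypes: must reuse the compiled graph

Running the forward pass twice is the point of the test: the first call compiles, and the second call fails loudly if anything (shapes, guards, graph breaks) forces Dynamo to compile again.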