From f5846809cb1d79a707bd49fd0d49376dd9b2e244 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 28 May 2025 12:23:31 +0530 Subject: [PATCH 1/4] photodoodel added --- src/diffusers/pipelines/__init__.py | 4 + src/diffusers/pipelines/photodoodle/README.md | 66 ++ .../pipelines/photodoodle/__init__.py | 39 ++ .../photodoodle/pipeline_photodoodle.py | 613 ++++++++++++++++++ .../dummy_torch_and_transformers_objects.py | 15 + 5 files changed, 737 insertions(+) create mode 100644 src/diffusers/pipelines/photodoodle/README.md create mode 100644 src/diffusers/pipelines/photodoodle/__init__.py create mode 100644 src/diffusers/pipelines/photodoodle/pipeline_photodoodle.py diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index b00530a669ea..4e20335b5f55 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -30,6 +30,7 @@ "ledits_pp": [], "marigold": [], "pag": [], + "photodoodle": [], "stable_diffusion": [], "stable_diffusion_xl": [], } @@ -53,6 +54,7 @@ _import_structure["ddpm"] = ["DDPMPipeline"] _import_structure["dit"] = ["DiTPipeline"] _import_structure["latent_diffusion"].extend(["LDMSuperResolutionPipeline"]) + _import_structure["photodoodle"].extend(["PhotoDoodlePipeline"]) _import_structure["pipeline_utils"] = [ "AudioPipelineOutput", "DiffusionPipeline", @@ -286,6 +288,7 @@ _import_structure["mochi"] = ["MochiPipeline"] _import_structure["musicldm"] = ["MusicLDMPipeline"] _import_structure["omnigen"] = ["OmniGenPipeline"] + _import_structure["photodoodle"].extend(["PhotoDoodlePipeline"]) _import_structure["visualcloze"] = ["VisualClozePipeline", "VisualClozeGenerationPipeline"] _import_structure["paint_by_example"] = ["PaintByExamplePipeline"] _import_structure["pia"] = ["PIAPipeline"] @@ -492,6 +495,7 @@ from .deprecated import KarrasVePipeline, LDMPipeline, PNDMPipeline, RePaintPipeline, ScoreSdeVePipeline from .dit import DiTPipeline from .latent_diffusion import LDMSuperResolutionPipeline + from .photodoodle import PhotoDoodlePipeline from .pipeline_utils import ( AudioPipelineOutput, DiffusionPipeline, diff --git a/src/diffusers/pipelines/photodoodle/README.md b/src/diffusers/pipelines/photodoodle/README.md new file mode 100644 index 000000000000..67fcee3ae97f --- /dev/null +++ b/src/diffusers/pipelines/photodoodle/README.md @@ -0,0 +1,66 @@ +# PhotoDoodle Pipeline + +The PhotoDoodle pipeline is designed for image generation with conditional image input. It uses a combination of text and image conditioning to generate high-quality images. + +## Model Architecture + +The pipeline uses the following components: + +1. **Transformer**: A FluxTransformer2DModel for denoising image latents +2. **VAE**: An AutoencoderKL for encoding/decoding images +3. **Text Encoders**: + - CLIP text encoder for initial text embedding + - T5 encoder for additional text understanding +4. 
**Scheduler**: FlowMatchEulerDiscreteScheduler for the diffusion process + +## Usage + +```python +from diffusers import PhotoDoodlePipeline +import torch + +pipeline = PhotoDoodlePipeline.from_pretrained("black-forest-labs/FLUX.1-dev") +pipeline = pipeline.to("cuda") +# Load initial model weights +pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle", weight_name="pretrain.safetensors") +pipeline.fuse_lora() +pipeline.unload_lora_weights() + +pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle",weight_name="sksmagiceffects.safetensors") + +# Generate image with text prompt and condition image +prompt = "add a halo and wings for the cat by sksmagiceffects" +condition_image = load_image("path/to/condition.jpg") # PIL Image +output = pipeline( + prompt=prompt, + condition_image=condition_image, + num_inference_steps=28, + guidance_scale=3.5 +) + +# Save the generated image +output.images[0].save("generated_image.png") +``` + +## Parameters + +- `prompt`: Text prompt for image generation +- `prompt_2`: Optional secondary prompt for T5 encoder +- `condition_image`: Input image for conditioning +- `height`: Output image height (default: 512) +- `width`: Output image width (default: 512) +- `num_inference_steps`: Number of denoising steps (default: 28) +- `guidance_scale`: Classifier-free guidance scale (default: 3.5) +- `num_images_per_prompt`: Number of images to generate per prompt +- `generator`: Random number generator for reproducibility +- `output_type`: Output format ("pil", "latent", or "pt") + +## Features + +- Dual text encoder architecture (CLIP + T5) +- Image conditioning support +- Position encoding for better spatial understanding +- Support for LoRA fine-tuning +- VAE slicing and tiling for memory efficiency +- Progress bar during generation +- Callback support for step-by-step monitoring \ No newline at end of file diff --git a/src/diffusers/pipelines/photodoodle/__init__.py b/src/diffusers/pipelines/photodoodle/__init__.py new file mode 100644 index 000000000000..d5dac3eb75d9 --- /dev/null +++ b/src/diffusers/pipelines/photodoodle/__init__.py @@ -0,0 +1,39 @@ +""" +PhotoDoodle pipeline for image generation. +""" + +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, + is_transformers_available, +) + +_dummy_objects = {} +_import_structure = { + "pipeline_photodoodle": ["PhotoDoodlePipeline"], +} + +try: + if not (is_torch_available() and is_transformers_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 +else: + from .pipeline_photodoodle import PhotoDoodlePipeline + +if TYPE_CHECKING: + from .pipeline_photodoodle import PhotoDoodlePipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) \ No newline at end of file diff --git a/src/diffusers/pipelines/photodoodle/pipeline_photodoodle.py b/src/diffusers/pipelines/photodoodle/pipeline_photodoodle.py new file mode 100644 index 000000000000..d7b46d9cb7bb --- /dev/null +++ b/src/diffusers/pipelines/photodoodle/pipeline_photodoodle.py @@ -0,0 +1,613 @@ +""" +PhotoDoodle pipeline for image generation. 
+""" + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast + +from ...image_processor import VaeImageProcessor +from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin +from ...models.autoencoders import AutoencoderKL +from ...models.transformers import FluxTransformer2DModel +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import ( + USE_PEFT_BACKEND, + is_torch_xla_available, + logging, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from ..flux.pipeline_output import FluxPipelineOutput + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +def calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.16, +): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + +def prepare_latent_image_ids_2(height, width, device, dtype): + latent_image_ids = torch.zeros(height//2, width//2, 3, device=device, dtype=dtype) + latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height//2, device=device)[:, None] # y coordinate + latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width//2, device=device)[None, :] # x coordinate + return latent_image_ids + +def position_encoding_clone(batch_size, original_height, original_width, device, dtype): + latent_image_ids = prepare_latent_image_ids_2(original_height, original_width, device, dtype) + latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape + latent_image_ids = latent_image_ids.reshape( + latent_image_id_height * latent_image_id_width, latent_image_id_channels + ) + cond_latent_image_ids = latent_image_ids + latent_image_ids = torch.concat([latent_image_ids, cond_latent_image_ids], dim=-2) + return latent_image_ids + +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + """Retrieve timesteps from scheduler and handle custom timesteps/sigmas.""" + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + +class PhotoDoodlePipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin): + r""" + PhotoDoodle pipeline for image generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines. + + Args: + transformer ([`FluxTransformer2DModel`]): + Conditional Transformer (MMDiT) architecture to denoise the encoded image latents. + scheduler ([`FlowMatchEulerDiscreteScheduler`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + text_encoder_2 ([`T5EncoderModel`]): + Second frozen text encoder ([t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl)). + tokenizer (`CLIPTokenizer`): + A tokenizer for the text encoder. + tokenizer_2 (`T5TokenizerFast`): + A tokenizer for the second text encoder. 
+ """ + + model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae" + _optional_components = [] + _callback_tensor_inputs = ["latents", "prompt_embeds"] + + def __init__( + self, + scheduler: FlowMatchEulerDiscreteScheduler, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + text_encoder_2: T5EncoderModel, + tokenizer_2: T5TokenizerFast, + transformer: FluxTransformer2DModel, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + transformer=transformer, + scheduler=scheduler, + ) + self.vae_scale_factor = ( + 2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16 + ) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.tokenizer_max_length = ( + self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77 + ) + self.default_sample_size = 64 + + def _get_t5_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + num_images_per_prompt: int = 1, + max_sequence_length: int = 512, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + device = device or self._execution_device + dtype = dtype or self.text_encoder.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + text_inputs = self.tokenizer_2( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + return_length=False, + return_overflowing_tokens=False, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1: -1]) + logger.warning( + "The following part of your input was truncated because `max_sequence_length` is set to " + f" {max_sequence_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0] + + dtype = self.text_encoder_2.dtype + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + _, seq_len, _ = prompt_embeds.shape + + # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + return prompt_embeds + + def _get_clip_prompt_embeds( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int = 1, + device: Optional[torch.device] = None, + ): + device = device or self._execution_device + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer_max_length, + truncation=True, + return_overflowing_tokens=False, + return_length=False, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1: -1]) 
+ logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer_max_length} tokens: {removed_text}" + ) + prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False) + + # Use pooled output of CLIPTextModel + prompt_embeds = prompt_embeds.pooler_output + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1) + + return prompt_embeds + + def encode_prompt( + self, + prompt: Union[str, List[str]], + prompt_2: Union[str, List[str]], + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + max_sequence_length: int = 512, + lora_scale: Optional[float] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in all text-encoders + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
+ """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder, lora_scale) + if self.text_encoder_2 is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # We only use the pooled prompt output from the CLIPTextModel + pooled_prompt_embeds = self._get_clip_prompt_embeds( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + ) + prompt_embeds = self._get_t5_prompt_embeds( + prompt=prompt_2, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + ) + + if self.text_encoder is not None: + if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + + dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype + text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype) + + return prompt_embeds, pooled_prompt_embeds, text_ids + + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.vae.enable_tiling() + + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. 
+ """ + self.vae.disable_tiling() + + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + condition_image=None, + ): + height = 2 * (int(height) // self.vae_scale_factor) + width = 2 * (int(width) // self.vae_scale_factor) + + shape = (batch_size, num_channels_latents, height, width) # 1 16 106 80 + + if latents is not None: + latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype) + return latents.to(device=device, dtype=dtype), latent_image_ids + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + if condition_image is not None: + condition_image = condition_image.to(device=device, dtype=dtype) + image_latents = self._encode_vae_image(image=condition_image, generator=generator) + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // image_latents.shape[0] + image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0) + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." + ) + else: + image_latents = torch.cat([image_latents], dim=0) + + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width) + cond_latents = self._pack_latents(image_latents, batch_size, num_channels_latents, height, width) + latents = torch.concat([latents, cond_latents], dim=-2) + + latent_image_ids = position_encoding_clone(batch_size, height, width, device, dtype) # add position + + mask1 = torch.ones(shape, device=device, dtype=dtype) + mask2 = torch.zeros(shape, device=device, dtype=dtype) + mask1 = self._pack_latents(mask1, batch_size, num_channels_latents, height, width) # 1 4096 64 + mask2 = self._pack_latents(mask2, batch_size, num_channels_latents, height, width) # 1 4096 64 + mask = torch.concat([mask1, mask2], dim=-2) + return latents, latent_image_ids, mask, cond_latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def joint_attention_kwargs(self): + return self._joint_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 28, + timesteps: List[int] = None, + guidance_scale: float = 3.5, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = 
["latents"], + max_sequence_length: int = 512, + condition_image=None, + ): + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + height, + width, + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + max_sequence_length=max_sequence_length, + ) + + self._guidance_scale = guidance_scale + self._joint_attention_kwargs = joint_attention_kwargs + self._interrupt = False + + condition_image = self.image_processor.preprocess(condition_image, height=height, width=width) + condition_image = condition_image.to(dtype=torch.float32) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + lora_scale = ( + self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None + ) + ( + prompt_embeds, + pooled_prompt_embeds, + text_ids, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + device=device, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + lora_scale=lora_scale, + ) + + # 4. Prepare latent variables + num_channels_latents = self.transformer.config.in_channels // 4 # 16 + latents, latent_image_ids, mask, cond_latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + condition_image + ) + clean_latents = latents.clone() + + # 5. Prepare timesteps + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + image_seq_len = latents.shape[1] + mu = calculate_shift( + image_seq_len, + self.scheduler.config.base_image_seq_len, + self.scheduler.config.max_image_seq_len, + self.scheduler.config.base_shift, + self.scheduler.config.max_shift, + ) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + timesteps, + sigmas, + mu=mu, + ) + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + self._num_timesteps = len(timesteps) + + # handle guidance + if self.transformer.config.guidance_embeds: + guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32) + guidance = guidance.expand(latents.shape[0]) + else: + guidance = None + + # 6. 
Denoising loop + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latents.shape[0]).to(latents.dtype) + noise_pred = self.transformer( + hidden_states=latents, # 1 4096 64 + timestep=timestep / 1000, + guidance=guidance, + pooled_projections=pooled_prompt_embeds, + encoder_hidden_states=prompt_embeds, + txt_ids=text_ids, + img_ids=latent_image_ids, + joint_attention_kwargs=self.joint_attention_kwargs, + return_dict=False, + )[0] + + # compute the previous noisy sample x_t -> x_t-1 + latents_dtype = latents.dtype + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + latents = latents * mask + clean_latents * (1 - mask) + + if latents.dtype != latents_dtype: + if torch.backends.mps.is_available(): + # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 + latents = latents.to(latents_dtype) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if XLA_AVAILABLE: + xm.mark_step() + + if output_type == "latent": + image = latents + + else: + latents = self._unpack_latents(latents[:,:latents.shape[-2]-cond_latents.shape[-2],:], height, width, self.vae_scale_factor) + latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor + image = self.vae.decode(latents.to(dtype=self.vae.dtype), return_dict=False)[0] + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return FluxPipelineOutput(images=image) \ No newline at end of file diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 159d81add355..e69534ac6aa5 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -2955,3 +2955,18 @@ def from_config(cls, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) + + +class PhotoDoodlePipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) From 1331f56da5c9729b8faa52fe238fc3b616b7908f Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 28 May 2025 12:25:13 +0530 Subject: [PATCH 2/4] photodoodle added --- src/diffusers/utils/dummy_torch_and_transformers_objects.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index e69534ac6aa5..922a17ab1621 100644 --- 
a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -2955,6 +2955,7 @@ def from_config(cls, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) + class PhotoDoodlePipeline(metaclass=DummyObject): From 5c9814ce093c3b66cfd00d0b128aac6b2119ea19 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 28 May 2025 13:00:11 +0530 Subject: [PATCH 3/4] PhotoDoodle by Ameer --- src/diffusers/__init__.py | 3 + src/diffusers/pipelines/__init__.py | 1574 ++++++++++------- .../photodoodle/pipeline_photodoodle.py | 178 +- 3 files changed, 1146 insertions(+), 609 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 8c4ae36c5654..68dd3e69870e 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -38,6 +38,7 @@ "loaders": ["FromOriginalModelMixin"], "models": [], "pipelines": [], + "pipelines.photodoodle": ["PhotoDoodlePipeline"], "quantizers.quantization_config": [], "schedulers": [], "utils": [ @@ -340,6 +341,7 @@ "AnimateDiffControlNetPipeline", "AnimateDiffPAGPipeline", "AnimateDiffPipeline", + "PhotoDoodlePipeline", "AnimateDiffSDXLPipeline", "AnimateDiffSparseControlNetPipeline", "AnimateDiffVideoToVideoControlNetPipeline", @@ -927,6 +929,7 @@ AnimateDiffControlNetPipeline, AnimateDiffPAGPipeline, AnimateDiffPipeline, + PhotoDoodlePipeline, AnimateDiffSDXLPipeline, AnimateDiffSparseControlNetPipeline, AnimateDiffVideoToVideoControlNetPipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 4e20335b5f55..68dd3e69870e 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -1,401 +1,596 @@ +__version__ = "0.34.0.dev0" + from typing import TYPE_CHECKING -from ..utils import ( +from .utils import ( DIFFUSERS_SLOW_IMPORT, OptionalDependencyNotAvailable, _LazyModule, - get_objects_from_module, + is_accelerate_available, + is_bitsandbytes_available, is_flax_available, + is_gguf_available, is_k_diffusion_available, is_librosa_available, is_note_seq_available, is_onnx_available, is_opencv_available, + is_optimum_quanto_available, + is_scipy_available, is_sentencepiece_available, is_torch_available, - is_torch_npu_available, + is_torchao_available, + is_torchsde_available, is_transformers_available, ) -# These modules contain pipelines from multiple libraries/frameworks -_dummy_objects = {} +# Lazy Import based on +# https://github.com/huggingface/transformers/blob/main/src/transformers/__init__.py + +# When adding a new object to this init, please add it to `_import_structure`. The `_import_structure` is a dictionary submodule to list of object names, +# and is used to defer the actual importing for when the objects are requested. +# This way `import diffusers` provides the names in the namespace without actually importing anything (and especially none of the backends). 
+ _import_structure = { - "controlnet": [], - "controlnet_hunyuandit": [], - "controlnet_sd3": [], - "controlnet_xs": [], - "deprecated": [], - "latent_diffusion": [], - "ledits_pp": [], - "marigold": [], - "pag": [], - "photodoodle": [], - "stable_diffusion": [], - "stable_diffusion_xl": [], + "configuration_utils": ["ConfigMixin"], + "hooks": [], + "loaders": ["FromOriginalModelMixin"], + "models": [], + "pipelines": [], + "pipelines.photodoodle": ["PhotoDoodlePipeline"], + "quantizers.quantization_config": [], + "schedulers": [], + "utils": [ + "OptionalDependencyNotAvailable", + "is_flax_available", + "is_inflect_available", + "is_invisible_watermark_available", + "is_k_diffusion_available", + "is_k_diffusion_version", + "is_librosa_available", + "is_note_seq_available", + "is_onnx_available", + "is_scipy_available", + "is_torch_available", + "is_torchsde_available", + "is_transformers_available", + "is_transformers_version", + "is_unidecode_available", + "logging", + ], } try: - if not is_torch_available(): + if not is_torch_available() and not is_accelerate_available() and not is_bitsandbytes_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import dummy_pt_objects # noqa F403 + from .utils import dummy_bitsandbytes_objects - _dummy_objects.update(get_objects_from_module(dummy_pt_objects)) + _import_structure["utils.dummy_bitsandbytes_objects"] = [ + name for name in dir(dummy_bitsandbytes_objects) if not name.startswith("_") + ] else: - _import_structure["auto_pipeline"] = [ - "AutoPipelineForImage2Image", - "AutoPipelineForInpainting", - "AutoPipelineForText2Image", + _import_structure["quantizers.quantization_config"].append("BitsAndBytesConfig") + +try: + if not is_torch_available() and not is_accelerate_available() and not is_gguf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_gguf_objects + + _import_structure["utils.dummy_gguf_objects"] = [ + name for name in dir(dummy_gguf_objects) if not name.startswith("_") ] - _import_structure["consistency_models"] = ["ConsistencyModelPipeline"] - _import_structure["dance_diffusion"] = ["DanceDiffusionPipeline"] - _import_structure["ddim"] = ["DDIMPipeline"] - _import_structure["ddpm"] = ["DDPMPipeline"] - _import_structure["dit"] = ["DiTPipeline"] - _import_structure["latent_diffusion"].extend(["LDMSuperResolutionPipeline"]) - _import_structure["photodoodle"].extend(["PhotoDoodlePipeline"]) - _import_structure["pipeline_utils"] = [ - "AudioPipelineOutput", - "DiffusionPipeline", - "StableDiffusionMixin", - "ImagePipelineOutput", +else: + _import_structure["quantizers.quantization_config"].append("GGUFQuantizationConfig") + +try: + if not is_torch_available() and not is_accelerate_available() and not is_torchao_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torchao_objects + + _import_structure["utils.dummy_torchao_objects"] = [ + name for name in dir(dummy_torchao_objects) if not name.startswith("_") ] - _import_structure["deprecated"].extend( - [ - "PNDMPipeline", - "LDMPipeline", - "RePaintPipeline", - "ScoreSdeVePipeline", - "KarrasVePipeline", - ] - ) +else: + _import_structure["quantizers.quantization_config"].append("TorchAoConfig") + try: - if not (is_torch_available() and is_librosa_available()): + if not is_torch_available() and not is_accelerate_available() and not is_optimum_quanto_available(): raise 
OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import dummy_torch_and_librosa_objects # noqa F403 + from .utils import dummy_optimum_quanto_objects - _dummy_objects.update(get_objects_from_module(dummy_torch_and_librosa_objects)) + _import_structure["utils.dummy_optimum_quanto_objects"] = [ + name for name in dir(dummy_optimum_quanto_objects) if not name.startswith("_") + ] else: - _import_structure["deprecated"].extend(["AudioDiffusionPipeline", "Mel"]) + _import_structure["quantizers.quantization_config"].append("QuantoConfig") try: - if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + if not is_onnx_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import dummy_transformers_and_torch_and_note_seq_objects # noqa F403 + from .utils import dummy_onnx_objects # noqa F403 + + _import_structure["utils.dummy_onnx_objects"] = [ + name for name in dir(dummy_onnx_objects) if not name.startswith("_") + ] - _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects)) else: - _import_structure["deprecated"].extend( - [ - "MidiProcessor", - "SpectrogramDiffusionPipeline", - ] - ) + _import_structure["pipelines"].extend(["OnnxRuntimeModel"]) try: - if not (is_torch_available() and is_transformers_available()): + if not is_torch_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import dummy_torch_and_transformers_objects # noqa F403 + from .utils import dummy_pt_objects # noqa F403 + + _import_structure["utils.dummy_pt_objects"] = [name for name in dir(dummy_pt_objects) if not name.startswith("_")] - _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["deprecated"].extend( - [ - "VQDiffusionPipeline", - "AltDiffusionPipeline", - "AltDiffusionImg2ImgPipeline", - "CycleDiffusionPipeline", - "StableDiffusionInpaintPipelineLegacy", - "StableDiffusionPix2PixZeroPipeline", - "StableDiffusionParadigmsPipeline", - "StableDiffusionModelEditingPipeline", - "VersatileDiffusionDualGuidedPipeline", - "VersatileDiffusionImageVariationPipeline", - "VersatileDiffusionPipeline", - "VersatileDiffusionTextToImagePipeline", - ] - ) - _import_structure["allegro"] = ["AllegroPipeline"] - _import_structure["amused"] = ["AmusedImg2ImgPipeline", "AmusedInpaintPipeline", "AmusedPipeline"] - _import_structure["animatediff"] = [ - "AnimateDiffPipeline", - "AnimateDiffControlNetPipeline", - "AnimateDiffSDXLPipeline", - "AnimateDiffSparseControlNetPipeline", - "AnimateDiffVideoToVideoPipeline", - "AnimateDiffVideoToVideoControlNetPipeline", - ] - _import_structure["flux"] = [ - "FluxControlPipeline", - "FluxControlInpaintPipeline", - "FluxControlImg2ImgPipeline", - "FluxControlNetPipeline", - "FluxControlNetImg2ImgPipeline", - "FluxControlNetInpaintPipeline", - "FluxImg2ImgPipeline", - "FluxInpaintPipeline", - "FluxPipeline", - "FluxFillPipeline", - "FluxPriorReduxPipeline", - "ReduxImageEncoder", - ] - _import_structure["audioldm"] = ["AudioLDMPipeline"] - _import_structure["audioldm2"] = [ - "AudioLDM2Pipeline", - "AudioLDM2ProjectionModel", - "AudioLDM2UNet2DConditionModel", - ] - _import_structure["blip_diffusion"] = ["BlipDiffusionPipeline"] - _import_structure["cogvideo"] = [ - "CogVideoXPipeline", - "CogVideoXImageToVideoPipeline", - "CogVideoXVideoToVideoPipeline", - "CogVideoXFunControlPipeline", - ] - _import_structure["cogview3"] = 
["CogView3PlusPipeline"] - _import_structure["cogview4"] = ["CogView4Pipeline", "CogView4ControlPipeline"] - _import_structure["consisid"] = ["ConsisIDPipeline"] - _import_structure["cosmos"] = ["CosmosTextToWorldPipeline", "CosmosVideoToWorldPipeline"] - _import_structure["controlnet"].extend( - [ - "BlipDiffusionControlNetPipeline", - "StableDiffusionControlNetImg2ImgPipeline", - "StableDiffusionControlNetInpaintPipeline", - "StableDiffusionControlNetPipeline", - "StableDiffusionXLControlNetImg2ImgPipeline", - "StableDiffusionXLControlNetInpaintPipeline", - "StableDiffusionXLControlNetPipeline", - "StableDiffusionXLControlNetUnionPipeline", - "StableDiffusionXLControlNetUnionInpaintPipeline", - "StableDiffusionXLControlNetUnionImg2ImgPipeline", - ] - ) - _import_structure["pag"].extend( + _import_structure["hooks"].extend( [ - "StableDiffusionControlNetPAGInpaintPipeline", - "AnimateDiffPAGPipeline", - "KolorsPAGPipeline", - "HunyuanDiTPAGPipeline", - "StableDiffusion3PAGPipeline", - "StableDiffusion3PAGImg2ImgPipeline", - "StableDiffusionPAGPipeline", - "StableDiffusionPAGImg2ImgPipeline", - "StableDiffusionPAGInpaintPipeline", - "StableDiffusionControlNetPAGPipeline", - "StableDiffusionXLPAGPipeline", - "StableDiffusionXLPAGInpaintPipeline", - "StableDiffusionXLControlNetPAGImg2ImgPipeline", - "StableDiffusionXLControlNetPAGPipeline", - "StableDiffusionXLPAGImg2ImgPipeline", - "PixArtSigmaPAGPipeline", - "SanaPAGPipeline", + "FasterCacheConfig", + "HookRegistry", + "PyramidAttentionBroadcastConfig", + "apply_faster_cache", + "apply_pyramid_attention_broadcast", ] ) - _import_structure["controlnet_xs"].extend( + _import_structure["models"].extend( [ - "StableDiffusionControlNetXSPipeline", - "StableDiffusionXLControlNetXSPipeline", + "AllegroTransformer3DModel", + "AsymmetricAutoencoderKL", + "AuraFlowTransformer2DModel", + "AutoencoderDC", + "AutoencoderKL", + "AutoencoderKLAllegro", + "AutoencoderKLCogVideoX", + "AutoencoderKLCosmos", + "AutoencoderKLHunyuanVideo", + "AutoencoderKLLTXVideo", + "AutoencoderKLMagvit", + "AutoencoderKLMochi", + "AutoencoderKLTemporalDecoder", + "AutoencoderKLWan", + "AutoencoderOobleck", + "AutoencoderTiny", + "AutoModel", + "CacheMixin", + "CogVideoXTransformer3DModel", + "CogView3PlusTransformer2DModel", + "CogView4Transformer2DModel", + "ConsisIDTransformer3DModel", + "ConsistencyDecoderVAE", + "ControlNetModel", + "ControlNetUnionModel", + "ControlNetXSAdapter", + "CosmosTransformer3DModel", + "DiTTransformer2DModel", + "EasyAnimateTransformer3DModel", + "FluxControlNetModel", + "FluxMultiControlNetModel", + "FluxTransformer2DModel", + "HiDreamImageTransformer2DModel", + "HunyuanDiT2DControlNetModel", + "HunyuanDiT2DModel", + "HunyuanDiT2DMultiControlNetModel", + "HunyuanVideoFramepackTransformer3DModel", + "HunyuanVideoTransformer3DModel", + "I2VGenXLUNet", + "Kandinsky3UNet", + "LatteTransformer3DModel", + "LTXVideoTransformer3DModel", + "Lumina2Transformer2DModel", + "LuminaNextDiT2DModel", + "MochiTransformer3DModel", + "ModelMixin", + "MotionAdapter", + "MultiAdapter", + "MultiControlNetModel", + "OmniGenTransformer2DModel", + "PixArtTransformer2DModel", + "PriorTransformer", + "SanaControlNetModel", + "SanaTransformer2DModel", + "SD3ControlNetModel", + "SD3MultiControlNetModel", + "SD3Transformer2DModel", + "SparseControlNetModel", + "StableAudioDiTModel", + "StableCascadeUNet", + "T2IAdapter", + "T5FilmDecoder", + "Transformer2DModel", + "TransformerTemporalModel", + "UNet1DModel", + "UNet2DConditionModel", + "UNet2DModel", + 
"UNet3DConditionModel", + "UNetControlNetXSModel", + "UNetMotionModel", + "UNetSpatioTemporalConditionModel", + "UVit2DModel", + "VQModel", + "WanTransformer3DModel", ] ) - _import_structure["controlnet_hunyuandit"].extend( + _import_structure["optimization"] = [ + "get_constant_schedule", + "get_constant_schedule_with_warmup", + "get_cosine_schedule_with_warmup", + "get_cosine_with_hard_restarts_schedule_with_warmup", + "get_linear_schedule_with_warmup", + "get_polynomial_decay_schedule_with_warmup", + "get_scheduler", + ] + _import_structure["pipelines"].extend( [ - "HunyuanDiTControlNetPipeline", + "AudioPipelineOutput", + "AutoPipelineForImage2Image", + "AutoPipelineForInpainting", + "AutoPipelineForText2Image", + "ConsistencyModelPipeline", + "DanceDiffusionPipeline", + "DDIMPipeline", + "DDPMPipeline", + "DiffusionPipeline", + "DiTPipeline", + "ImagePipelineOutput", + "KarrasVePipeline", + "LDMPipeline", + "LDMSuperResolutionPipeline", + "PNDMPipeline", + "RePaintPipeline", + "ScoreSdeVePipeline", + "StableDiffusionMixin", ] ) - _import_structure["controlnet_sd3"].extend( + _import_structure["quantizers"] = ["DiffusersQuantizer"] + _import_structure["schedulers"].extend( [ - "StableDiffusion3ControlNetPipeline", - "StableDiffusion3ControlNetInpaintingPipeline", + "AmusedScheduler", + "CMStochasticIterativeScheduler", + "CogVideoXDDIMScheduler", + "CogVideoXDPMScheduler", + "DDIMInverseScheduler", + "DDIMParallelScheduler", + "DDIMScheduler", + "DDPMParallelScheduler", + "DDPMScheduler", + "DDPMWuerstchenScheduler", + "DEISMultistepScheduler", + "DPMSolverMultistepInverseScheduler", + "DPMSolverMultistepScheduler", + "DPMSolverSinglestepScheduler", + "EDMDPMSolverMultistepScheduler", + "EDMEulerScheduler", + "EulerAncestralDiscreteScheduler", + "EulerDiscreteScheduler", + "FlowMatchEulerDiscreteScheduler", + "FlowMatchHeunDiscreteScheduler", + "FlowMatchLCMScheduler", + "HeunDiscreteScheduler", + "IPNDMScheduler", + "KarrasVeScheduler", + "KDPM2AncestralDiscreteScheduler", + "KDPM2DiscreteScheduler", + "LCMScheduler", + "PNDMScheduler", + "RePaintScheduler", + "SASolverScheduler", + "SchedulerMixin", + "SCMScheduler", + "ScoreSdeVeScheduler", + "TCDScheduler", + "UnCLIPScheduler", + "UniPCMultistepScheduler", + "VQDiffusionScheduler", ] ) - _import_structure["deepfloyd_if"] = [ - "IFImg2ImgPipeline", - "IFImg2ImgSuperResolutionPipeline", - "IFInpaintingPipeline", - "IFInpaintingSuperResolutionPipeline", - "IFPipeline", - "IFSuperResolutionPipeline", - ] - _import_structure["easyanimate"] = [ - "EasyAnimatePipeline", - "EasyAnimateInpaintPipeline", - "EasyAnimateControlPipeline", - ] - _import_structure["hidream_image"] = ["HiDreamImagePipeline"] - _import_structure["hunyuandit"] = ["HunyuanDiTPipeline"] - _import_structure["hunyuan_video"] = [ - "HunyuanVideoPipeline", - "HunyuanSkyreelsImageToVideoPipeline", - "HunyuanVideoImageToVideoPipeline", - "HunyuanVideoFramepackPipeline", - ] - _import_structure["kandinsky"] = [ - "KandinskyCombinedPipeline", - "KandinskyImg2ImgCombinedPipeline", - "KandinskyImg2ImgPipeline", - "KandinskyInpaintCombinedPipeline", - "KandinskyInpaintPipeline", - "KandinskyPipeline", - "KandinskyPriorPipeline", - ] - _import_structure["kandinsky2_2"] = [ - "KandinskyV22CombinedPipeline", - "KandinskyV22ControlnetImg2ImgPipeline", - "KandinskyV22ControlnetPipeline", - "KandinskyV22Img2ImgCombinedPipeline", - "KandinskyV22Img2ImgPipeline", - "KandinskyV22InpaintCombinedPipeline", - "KandinskyV22InpaintPipeline", - "KandinskyV22Pipeline", - 
"KandinskyV22PriorEmb2EmbPipeline", - "KandinskyV22PriorPipeline", + _import_structure["training_utils"] = ["EMAModel"] + +try: + if not (is_torch_available() and is_scipy_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torch_and_scipy_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_scipy_objects"] = [ + name for name in dir(dummy_torch_and_scipy_objects) if not name.startswith("_") ] - _import_structure["kandinsky3"] = [ - "Kandinsky3Img2ImgPipeline", - "Kandinsky3Pipeline", + +else: + _import_structure["schedulers"].extend(["LMSDiscreteScheduler"]) + +try: + if not (is_torch_available() and is_torchsde_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torch_and_torchsde_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_torchsde_objects"] = [ + name for name in dir(dummy_torch_and_torchsde_objects) if not name.startswith("_") ] - _import_structure["latent_consistency_models"] = [ - "LatentConsistencyModelImg2ImgPipeline", - "LatentConsistencyModelPipeline", + +else: + _import_structure["schedulers"].extend(["CosineDPMSolverMultistepScheduler", "DPMSolverSDEScheduler"]) + +try: + if not (is_torch_available() and is_transformers_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torch_and_transformers_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_transformers_objects"] = [ + name for name in dir(dummy_torch_and_transformers_objects) if not name.startswith("_") ] - _import_structure["latent_diffusion"].extend(["LDMTextToImagePipeline"]) - _import_structure["ledits_pp"].extend( + +else: + _import_structure["pipelines"].extend( [ + "AllegroPipeline", + "AltDiffusionImg2ImgPipeline", + "AltDiffusionPipeline", + "AmusedImg2ImgPipeline", + "AmusedInpaintPipeline", + "AmusedPipeline", + "AnimateDiffControlNetPipeline", + "AnimateDiffPAGPipeline", + "AnimateDiffPipeline", + "PhotoDoodlePipeline", + "AnimateDiffSDXLPipeline", + "AnimateDiffSparseControlNetPipeline", + "AnimateDiffVideoToVideoControlNetPipeline", + "AnimateDiffVideoToVideoPipeline", + "AudioLDM2Pipeline", + "AudioLDM2ProjectionModel", + "AudioLDM2UNet2DConditionModel", + "AudioLDMPipeline", + "AuraFlowPipeline", + "BlipDiffusionControlNetPipeline", + "BlipDiffusionPipeline", + "CLIPImageProjection", + "CogVideoXFunControlPipeline", + "CogVideoXImageToVideoPipeline", + "CogVideoXPipeline", + "CogVideoXVideoToVideoPipeline", + "CogView3PlusPipeline", + "CogView4ControlPipeline", + "CogView4Pipeline", + "ConsisIDPipeline", + "CosmosTextToWorldPipeline", + "CosmosVideoToWorldPipeline", + "CycleDiffusionPipeline", + "EasyAnimateControlPipeline", + "EasyAnimateInpaintPipeline", + "EasyAnimatePipeline", + "FluxControlImg2ImgPipeline", + "FluxControlInpaintPipeline", + "FluxControlNetImg2ImgPipeline", + "FluxControlNetInpaintPipeline", + "FluxControlNetPipeline", + "FluxControlPipeline", + "FluxFillPipeline", + "FluxImg2ImgPipeline", + "FluxInpaintPipeline", + "FluxPipeline", + "FluxPriorReduxPipeline", + "HiDreamImagePipeline", + "HunyuanDiTControlNetPipeline", + "HunyuanDiTPAGPipeline", + "HunyuanDiTPipeline", + "HunyuanSkyreelsImageToVideoPipeline", + "HunyuanVideoFramepackPipeline", + "HunyuanVideoImageToVideoPipeline", + "HunyuanVideoPipeline", + "I2VGenXLPipeline", + "IFImg2ImgPipeline", + "IFImg2ImgSuperResolutionPipeline", + "IFInpaintingPipeline", + 
"IFInpaintingSuperResolutionPipeline", + "IFPipeline", + "IFSuperResolutionPipeline", + "ImageTextPipelineOutput", + "Kandinsky3Img2ImgPipeline", + "Kandinsky3Pipeline", + "KandinskyCombinedPipeline", + "KandinskyImg2ImgCombinedPipeline", + "KandinskyImg2ImgPipeline", + "KandinskyInpaintCombinedPipeline", + "KandinskyInpaintPipeline", + "KandinskyPipeline", + "KandinskyPriorPipeline", + "KandinskyV22CombinedPipeline", + "KandinskyV22ControlnetImg2ImgPipeline", + "KandinskyV22ControlnetPipeline", + "KandinskyV22Img2ImgCombinedPipeline", + "KandinskyV22Img2ImgPipeline", + "KandinskyV22InpaintCombinedPipeline", + "KandinskyV22InpaintPipeline", + "KandinskyV22Pipeline", + "KandinskyV22PriorEmb2EmbPipeline", + "KandinskyV22PriorPipeline", + "LatentConsistencyModelImg2ImgPipeline", + "LatentConsistencyModelPipeline", + "LattePipeline", + "LDMTextToImagePipeline", "LEditsPPPipelineStableDiffusion", "LEditsPPPipelineStableDiffusionXL", - ] - ) - _import_structure["latte"] = ["LattePipeline"] - _import_structure["ltx"] = [ - "LTXPipeline", - "LTXImageToVideoPipeline", - "LTXConditionPipeline", - "LTXLatentUpsamplePipeline", - ] - _import_structure["lumina"] = ["LuminaPipeline", "LuminaText2ImgPipeline"] - _import_structure["lumina2"] = ["Lumina2Pipeline", "Lumina2Text2ImgPipeline"] - _import_structure["marigold"].extend( - [ + "LTXConditionPipeline", + "LTXImageToVideoPipeline", + "LTXLatentUpsamplePipeline", + "LTXPipeline", + "Lumina2Pipeline", + "Lumina2Text2ImgPipeline", + "LuminaPipeline", + "LuminaText2ImgPipeline", "MarigoldDepthPipeline", "MarigoldIntrinsicsPipeline", "MarigoldNormalsPipeline", - ] - ) - _import_structure["mochi"] = ["MochiPipeline"] - _import_structure["musicldm"] = ["MusicLDMPipeline"] - _import_structure["omnigen"] = ["OmniGenPipeline"] - _import_structure["photodoodle"].extend(["PhotoDoodlePipeline"]) - _import_structure["visualcloze"] = ["VisualClozePipeline", "VisualClozeGenerationPipeline"] - _import_structure["paint_by_example"] = ["PaintByExamplePipeline"] - _import_structure["pia"] = ["PIAPipeline"] - _import_structure["pixart_alpha"] = ["PixArtAlphaPipeline", "PixArtSigmaPipeline"] - _import_structure["sana"] = [ - "SanaPipeline", - "SanaSprintPipeline", - "SanaControlNetPipeline", - "SanaSprintImg2ImgPipeline", - ] - _import_structure["semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"] - _import_structure["shap_e"] = ["ShapEImg2ImgPipeline", "ShapEPipeline"] - _import_structure["stable_audio"] = [ - "StableAudioProjectionModel", - "StableAudioPipeline", - ] - _import_structure["stable_cascade"] = [ - "StableCascadeCombinedPipeline", - "StableCascadeDecoderPipeline", - "StableCascadePriorPipeline", - ] - _import_structure["stable_diffusion"].extend( - [ - "CLIPImageProjection", + "MochiPipeline", + "MusicLDMPipeline", + "OmniGenPipeline", + "PaintByExamplePipeline", + "PIAPipeline", + "PixArtAlphaPipeline", + "PixArtSigmaPAGPipeline", + "PixArtSigmaPipeline", + "ReduxImageEncoder", + "SanaControlNetPipeline", + "SanaPAGPipeline", + "SanaPipeline", + "SanaSprintImg2ImgPipeline", + "SanaSprintPipeline", + "SemanticStableDiffusionPipeline", + "ShapEImg2ImgPipeline", + "ShapEPipeline", + "StableAudioPipeline", + "StableAudioProjectionModel", + "StableCascadeCombinedPipeline", + "StableCascadeDecoderPipeline", + "StableCascadePriorPipeline", + "StableDiffusion3ControlNetInpaintingPipeline", + "StableDiffusion3ControlNetPipeline", + "StableDiffusion3Img2ImgPipeline", + "StableDiffusion3InpaintPipeline", + "StableDiffusion3PAGImg2ImgPipeline", + 
"StableDiffusion3PAGImg2ImgPipeline", + "StableDiffusion3PAGPipeline", + "StableDiffusion3Pipeline", + "StableDiffusionAdapterPipeline", + "StableDiffusionAttendAndExcitePipeline", + "StableDiffusionControlNetImg2ImgPipeline", + "StableDiffusionControlNetInpaintPipeline", + "StableDiffusionControlNetPAGInpaintPipeline", + "StableDiffusionControlNetPAGPipeline", + "StableDiffusionControlNetPipeline", + "StableDiffusionControlNetXSPipeline", "StableDiffusionDepth2ImgPipeline", + "StableDiffusionDiffEditPipeline", + "StableDiffusionGLIGENPipeline", + "StableDiffusionGLIGENTextImagePipeline", "StableDiffusionImageVariationPipeline", "StableDiffusionImg2ImgPipeline", "StableDiffusionInpaintPipeline", + "StableDiffusionInpaintPipelineLegacy", "StableDiffusionInstructPix2PixPipeline", "StableDiffusionLatentUpscalePipeline", + "StableDiffusionLDM3DPipeline", + "StableDiffusionModelEditingPipeline", + "StableDiffusionPAGImg2ImgPipeline", + "StableDiffusionPAGInpaintPipeline", + "StableDiffusionPAGPipeline", + "StableDiffusionPanoramaPipeline", + "StableDiffusionParadigmsPipeline", "StableDiffusionPipeline", + "StableDiffusionPipelineSafe", + "StableDiffusionPix2PixZeroPipeline", + "StableDiffusionSAGPipeline", "StableDiffusionUpscalePipeline", - "StableUnCLIPImg2ImgPipeline", - "StableUnCLIPPipeline", - "StableDiffusionLDM3DPipeline", - ] - ) - _import_structure["aura_flow"] = ["AuraFlowPipeline"] - _import_structure["stable_diffusion_3"] = [ - "StableDiffusion3Pipeline", - "StableDiffusion3Img2ImgPipeline", - "StableDiffusion3InpaintPipeline", - ] - _import_structure["stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"] - _import_structure["stable_diffusion_safe"] = ["StableDiffusionPipelineSafe"] - _import_structure["stable_diffusion_sag"] = ["StableDiffusionSAGPipeline"] - _import_structure["stable_diffusion_gligen"] = [ - "StableDiffusionGLIGENPipeline", - "StableDiffusionGLIGENTextImagePipeline", - ] - _import_structure["stable_video_diffusion"] = ["StableVideoDiffusionPipeline"] - _import_structure["stable_diffusion_xl"].extend( - [ + "StableDiffusionXLAdapterPipeline", + "StableDiffusionXLControlNetImg2ImgPipeline", + "StableDiffusionXLControlNetInpaintPipeline", + "StableDiffusionXLControlNetPAGImg2ImgPipeline", + "StableDiffusionXLControlNetPAGPipeline", + "StableDiffusionXLControlNetPipeline", + "StableDiffusionXLControlNetUnionImg2ImgPipeline", + "StableDiffusionXLControlNetUnionInpaintPipeline", + "StableDiffusionXLControlNetUnionPipeline", + "StableDiffusionXLControlNetXSPipeline", "StableDiffusionXLImg2ImgPipeline", "StableDiffusionXLInpaintPipeline", "StableDiffusionXLInstructPix2PixPipeline", + "StableDiffusionXLPAGImg2ImgPipeline", + "StableDiffusionXLPAGInpaintPipeline", + "StableDiffusionXLPAGPipeline", "StableDiffusionXLPipeline", + "StableUnCLIPImg2ImgPipeline", + "StableUnCLIPPipeline", + "StableVideoDiffusionPipeline", + "TextToVideoSDPipeline", + "TextToVideoZeroPipeline", + "TextToVideoZeroSDXLPipeline", + "UnCLIPImageVariationPipeline", + "UnCLIPPipeline", + "UniDiffuserModel", + "UniDiffuserPipeline", + "UniDiffuserTextDecoder", + "VersatileDiffusionDualGuidedPipeline", + "VersatileDiffusionImageVariationPipeline", + "VersatileDiffusionPipeline", + "VersatileDiffusionTextToImagePipeline", + "VideoToVideoSDPipeline", + "VisualClozeGenerationPipeline", + "VisualClozePipeline", + "VQDiffusionPipeline", + "WanImageToVideoPipeline", + "WanPipeline", + "WanVideoToVideoPipeline", + "WuerstchenCombinedPipeline", + "WuerstchenDecoderPipeline", + 
"WuerstchenPriorPipeline", ] ) - _import_structure["stable_diffusion_diffedit"] = ["StableDiffusionDiffEditPipeline"] - _import_structure["stable_diffusion_ldm3d"] = ["StableDiffusionLDM3DPipeline"] - _import_structure["stable_diffusion_panorama"] = ["StableDiffusionPanoramaPipeline"] - _import_structure["t2i_adapter"] = [ - "StableDiffusionAdapterPipeline", - "StableDiffusionXLAdapterPipeline", - ] - _import_structure["text_to_video_synthesis"] = [ - "TextToVideoSDPipeline", - "TextToVideoZeroPipeline", - "TextToVideoZeroSDXLPipeline", - "VideoToVideoSDPipeline", - ] - _import_structure["i2vgen_xl"] = ["I2VGenXLPipeline"] - _import_structure["unclip"] = ["UnCLIPImageVariationPipeline", "UnCLIPPipeline"] - _import_structure["unidiffuser"] = [ - "ImageTextPipelineOutput", - "UniDiffuserModel", - "UniDiffuserPipeline", - "UniDiffuserTextDecoder", + +try: + if not (is_torch_available() and is_transformers_available() and is_opencv_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torch_and_transformers_and_opencv_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_transformers_and_opencv_objects"] = [ + name for name in dir(dummy_torch_and_transformers_and_opencv_objects) if not name.startswith("_") ] - _import_structure["wuerstchen"] = [ - "WuerstchenCombinedPipeline", - "WuerstchenDecoderPipeline", - "WuerstchenPriorPipeline", + +else: + _import_structure["pipelines"].extend(["ConsisIDPipeline"]) + +try: + if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torch_and_transformers_and_k_diffusion_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_transformers_and_k_diffusion_objects"] = [ + name for name in dir(dummy_torch_and_transformers_and_k_diffusion_objects) if not name.startswith("_") ] - _import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline", "WanVideoToVideoPipeline"] + +else: + _import_structure["pipelines"].extend(["StableDiffusionKDiffusionPipeline", "StableDiffusionXLKDiffusionPipeline"]) + try: - if not is_onnx_available(): + if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import dummy_onnx_objects # noqa F403 + from .utils import dummy_torch_and_transformers_and_sentencepiece_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_transformers_and_sentencepiece_objects"] = [ + name for name in dir(dummy_torch_and_transformers_and_sentencepiece_objects) if not name.startswith("_") + ] - _dummy_objects.update(get_objects_from_module(dummy_onnx_objects)) else: - _import_structure["onnx_utils"] = ["OnnxRuntimeModel"] + _import_structure["pipelines"].extend(["KolorsImg2ImgPipeline", "KolorsPAGPipeline", "KolorsPipeline"]) + try: if not (is_torch_available() and is_transformers_available() and is_onnx_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import dummy_torch_and_transformers_and_onnx_objects # noqa F403 + from .utils import dummy_torch_and_transformers_and_onnx_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_transformers_and_onnx_objects"] = [ + name for name in dir(dummy_torch_and_transformers_and_onnx_objects) if not name.startswith("_") + ] - 
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_onnx_objects)) else: - _import_structure["stable_diffusion"].extend( + _import_structure["pipelines"].extend( [ "OnnxStableDiffusionImg2ImgPipeline", "OnnxStableDiffusionInpaintPipeline", + "OnnxStableDiffusionInpaintPipelineLegacy", "OnnxStableDiffusionPipeline", "OnnxStableDiffusionUpscalePipeline", "StableDiffusionOnnxPipeline", @@ -403,195 +598,362 @@ ) try: - if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): + if not (is_torch_available() and is_librosa_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import ( - dummy_torch_and_transformers_and_k_diffusion_objects, - ) + from .utils import dummy_torch_and_librosa_objects # noqa F403 - _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_k_diffusion_objects)) -else: - _import_structure["stable_diffusion_k_diffusion"] = [ - "StableDiffusionKDiffusionPipeline", - "StableDiffusionXLKDiffusionPipeline", + _import_structure["utils.dummy_torch_and_librosa_objects"] = [ + name for name in dir(dummy_torch_and_librosa_objects) if not name.startswith("_") ] +else: + _import_structure["pipelines"].extend(["AudioDiffusionPipeline", "Mel"]) + try: - if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()): + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import ( - dummy_torch_and_transformers_and_sentencepiece_objects, - ) + from .utils import dummy_transformers_and_torch_and_note_seq_objects # noqa F403 - _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_sentencepiece_objects)) -else: - _import_structure["kolors"] = [ - "KolorsPipeline", - "KolorsImg2ImgPipeline", + _import_structure["utils.dummy_transformers_and_torch_and_note_seq_objects"] = [ + name for name in dir(dummy_transformers_and_torch_and_note_seq_objects) if not name.startswith("_") ] -try: - if not (is_torch_available() and is_transformers_available() and is_opencv_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ..utils import ( - dummy_torch_and_transformers_and_opencv_objects, - ) - _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_opencv_objects)) else: - _import_structure["consisid"] = ["ConsisIDPipeline"] + _import_structure["pipelines"].extend(["SpectrogramDiffusionPipeline"]) try: if not is_flax_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import dummy_flax_objects # noqa F403 + from .utils import dummy_flax_objects # noqa F403 + + _import_structure["utils.dummy_flax_objects"] = [ + name for name in dir(dummy_flax_objects) if not name.startswith("_") + ] + - _dummy_objects.update(get_objects_from_module(dummy_flax_objects)) else: - _import_structure["pipeline_flax_utils"] = ["FlaxDiffusionPipeline"] + _import_structure["models.controlnets.controlnet_flax"] = ["FlaxControlNetModel"] + _import_structure["models.modeling_flax_utils"] = ["FlaxModelMixin"] + _import_structure["models.unets.unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"] + _import_structure["models.vae_flax"] = ["FlaxAutoencoderKL"] + _import_structure["pipelines"].extend(["FlaxDiffusionPipeline"]) + _import_structure["schedulers"].extend( + [ + 
"FlaxDDIMScheduler", + "FlaxDDPMScheduler", + "FlaxDPMSolverMultistepScheduler", + "FlaxEulerDiscreteScheduler", + "FlaxKarrasVeScheduler", + "FlaxLMSDiscreteScheduler", + "FlaxPNDMScheduler", + "FlaxSchedulerMixin", + "FlaxScoreSdeVeScheduler", + ] + ) + + try: if not (is_flax_available() and is_transformers_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import dummy_flax_and_transformers_objects # noqa F403 + from .utils import dummy_flax_and_transformers_objects # noqa F403 + + _import_structure["utils.dummy_flax_and_transformers_objects"] = [ + name for name in dir(dummy_flax_and_transformers_objects) if not name.startswith("_") + ] + - _dummy_objects.update(get_objects_from_module(dummy_flax_and_transformers_objects)) else: - _import_structure["controlnet"].extend(["FlaxStableDiffusionControlNetPipeline"]) - _import_structure["stable_diffusion"].extend( + _import_structure["pipelines"].extend( [ + "FlaxStableDiffusionControlNetPipeline", "FlaxStableDiffusionImg2ImgPipeline", "FlaxStableDiffusionInpaintPipeline", "FlaxStableDiffusionPipeline", - ] - ) - _import_structure["stable_diffusion_xl"].extend( - [ "FlaxStableDiffusionXLPipeline", ] ) +try: + if not (is_note_seq_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_note_seq_objects # noqa F403 + + _import_structure["utils.dummy_note_seq_objects"] = [ + name for name in dir(dummy_note_seq_objects) if not name.startswith("_") + ] + + +else: + _import_structure["pipelines"].extend(["MidiProcessor"]) + if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + from .configuration_utils import ConfigMixin + try: - if not is_torch_available(): + if not is_bitsandbytes_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_bitsandbytes_objects import * + else: + from .quantizers.quantization_config import BitsAndBytesConfig + + try: + if not is_gguf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_gguf_objects import * + else: + from .quantizers.quantization_config import GGUFQuantizationConfig + + try: + if not is_torchao_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torchao_objects import * + else: + from .quantizers.quantization_config import TorchAoConfig + + try: + if not is_optimum_quanto_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_pt_objects import * # noqa F403 + from .utils.dummy_optimum_quanto_objects import * + else: + from .quantizers.quantization_config import QuantoConfig + try: + if not is_onnx_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_onnx_objects import * # noqa F403 else: - from .auto_pipeline import ( + from .pipelines import OnnxRuntimeModel + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_pt_objects import * # noqa F403 + else: + from .hooks import ( + FasterCacheConfig, + HookRegistry, + PyramidAttentionBroadcastConfig, + apply_faster_cache, + apply_pyramid_attention_broadcast, + ) + from .models import ( + AllegroTransformer3DModel, + AsymmetricAutoencoderKL, + AuraFlowTransformer2DModel, + AutoencoderDC, + AutoencoderKL, + AutoencoderKLAllegro, + AutoencoderKLCogVideoX, + 
AutoencoderKLCosmos, + AutoencoderKLHunyuanVideo, + AutoencoderKLLTXVideo, + AutoencoderKLMagvit, + AutoencoderKLMochi, + AutoencoderKLTemporalDecoder, + AutoencoderKLWan, + AutoencoderOobleck, + AutoencoderTiny, + AutoModel, + CacheMixin, + CogVideoXTransformer3DModel, + CogView3PlusTransformer2DModel, + CogView4Transformer2DModel, + ConsisIDTransformer3DModel, + ConsistencyDecoderVAE, + ControlNetModel, + ControlNetUnionModel, + ControlNetXSAdapter, + CosmosTransformer3DModel, + DiTTransformer2DModel, + EasyAnimateTransformer3DModel, + FluxControlNetModel, + FluxMultiControlNetModel, + FluxTransformer2DModel, + HiDreamImageTransformer2DModel, + HunyuanDiT2DControlNetModel, + HunyuanDiT2DModel, + HunyuanDiT2DMultiControlNetModel, + HunyuanVideoFramepackTransformer3DModel, + HunyuanVideoTransformer3DModel, + I2VGenXLUNet, + Kandinsky3UNet, + LatteTransformer3DModel, + LTXVideoTransformer3DModel, + Lumina2Transformer2DModel, + LuminaNextDiT2DModel, + MochiTransformer3DModel, + ModelMixin, + MotionAdapter, + MultiAdapter, + MultiControlNetModel, + OmniGenTransformer2DModel, + PixArtTransformer2DModel, + PriorTransformer, + SanaControlNetModel, + SanaTransformer2DModel, + SD3ControlNetModel, + SD3MultiControlNetModel, + SD3Transformer2DModel, + SparseControlNetModel, + StableAudioDiTModel, + T2IAdapter, + T5FilmDecoder, + Transformer2DModel, + TransformerTemporalModel, + UNet1DModel, + UNet2DConditionModel, + UNet2DModel, + UNet3DConditionModel, + UNetControlNetXSModel, + UNetMotionModel, + UNetSpatioTemporalConditionModel, + UVit2DModel, + VQModel, + WanTransformer3DModel, + ) + from .optimization import ( + get_constant_schedule, + get_constant_schedule_with_warmup, + get_cosine_schedule_with_warmup, + get_cosine_with_hard_restarts_schedule_with_warmup, + get_linear_schedule_with_warmup, + get_polynomial_decay_schedule_with_warmup, + get_scheduler, + ) + from .pipelines import ( + AudioPipelineOutput, AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image, - ) - from .consistency_models import ConsistencyModelPipeline - from .dance_diffusion import DanceDiffusionPipeline - from .ddim import DDIMPipeline - from .ddpm import DDPMPipeline - from .deprecated import KarrasVePipeline, LDMPipeline, PNDMPipeline, RePaintPipeline, ScoreSdeVePipeline - from .dit import DiTPipeline - from .latent_diffusion import LDMSuperResolutionPipeline - from .photodoodle import PhotoDoodlePipeline - from .pipeline_utils import ( - AudioPipelineOutput, + BlipDiffusionControlNetPipeline, + BlipDiffusionPipeline, + CLIPImageProjection, + ConsistencyModelPipeline, + DanceDiffusionPipeline, + DDIMPipeline, + DDPMPipeline, DiffusionPipeline, + DiTPipeline, ImagePipelineOutput, + KarrasVePipeline, + LDMPipeline, + LDMSuperResolutionPipeline, + PNDMPipeline, + RePaintPipeline, + ScoreSdeVePipeline, StableDiffusionMixin, ) + from .quantizers import DiffusersQuantizer + from .schedulers import ( + AmusedScheduler, + CMStochasticIterativeScheduler, + CogVideoXDDIMScheduler, + CogVideoXDPMScheduler, + DDIMInverseScheduler, + DDIMParallelScheduler, + DDIMScheduler, + DDPMParallelScheduler, + DDPMScheduler, + DDPMWuerstchenScheduler, + DEISMultistepScheduler, + DPMSolverMultistepInverseScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EDMDPMSolverMultistepScheduler, + EDMEulerScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + FlowMatchEulerDiscreteScheduler, + FlowMatchHeunDiscreteScheduler, + FlowMatchLCMScheduler, + HeunDiscreteScheduler, + 
IPNDMScheduler, + KarrasVeScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LCMScheduler, + PNDMScheduler, + RePaintScheduler, + SASolverScheduler, + SchedulerMixin, + SCMScheduler, + ScoreSdeVeScheduler, + TCDScheduler, + UnCLIPScheduler, + UniPCMultistepScheduler, + VQDiffusionScheduler, + ) + from .training_utils import EMAModel try: - if not (is_torch_available() and is_librosa_available()): + if not (is_torch_available() and is_scipy_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_librosa_objects import * + from .utils.dummy_torch_and_scipy_objects import * # noqa F403 else: - from .deprecated import AudioDiffusionPipeline, Mel + from .schedulers import LMSDiscreteScheduler + + try: + if not (is_torch_available() and is_torchsde_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_torchsde_objects import * # noqa F403 + else: + from .schedulers import CosineDPMSolverMultistepScheduler, DPMSolverSDEScheduler try: if not (is_torch_available() and is_transformers_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_transformers_objects import * + from .utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .allegro import AllegroPipeline - from .amused import AmusedImg2ImgPipeline, AmusedInpaintPipeline, AmusedPipeline - from .animatediff import ( + from .pipelines import ( + AllegroPipeline, + AltDiffusionImg2ImgPipeline, + AltDiffusionPipeline, + AmusedImg2ImgPipeline, + AmusedInpaintPipeline, + AmusedPipeline, AnimateDiffControlNetPipeline, + AnimateDiffPAGPipeline, AnimateDiffPipeline, + PhotoDoodlePipeline, AnimateDiffSDXLPipeline, AnimateDiffSparseControlNetPipeline, AnimateDiffVideoToVideoControlNetPipeline, AnimateDiffVideoToVideoPipeline, - ) - from .audioldm import AudioLDMPipeline - from .audioldm2 import ( AudioLDM2Pipeline, AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel, - ) - from .aura_flow import AuraFlowPipeline - from .blip_diffusion import BlipDiffusionPipeline - from .cogvideo import ( + AudioLDMPipeline, + AuraFlowPipeline, + CLIPImageProjection, CogVideoXFunControlPipeline, CogVideoXImageToVideoPipeline, CogVideoXPipeline, CogVideoXVideoToVideoPipeline, - ) - from .cogview3 import CogView3PlusPipeline - from .cogview4 import CogView4ControlPipeline, CogView4Pipeline - from .controlnet import ( - BlipDiffusionControlNetPipeline, - StableDiffusionControlNetImg2ImgPipeline, - StableDiffusionControlNetInpaintPipeline, - StableDiffusionControlNetPipeline, - StableDiffusionXLControlNetImg2ImgPipeline, - StableDiffusionXLControlNetInpaintPipeline, - StableDiffusionXLControlNetPipeline, - StableDiffusionXLControlNetUnionImg2ImgPipeline, - StableDiffusionXLControlNetUnionInpaintPipeline, - StableDiffusionXLControlNetUnionPipeline, - ) - from .controlnet_hunyuandit import ( - HunyuanDiTControlNetPipeline, - ) - from .controlnet_sd3 import StableDiffusion3ControlNetInpaintingPipeline, StableDiffusion3ControlNetPipeline - from .controlnet_xs import ( - StableDiffusionControlNetXSPipeline, - StableDiffusionXLControlNetXSPipeline, - ) - from .cosmos import CosmosTextToWorldPipeline, CosmosVideoToWorldPipeline - from .deepfloyd_if import ( - IFImg2ImgPipeline, - IFImg2ImgSuperResolutionPipeline, - IFInpaintingPipeline, - IFInpaintingSuperResolutionPipeline, - IFPipeline, - IFSuperResolutionPipeline, - ) - from 
.deprecated import ( - AltDiffusionImg2ImgPipeline, - AltDiffusionPipeline, + CogView3PlusPipeline, + CogView4ControlPipeline, + CogView4Pipeline, + ConsisIDPipeline, + CosmosTextToWorldPipeline, + CosmosVideoToWorldPipeline, CycleDiffusionPipeline, - StableDiffusionInpaintPipelineLegacy, - StableDiffusionModelEditingPipeline, - StableDiffusionParadigmsPipeline, - StableDiffusionPix2PixZeroPipeline, - VersatileDiffusionDualGuidedPipeline, - VersatileDiffusionImageVariationPipeline, - VersatileDiffusionPipeline, - VersatileDiffusionTextToImagePipeline, - VQDiffusionPipeline, - ) - from .easyanimate import ( EasyAnimateControlPipeline, EasyAnimateInpaintPipeline, EasyAnimatePipeline, - ) - from .flux import ( FluxControlImg2ImgPipeline, FluxControlInpaintPipeline, FluxControlNetImg2ImgPipeline, @@ -603,18 +965,24 @@ FluxInpaintPipeline, FluxPipeline, FluxPriorReduxPipeline, - ReduxImageEncoder, - ) - from .hidream_image import HiDreamImagePipeline - from .hunyuan_video import ( + HiDreamImagePipeline, + HunyuanDiTControlNetPipeline, + HunyuanDiTPAGPipeline, + HunyuanDiTPipeline, HunyuanSkyreelsImageToVideoPipeline, HunyuanVideoFramepackPipeline, HunyuanVideoImageToVideoPipeline, HunyuanVideoPipeline, - ) - from .hunyuandit import HunyuanDiTPipeline - from .i2vgen_xl import I2VGenXLPipeline - from .kandinsky import ( + I2VGenXLPipeline, + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, + ImageTextPipelineOutput, + Kandinsky3Img2ImgPipeline, + Kandinsky3Pipeline, KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, KandinskyImg2ImgPipeline, @@ -622,8 +990,6 @@ KandinskyInpaintPipeline, KandinskyPipeline, KandinskyPriorPipeline, - ) - from .kandinsky2_2 import ( KandinskyV22CombinedPipeline, KandinskyV22ControlnetImg2ImgPipeline, KandinskyV22ControlnetPipeline, @@ -634,210 +1000,225 @@ KandinskyV22Pipeline, KandinskyV22PriorEmb2EmbPipeline, KandinskyV22PriorPipeline, - ) - from .kandinsky3 import ( - Kandinsky3Img2ImgPipeline, - Kandinsky3Pipeline, - ) - from .latent_consistency_models import ( LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline, - ) - from .latent_diffusion import LDMTextToImagePipeline - from .latte import LattePipeline - from .ledits_pp import ( - LEditsPPDiffusionPipelineOutput, - LEditsPPInversionPipelineOutput, + LattePipeline, + LDMTextToImagePipeline, LEditsPPPipelineStableDiffusion, LEditsPPPipelineStableDiffusionXL, - ) - from .ltx import LTXConditionPipeline, LTXImageToVideoPipeline, LTXLatentUpsamplePipeline, LTXPipeline - from .lumina import LuminaPipeline, LuminaText2ImgPipeline - from .lumina2 import Lumina2Pipeline, Lumina2Text2ImgPipeline - from .marigold import ( + LTXConditionPipeline, + LTXImageToVideoPipeline, + LTXLatentUpsamplePipeline, + LTXPipeline, + Lumina2Pipeline, + Lumina2Text2ImgPipeline, + LuminaPipeline, + LuminaText2ImgPipeline, MarigoldDepthPipeline, MarigoldIntrinsicsPipeline, MarigoldNormalsPipeline, - ) - from .mochi import MochiPipeline - from .musicldm import MusicLDMPipeline - from .omnigen import OmniGenPipeline - from .pag import ( - AnimateDiffPAGPipeline, - HunyuanDiTPAGPipeline, - KolorsPAGPipeline, + MochiPipeline, + MusicLDMPipeline, + OmniGenPipeline, + PaintByExamplePipeline, + PIAPipeline, + PixArtAlphaPipeline, PixArtSigmaPAGPipeline, + PixArtSigmaPipeline, + ReduxImageEncoder, + SanaControlNetPipeline, SanaPAGPipeline, + SanaPipeline, + SanaSprintImg2ImgPipeline, + SanaSprintPipeline, + 
SemanticStableDiffusionPipeline, + ShapEImg2ImgPipeline, + ShapEPipeline, + StableAudioPipeline, + StableAudioProjectionModel, + StableCascadeCombinedPipeline, + StableCascadeDecoderPipeline, + StableCascadePriorPipeline, + StableDiffusion3ControlNetInpaintingPipeline, + StableDiffusion3ControlNetPipeline, + StableDiffusion3Img2ImgPipeline, + StableDiffusion3InpaintPipeline, StableDiffusion3PAGImg2ImgPipeline, StableDiffusion3PAGPipeline, + StableDiffusion3Pipeline, + StableDiffusionAdapterPipeline, + StableDiffusionAttendAndExcitePipeline, + StableDiffusionControlNetImg2ImgPipeline, + StableDiffusionControlNetInpaintPipeline, StableDiffusionControlNetPAGInpaintPipeline, StableDiffusionControlNetPAGPipeline, - StableDiffusionPAGImg2ImgPipeline, - StableDiffusionPAGInpaintPipeline, - StableDiffusionPAGPipeline, - StableDiffusionXLControlNetPAGImg2ImgPipeline, - StableDiffusionXLControlNetPAGPipeline, - StableDiffusionXLPAGImg2ImgPipeline, - StableDiffusionXLPAGInpaintPipeline, - StableDiffusionXLPAGPipeline, - ) - from .paint_by_example import PaintByExamplePipeline - from .pia import PIAPipeline - from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline - from .sana import SanaControlNetPipeline, SanaPipeline, SanaSprintImg2ImgPipeline, SanaSprintPipeline - from .semantic_stable_diffusion import SemanticStableDiffusionPipeline - from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline - from .stable_audio import StableAudioPipeline, StableAudioProjectionModel - from .stable_cascade import ( - StableCascadeCombinedPipeline, - StableCascadeDecoderPipeline, - StableCascadePriorPipeline, - ) - from .stable_diffusion import ( - CLIPImageProjection, + StableDiffusionControlNetPipeline, + StableDiffusionControlNetXSPipeline, StableDiffusionDepth2ImgPipeline, + StableDiffusionDiffEditPipeline, + StableDiffusionGLIGENPipeline, + StableDiffusionGLIGENTextImagePipeline, StableDiffusionImageVariationPipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, + StableDiffusionInpaintPipelineLegacy, StableDiffusionInstructPix2PixPipeline, StableDiffusionLatentUpscalePipeline, + StableDiffusionLDM3DPipeline, + StableDiffusionModelEditingPipeline, + StableDiffusionPAGImg2ImgPipeline, + StableDiffusionPAGInpaintPipeline, + StableDiffusionPAGPipeline, + StableDiffusionPanoramaPipeline, + StableDiffusionParadigmsPipeline, StableDiffusionPipeline, + StableDiffusionPipelineSafe, + StableDiffusionPix2PixZeroPipeline, + StableDiffusionSAGPipeline, StableDiffusionUpscalePipeline, - StableUnCLIPImg2ImgPipeline, - StableUnCLIPPipeline, - ) - from .stable_diffusion_3 import ( - StableDiffusion3Img2ImgPipeline, - StableDiffusion3InpaintPipeline, - StableDiffusion3Pipeline, - ) - from .stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline - from .stable_diffusion_diffedit import StableDiffusionDiffEditPipeline - from .stable_diffusion_gligen import StableDiffusionGLIGENPipeline, StableDiffusionGLIGENTextImagePipeline - from .stable_diffusion_ldm3d import StableDiffusionLDM3DPipeline - from .stable_diffusion_panorama import StableDiffusionPanoramaPipeline - from .stable_diffusion_safe import StableDiffusionPipelineSafe - from .stable_diffusion_sag import StableDiffusionSAGPipeline - from .stable_diffusion_xl import ( + StableDiffusionXLAdapterPipeline, + StableDiffusionXLControlNetImg2ImgPipeline, + StableDiffusionXLControlNetInpaintPipeline, + StableDiffusionXLControlNetPAGImg2ImgPipeline, + StableDiffusionXLControlNetPAGPipeline, + StableDiffusionXLControlNetPipeline, 
+ StableDiffusionXLControlNetUnionImg2ImgPipeline, + StableDiffusionXLControlNetUnionInpaintPipeline, + StableDiffusionXLControlNetUnionPipeline, + StableDiffusionXLControlNetXSPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline, StableDiffusionXLInstructPix2PixPipeline, + StableDiffusionXLPAGImg2ImgPipeline, + StableDiffusionXLPAGInpaintPipeline, + StableDiffusionXLPAGPipeline, StableDiffusionXLPipeline, - ) - from .stable_video_diffusion import StableVideoDiffusionPipeline - from .t2i_adapter import ( - StableDiffusionAdapterPipeline, - StableDiffusionXLAdapterPipeline, - ) - from .text_to_video_synthesis import ( + StableUnCLIPImg2ImgPipeline, + StableUnCLIPPipeline, + StableVideoDiffusionPipeline, TextToVideoSDPipeline, TextToVideoZeroPipeline, TextToVideoZeroSDXLPipeline, - VideoToVideoSDPipeline, - ) - from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline - from .unidiffuser import ( - ImageTextPipelineOutput, + UnCLIPImageVariationPipeline, + UnCLIPPipeline, UniDiffuserModel, UniDiffuserPipeline, UniDiffuserTextDecoder, - ) - from .visualcloze import VisualClozeGenerationPipeline, VisualClozePipeline - from .wan import WanImageToVideoPipeline, WanPipeline, WanVideoToVideoPipeline - from .wuerstchen import ( + VersatileDiffusionDualGuidedPipeline, + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + VideoToVideoSDPipeline, + VisualClozeGenerationPipeline, + VisualClozePipeline, + VQDiffusionPipeline, + WanImageToVideoPipeline, + WanPipeline, + WanVideoToVideoPipeline, WuerstchenCombinedPipeline, WuerstchenDecoderPipeline, WuerstchenPriorPipeline, ) - try: - if not is_onnx_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - from ..utils.dummy_onnx_objects import * # noqa F403 - - else: - from .onnx_utils import OnnxRuntimeModel - - try: - if not (is_torch_available() and is_transformers_available() and is_onnx_available()): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_transformers_and_onnx_objects import * - else: - from .stable_diffusion import ( - OnnxStableDiffusionImg2ImgPipeline, - OnnxStableDiffusionInpaintPipeline, - OnnxStableDiffusionPipeline, - OnnxStableDiffusionUpscalePipeline, - StableDiffusionOnnxPipeline, - ) - - try: - if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_transformers_and_k_diffusion_objects import * - else: - from .stable_diffusion_k_diffusion import ( - StableDiffusionKDiffusionPipeline, - StableDiffusionXLKDiffusionPipeline, - ) - - try: - if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_transformers_and_sentencepiece_objects import * - else: - from .kolors import ( - KolorsImg2ImgPipeline, - KolorsPipeline, - ) - - try: - if not (is_torch_available() and is_transformers_available() and is_opencv_available()): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_transformers_and_opencv_objects import * - else: - from .consisid import ConsisIDPipeline - - try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - 
from ..utils.dummy_flax_objects import * # noqa F403 - else: - from .pipeline_flax_utils import FlaxDiffusionPipeline - - try: - if not (is_flax_available() and is_transformers_available()): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - from ..utils.dummy_flax_and_transformers_objects import * - else: - from .controlnet import FlaxStableDiffusionControlNetPipeline - from .stable_diffusion import ( - FlaxStableDiffusionImg2ImgPipeline, - FlaxStableDiffusionInpaintPipeline, - FlaxStableDiffusionPipeline, - ) - from .stable_diffusion_xl import ( - FlaxStableDiffusionXLPipeline, - ) - - try: - if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - from ..utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403 - - else: - from .deprecated import ( - MidiProcessor, - SpectrogramDiffusionPipeline, - ) + try: + if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_transformers_and_k_diffusion_objects import * # noqa F403 + else: + from .pipelines import StableDiffusionKDiffusionPipeline, StableDiffusionXLKDiffusionPipeline + + try: + if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_transformers_and_sentencepiece_objects import * # noqa F403 + else: + from .pipelines import KolorsImg2ImgPipeline, KolorsPAGPipeline, KolorsPipeline + + try: + if not (is_torch_available() and is_transformers_available() and is_opencv_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_transformers_and_opencv_objects import * # noqa F403 + else: + from .pipelines import ConsisIDPipeline + + try: + if not (is_torch_available() and is_transformers_available() and is_onnx_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_transformers_and_onnx_objects import * # noqa F403 + else: + from .pipelines import ( + OnnxStableDiffusionImg2ImgPipeline, + OnnxStableDiffusionInpaintPipeline, + OnnxStableDiffusionInpaintPipelineLegacy, + OnnxStableDiffusionPipeline, + OnnxStableDiffusionUpscalePipeline, + StableDiffusionOnnxPipeline, + ) + + try: + if not (is_torch_available() and is_librosa_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_librosa_objects import * # noqa F403 + else: + from .pipelines import AudioDiffusionPipeline, Mel + + try: + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403 + else: + from .pipelines import SpectrogramDiffusionPipeline + + try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_flax_objects import * # noqa F403 + else: + from .models.controlnets.controlnet_flax import FlaxControlNetModel + from .models.modeling_flax_utils import FlaxModelMixin + from .models.unets.unet_2d_condition_flax import 
FlaxUNet2DConditionModel
+        from .models.vae_flax import FlaxAutoencoderKL
+        from .pipelines import FlaxDiffusionPipeline
+        from .schedulers import (
+            FlaxDDIMScheduler,
+            FlaxDDPMScheduler,
+            FlaxDPMSolverMultistepScheduler,
+            FlaxEulerDiscreteScheduler,
+            FlaxKarrasVeScheduler,
+            FlaxLMSDiscreteScheduler,
+            FlaxPNDMScheduler,
+            FlaxSchedulerMixin,
+            FlaxScoreSdeVeScheduler,
+        )
+
+    try:
+        if not (is_flax_available() and is_transformers_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_flax_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipelines import (
+            FlaxStableDiffusionControlNetPipeline,
+            FlaxStableDiffusionImg2ImgPipeline,
+            FlaxStableDiffusionInpaintPipeline,
+            FlaxStableDiffusionPipeline,
+            FlaxStableDiffusionXLPipeline,
+        )
+
+    try:
+        if not (is_note_seq_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_note_seq_objects import *  # noqa F403
+    else:
+        from .pipelines import MidiProcessor

 else:
     import sys

@@ -847,6 +1228,5 @@
         globals()["__file__"],
         _import_structure,
         module_spec=__spec__,
+        extra_objects={"__version__": __version__},
     )
-    for name, value in _dummy_objects.items():
-        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/photodoodle/pipeline_photodoodle.py b/src/diffusers/pipelines/photodoodle/pipeline_photodoodle.py
index d7b46d9cb7bb..47ce0f34fd69 100644
--- a/src/diffusers/pipelines/photodoodle/pipeline_photodoodle.py
+++ b/src/diffusers/pipelines/photodoodle/pipeline_photodoodle.py
@@ -33,6 +33,36 @@
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import PhotoDoodlePipeline
+        >>> from diffusers.utils import load_image
+        >>> import torch
+
+        >>> pipeline = PhotoDoodlePipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
+        >>> pipeline = pipeline.to("cuda")
+        >>> # Load initial model weights
+        >>> pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle", weight_name="pretrain.safetensors")
+        >>> pipeline.fuse_lora()
+        >>> pipeline.unload_lora_weights()
+
+        >>> pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle", weight_name="sksmagiceffects.safetensors")
+
+        >>> # Generate image with text prompt and condition image
+        >>> prompt = "add a halo and wings for the cat by sksmagiceffects"
+        >>> condition_image = load_image("./sample.png")  # PIL Image
+        >>> height = 768
+        >>> width = 512
+        >>> # Prepare the input image
+        >>> condition_image = condition_image.resize((width, height)).convert("RGB")
+        >>> output = pipeline(prompt=prompt, condition_image=condition_image, height=height, width=width, num_inference_steps=28, guidance_scale=3.5)
+        >>> # Save the generated image
+        >>> output.images[0].save("photodoodle_results.png")
+        ```
+"""
+
+
 def calculate_shift(
     image_seq_len,
     base_seq_len: int = 256,
@@ -47,8 +77,8 @@ def calculate_shift(

 def prepare_latent_image_ids_2(height, width, device, dtype):
     latent_image_ids = torch.zeros(height//2, width//2, 3, device=device, dtype=dtype)
     latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height//2, device=device)[:, None]  # y coordinate
     latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width//2, device=device)[None, :]  # x coordinate
     return latent_image_ids

 def position_encoding_clone(batch_size, original_height, original_width, device, dtype):
@@ -61,6 +91,7 @@ def position_encoding_clone(batch_size, original_height, original_width, device,
     latent_image_ids = torch.concat([latent_image_ids, cond_latent_image_ids], dim=-2)
     return latent_image_ids

+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
 def retrieve_latents(
     encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
 ):
@@ -73,6 +104,8 @@ def retrieve_latents(
     else:
         raise AttributeError("Could not access latents of provided encoder_output")

+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 def retrieve_timesteps(
     scheduler,
     num_inference_steps: Optional[int] = None,
@@ -81,7 +114,29 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """Retrieve timesteps from scheduler and handle custom timesteps/sigmas."""
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
     if timesteps is not None and sigmas is not None:
         raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
     if timesteps is not None:
@@ -109,12 +164,12 @@ def retrieve_timesteps(
     timesteps = scheduler.timesteps
     return timesteps, num_inference_steps

+
 class PhotoDoodlePipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
     r"""
-    PhotoDoodle pipeline for image generation.
+    The PhotoDoodle pipeline for image-conditioned text-to-image generation, built on Flux.

-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
-    implemented for all pipelines.
+    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/

     Args:
         transformer ([`FluxTransformer2DModel`]):
@@ -124,13 +179,17 @@ class PhotoDoodlePipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         vae ([`AutoencoderKL`]):
             Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
         text_encoder ([`CLIPTextModel`]):
-            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
text_encoder_2 ([`T5EncoderModel`]): - Second frozen text encoder ([t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl)). + [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically + the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant. tokenizer (`CLIPTokenizer`): - A tokenizer for the text encoder. + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer). tokenizer_2 (`T5TokenizerFast`): - A tokenizer for the second text encoder. + Second Tokenizer of class + [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast). """ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae" @@ -266,7 +325,6 @@ def encode_prompt( lora_scale: Optional[float] = None, ): r""" - Encodes the prompt into text encoder hidden states. Args: prompt (`str` or `List[str]`, *optional*): @@ -334,6 +392,101 @@ def encode_prompt( return prompt_embeds, pooled_prompt_embeds, text_ids + # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image + def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): + if isinstance(generator, list): + image_latents = [ + retrieve_latents(self.vae.encode(image[i: i + 1]), generator=generator[i]) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor + + return image_latents + + def check_inputs( + self, + prompt, + prompt_2, + height, + width, + prompt_embeds=None, + pooled_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + max_sequence_length=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
+ ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + + if max_sequence_length is not None and max_sequence_length > 512: + raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}") + + @staticmethod + def _prepare_latent_image_ids(batch_size, height, width, device, dtype): + latent_image_ids = torch.zeros(height // 2, width // 2, 3) + latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None] + latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :] + latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape + latent_image_ids = latent_image_ids.reshape( + latent_image_id_height * latent_image_id_width, latent_image_id_channels + ) + return latent_image_ids.to(device=device, dtype=dtype) + + @staticmethod + def _pack_latents(latents, batch_size, num_channels_latents, height, width): + latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2) + latents = latents.permute(0, 2, 4, 1, 3, 5) + latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4) + return latents + + @staticmethod + def _unpack_latents(latents, height, width, vae_scale_factor): + batch_size, num_patches, channels = latents.shape + + height = height // vae_scale_factor + width = width // vae_scale_factor + + latents = latents.view(batch_size, height, width, channels // 4, 2, 2) + latents = latents.permute(0, 3, 1, 4, 2, 5) + + latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2) + + return latents + def enable_vae_slicing(self): r""" Enable sliced VAE decoding. 
When this option is enabled, the VAE will split the input tensor in slices to
@@ -403,6 +556,7 @@ def prepare_latents(
         else:
             image_latents = torch.cat([image_latents], dim=0)

+        # import pdb; pdb.set_trace()
         latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
         cond_latents = self._pack_latents(image_latents, batch_size, num_channels_latents, height, width)
@@ -610,4 +764,4 @@ def __call__(

         if not return_dict:
             return (image,)

-        return FluxPipelineOutput(images=image)
\ No newline at end of file
+        return FluxPipelineOutput(images=image)
\ No newline at end of file

From 442ad13d1853eabb3220707a2eafd648a30ac922 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 28 May 2025 13:04:15 +0530
Subject: [PATCH 4/4] PhotoDoodle by Ameer

---
 src/diffusers/pipelines/photodoodle/README.md | 66 -------------------
 1 file changed, 66 deletions(-)
 delete mode 100644 src/diffusers/pipelines/photodoodle/README.md

diff --git a/src/diffusers/pipelines/photodoodle/README.md b/src/diffusers/pipelines/photodoodle/README.md
deleted file mode 100644
index 67fcee3ae97f..000000000000
--- a/src/diffusers/pipelines/photodoodle/README.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# PhotoDoodle Pipeline
-
-The PhotoDoodle pipeline is designed for image generation with conditional image input. It uses a combination of text and image conditioning to generate high-quality images.
-
-## Model Architecture
-
-The pipeline uses the following components:
-
-1. **Transformer**: A FluxTransformer2DModel for denoising image latents
-2. **VAE**: An AutoencoderKL for encoding/decoding images
-3. **Text Encoders**:
-   - CLIP text encoder for initial text embedding
-   - T5 encoder for additional text understanding
-4. 
**Scheduler**: FlowMatchEulerDiscreteScheduler for the diffusion process - -## Usage - -```python -from diffusers import PhotoDoodlePipeline -import torch - -pipeline = PhotoDoodlePipeline.from_pretrained("black-forest-labs/FLUX.1-dev") -pipeline = pipeline.to("cuda") -# Load initial model weights -pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle", weight_name="pretrain.safetensors") -pipeline.fuse_lora() -pipeline.unload_lora_weights() - -pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle",weight_name="sksmagiceffects.safetensors") - -# Generate image with text prompt and condition image -prompt = "add a halo and wings for the cat by sksmagiceffects" -condition_image = load_image("path/to/condition.jpg") # PIL Image -output = pipeline( - prompt=prompt, - condition_image=condition_image, - num_inference_steps=28, - guidance_scale=3.5 -) - -# Save the generated image -output.images[0].save("generated_image.png") -``` - -## Parameters - -- `prompt`: Text prompt for image generation -- `prompt_2`: Optional secondary prompt for T5 encoder -- `condition_image`: Input image for conditioning -- `height`: Output image height (default: 512) -- `width`: Output image width (default: 512) -- `num_inference_steps`: Number of denoising steps (default: 28) -- `guidance_scale`: Classifier-free guidance scale (default: 3.5) -- `num_images_per_prompt`: Number of images to generate per prompt -- `generator`: Random number generator for reproducibility -- `output_type`: Output format ("pil", "latent", or "pt") - -## Features - -- Dual text encoder architecture (CLIP + T5) -- Image conditioning support -- Position encoding for better spatial understanding -- Support for LoRA fine-tuning -- VAE slicing and tiling for memory efficiency -- Progress bar during generation -- Callback support for step-by-step monitoring \ No newline at end of file
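For orientation, the `_pack_latents` / `_unpack_latents` helpers added in `pipeline_photodoodle.py` above turn a spatial latent of shape `(batch, channels, height, width)` into the sequence of 2x2-patch tokens `(batch, (height/2)*(width/2), 4*channels)` that the Flux transformer consumes, and back. The snippet below is a minimal, self-contained sketch of that round trip using standalone copies of those two helpers; the tensor sizes and the `vae_scale_factor = 16` value are illustrative assumptions for the demo, not values taken from the patch.

```python
import torch


def pack_latents(latents, batch_size, num_channels_latents, height, width):
    # (B, C, H, W) -> (B, (H/2)*(W/2), 4*C): each 2x2 spatial patch becomes one token.
    latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
    latents = latents.permute(0, 2, 4, 1, 3, 5)
    return latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)


def unpack_latents(latents, height, width, vae_scale_factor):
    # Inverse of pack_latents; height and width are the target image size in pixels.
    batch_size, num_patches, channels = latents.shape
    height = height // vae_scale_factor  # patch-grid height
    width = width // vae_scale_factor  # patch-grid width
    latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
    latents = latents.permute(0, 3, 1, 4, 2, 5)
    return latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)


# Illustrative sizes: a 512x512 image with an 8x VAE downsample gives a 64x64 latent,
# so the token grid is 32x32 and the effective pixel-to-grid scale is 16.
batch, channels, lat_h, lat_w = 1, 16, 64, 64
x = torch.randn(batch, channels, lat_h, lat_w)

tokens = pack_latents(x, batch, channels, lat_h, lat_w)  # (1, 1024, 64)
restored = unpack_latents(tokens, 512, 512, vae_scale_factor=16)  # (1, 16, 64, 64)

assert tokens.shape == (1, 32 * 32, 16 * 4)
assert torch.equal(restored, x)  # the packing is an exact, lossless rearrangement
```

In the pipeline diff, both the noise latents and the VAE-encoded condition image are packed with `_pack_latents`, and `position_encoding_clone` concatenates positional ids for the two token blocks, which appears to be how the condition image is presented to the transformer as an extra block of tokens.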