From f5846809cb1d79a707bd49fd0d49376dd9b2e244 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 28 May 2025 12:23:31 +0530 Subject: [PATCH 1/4] photodoodel added --- src/diffusers/pipelines/__init__.py | 4 + src/diffusers/pipelines/photodoodle/README.md | 66 ++ .../pipelines/photodoodle/__init__.py | 39 ++ .../photodoodle/pipeline_photodoodle.py | 613 ++++++++++++++++++ .../dummy_torch_and_transformers_objects.py | 15 + 5 files changed, 737 insertions(+) create mode 100644 src/diffusers/pipelines/photodoodle/README.md create mode 100644 src/diffusers/pipelines/photodoodle/__init__.py create mode 100644 src/diffusers/pipelines/photodoodle/pipeline_photodoodle.py diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index b00530a669ea..4e20335b5f55 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -30,6 +30,7 @@ "ledits_pp": [], "marigold": [], "pag": [], + "photodoodle": [], "stable_diffusion": [], "stable_diffusion_xl": [], } @@ -53,6 +54,7 @@ _import_structure["ddpm"] = ["DDPMPipeline"] _import_structure["dit"] = ["DiTPipeline"] _import_structure["latent_diffusion"].extend(["LDMSuperResolutionPipeline"]) + _import_structure["photodoodle"].extend(["PhotoDoodlePipeline"]) _import_structure["pipeline_utils"] = [ "AudioPipelineOutput", "DiffusionPipeline", @@ -286,6 +288,7 @@ _import_structure["mochi"] = ["MochiPipeline"] _import_structure["musicldm"] = ["MusicLDMPipeline"] _import_structure["omnigen"] = ["OmniGenPipeline"] + _import_structure["photodoodle"].extend(["PhotoDoodlePipeline"]) _import_structure["visualcloze"] = ["VisualClozePipeline", "VisualClozeGenerationPipeline"] _import_structure["paint_by_example"] = ["PaintByExamplePipeline"] _import_structure["pia"] = ["PIAPipeline"] @@ -492,6 +495,7 @@ from .deprecated import KarrasVePipeline, LDMPipeline, PNDMPipeline, RePaintPipeline, ScoreSdeVePipeline from .dit import DiTPipeline from .latent_diffusion import LDMSuperResolutionPipeline + from .photodoodle import PhotoDoodlePipeline from .pipeline_utils import ( AudioPipelineOutput, DiffusionPipeline, diff --git a/src/diffusers/pipelines/photodoodle/README.md b/src/diffusers/pipelines/photodoodle/README.md new file mode 100644 index 000000000000..67fcee3ae97f --- /dev/null +++ b/src/diffusers/pipelines/photodoodle/README.md @@ -0,0 +1,66 @@ +# PhotoDoodle Pipeline + +The PhotoDoodle pipeline is designed for image generation with conditional image input. It uses a combination of text and image conditioning to generate high-quality images. + +## Model Architecture + +The pipeline uses the following components: + +1. **Transformer**: A FluxTransformer2DModel for denoising image latents +2. **VAE**: An AutoencoderKL for encoding/decoding images +3. **Text Encoders**: + - CLIP text encoder for initial text embedding + - T5 encoder for additional text understanding +4. 
**Scheduler**: FlowMatchEulerDiscreteScheduler for the diffusion process + +## Usage + +```python +from diffusers import PhotoDoodlePipeline +import torch + +pipeline = PhotoDoodlePipeline.from_pretrained("black-forest-labs/FLUX.1-dev") +pipeline = pipeline.to("cuda") +# Load initial model weights +pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle", weight_name="pretrain.safetensors") +pipeline.fuse_lora() +pipeline.unload_lora_weights() + +pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle",weight_name="sksmagiceffects.safetensors") + +# Generate image with text prompt and condition image +prompt = "add a halo and wings for the cat by sksmagiceffects" +condition_image = load_image("path/to/condition.jpg") # PIL Image +output = pipeline( + prompt=prompt, + condition_image=condition_image, + num_inference_steps=28, + guidance_scale=3.5 +) + +# Save the generated image +output.images[0].save("generated_image.png") +``` + +## Parameters + +- `prompt`: Text prompt for image generation +- `prompt_2`: Optional secondary prompt for T5 encoder +- `condition_image`: Input image for conditioning +- `height`: Output image height (default: 512) +- `width`: Output image width (default: 512) +- `num_inference_steps`: Number of denoising steps (default: 28) +- `guidance_scale`: Classifier-free guidance scale (default: 3.5) +- `num_images_per_prompt`: Number of images to generate per prompt +- `generator`: Random number generator for reproducibility +- `output_type`: Output format ("pil", "latent", or "pt") + +## Features + +- Dual text encoder architecture (CLIP + T5) +- Image conditioning support +- Position encoding for better spatial understanding +- Support for LoRA fine-tuning +- VAE slicing and tiling for memory efficiency +- Progress bar during generation +- Callback support for step-by-step monitoring \ No newline at end of file diff --git a/src/diffusers/pipelines/photodoodle/__init__.py b/src/diffusers/pipelines/photodoodle/__init__.py new file mode 100644 index 000000000000..d5dac3eb75d9 --- /dev/null +++ b/src/diffusers/pipelines/photodoodle/__init__.py @@ -0,0 +1,39 @@ +""" +PhotoDoodle pipeline for image generation. +""" + +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, + is_transformers_available, +) + +_dummy_objects = {} +_import_structure = { + "pipeline_photodoodle": ["PhotoDoodlePipeline"], +} + +try: + if not (is_torch_available() and is_transformers_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 +else: + from .pipeline_photodoodle import PhotoDoodlePipeline + +if TYPE_CHECKING: + from .pipeline_photodoodle import PhotoDoodlePipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) \ No newline at end of file diff --git a/src/diffusers/pipelines/photodoodle/pipeline_photodoodle.py b/src/diffusers/pipelines/photodoodle/pipeline_photodoodle.py new file mode 100644 index 000000000000..d7b46d9cb7bb --- /dev/null +++ b/src/diffusers/pipelines/photodoodle/pipeline_photodoodle.py @@ -0,0 +1,613 @@ +""" +PhotoDoodle pipeline for image generation. 
+""" + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast + +from ...image_processor import VaeImageProcessor +from ...loaders import FluxLoraLoaderMixin, FromSingleFileMixin +from ...models.autoencoders import AutoencoderKL +from ...models.transformers import FluxTransformer2DModel +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import ( + USE_PEFT_BACKEND, + is_torch_xla_available, + logging, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from ..flux.pipeline_output import FluxPipelineOutput + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +def calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.16, +): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + +def prepare_latent_image_ids_2(height, width, device, dtype): + latent_image_ids = torch.zeros(height//2, width//2, 3, device=device, dtype=dtype) + latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height//2, device=device)[:, None] # y coordinate + latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width//2, device=device)[None, :] # x coordinate + return latent_image_ids + +def position_encoding_clone(batch_size, original_height, original_width, device, dtype): + latent_image_ids = prepare_latent_image_ids_2(original_height, original_width, device, dtype) + latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape + latent_image_ids = latent_image_ids.reshape( + latent_image_id_height * latent_image_id_width, latent_image_id_channels + ) + cond_latent_image_ids = latent_image_ids + latent_image_ids = torch.concat([latent_image_ids, cond_latent_image_ids], dim=-2) + return latent_image_ids + +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + """Retrieve timesteps from scheduler and handle custom timesteps/sigmas.""" + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + +class PhotoDoodlePipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin): + r""" + PhotoDoodle pipeline for image generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines. + + Args: + transformer ([`FluxTransformer2DModel`]): + Conditional Transformer (MMDiT) architecture to denoise the encoded image latents. + scheduler ([`FlowMatchEulerDiscreteScheduler`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + text_encoder_2 ([`T5EncoderModel`]): + Second frozen text encoder ([t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl)). + tokenizer (`CLIPTokenizer`): + A tokenizer for the text encoder. + tokenizer_2 (`T5TokenizerFast`): + A tokenizer for the second text encoder. 
+ """ + + model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae" + _optional_components = [] + _callback_tensor_inputs = ["latents", "prompt_embeds"] + + def __init__( + self, + scheduler: FlowMatchEulerDiscreteScheduler, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + text_encoder_2: T5EncoderModel, + tokenizer_2: T5TokenizerFast, + transformer: FluxTransformer2DModel, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + transformer=transformer, + scheduler=scheduler, + ) + self.vae_scale_factor = ( + 2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16 + ) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.tokenizer_max_length = ( + self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77 + ) + self.default_sample_size = 64 + + def _get_t5_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + num_images_per_prompt: int = 1, + max_sequence_length: int = 512, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + device = device or self._execution_device + dtype = dtype or self.text_encoder.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + text_inputs = self.tokenizer_2( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + return_length=False, + return_overflowing_tokens=False, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1: -1]) + logger.warning( + "The following part of your input was truncated because `max_sequence_length` is set to " + f" {max_sequence_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0] + + dtype = self.text_encoder_2.dtype + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + _, seq_len, _ = prompt_embeds.shape + + # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + return prompt_embeds + + def _get_clip_prompt_embeds( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int = 1, + device: Optional[torch.device] = None, + ): + device = device or self._execution_device + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer_max_length, + truncation=True, + return_overflowing_tokens=False, + return_length=False, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1: -1]) 
+ logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer_max_length} tokens: {removed_text}" + ) + prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False) + + # Use pooled output of CLIPTextModel + prompt_embeds = prompt_embeds.pooler_output + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1) + + return prompt_embeds + + def encode_prompt( + self, + prompt: Union[str, List[str]], + prompt_2: Union[str, List[str]], + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + max_sequence_length: int = 512, + lora_scale: Optional[float] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in all text-encoders + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
+ """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder, lora_scale) + if self.text_encoder_2 is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # We only use the pooled prompt output from the CLIPTextModel + pooled_prompt_embeds = self._get_clip_prompt_embeds( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + ) + prompt_embeds = self._get_t5_prompt_embeds( + prompt=prompt_2, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + ) + + if self.text_encoder is not None: + if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + + dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype + text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype) + + return prompt_embeds, pooled_prompt_embeds, text_ids + + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.vae.enable_tiling() + + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. 
+ """ + self.vae.disable_tiling() + + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + condition_image=None, + ): + height = 2 * (int(height) // self.vae_scale_factor) + width = 2 * (int(width) // self.vae_scale_factor) + + shape = (batch_size, num_channels_latents, height, width) # 1 16 106 80 + + if latents is not None: + latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype) + return latents.to(device=device, dtype=dtype), latent_image_ids + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + if condition_image is not None: + condition_image = condition_image.to(device=device, dtype=dtype) + image_latents = self._encode_vae_image(image=condition_image, generator=generator) + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // image_latents.shape[0] + image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0) + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." + ) + else: + image_latents = torch.cat([image_latents], dim=0) + + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width) + cond_latents = self._pack_latents(image_latents, batch_size, num_channels_latents, height, width) + latents = torch.concat([latents, cond_latents], dim=-2) + + latent_image_ids = position_encoding_clone(batch_size, height, width, device, dtype) # add position + + mask1 = torch.ones(shape, device=device, dtype=dtype) + mask2 = torch.zeros(shape, device=device, dtype=dtype) + mask1 = self._pack_latents(mask1, batch_size, num_channels_latents, height, width) # 1 4096 64 + mask2 = self._pack_latents(mask2, batch_size, num_channels_latents, height, width) # 1 4096 64 + mask = torch.concat([mask1, mask2], dim=-2) + return latents, latent_image_ids, mask, cond_latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def joint_attention_kwargs(self): + return self._joint_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 28, + timesteps: List[int] = None, + guidance_scale: float = 3.5, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = 
["latents"], + max_sequence_length: int = 512, + condition_image=None, + ): + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + height, + width, + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + max_sequence_length=max_sequence_length, + ) + + self._guidance_scale = guidance_scale + self._joint_attention_kwargs = joint_attention_kwargs + self._interrupt = False + + condition_image = self.image_processor.preprocess(condition_image, height=height, width=width) + condition_image = condition_image.to(dtype=torch.float32) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + lora_scale = ( + self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None + ) + ( + prompt_embeds, + pooled_prompt_embeds, + text_ids, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + device=device, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + lora_scale=lora_scale, + ) + + # 4. Prepare latent variables + num_channels_latents = self.transformer.config.in_channels // 4 # 16 + latents, latent_image_ids, mask, cond_latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + condition_image + ) + clean_latents = latents.clone() + + # 5. Prepare timesteps + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + image_seq_len = latents.shape[1] + mu = calculate_shift( + image_seq_len, + self.scheduler.config.base_image_seq_len, + self.scheduler.config.max_image_seq_len, + self.scheduler.config.base_shift, + self.scheduler.config.max_shift, + ) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + timesteps, + sigmas, + mu=mu, + ) + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + self._num_timesteps = len(timesteps) + + # handle guidance + if self.transformer.config.guidance_embeds: + guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32) + guidance = guidance.expand(latents.shape[0]) + else: + guidance = None + + # 6. 
Denoising loop + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latents.shape[0]).to(latents.dtype) + noise_pred = self.transformer( + hidden_states=latents, # 1 4096 64 + timestep=timestep / 1000, + guidance=guidance, + pooled_projections=pooled_prompt_embeds, + encoder_hidden_states=prompt_embeds, + txt_ids=text_ids, + img_ids=latent_image_ids, + joint_attention_kwargs=self.joint_attention_kwargs, + return_dict=False, + )[0] + + # compute the previous noisy sample x_t -> x_t-1 + latents_dtype = latents.dtype + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + latents = latents * mask + clean_latents * (1 - mask) + + if latents.dtype != latents_dtype: + if torch.backends.mps.is_available(): + # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 + latents = latents.to(latents_dtype) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if XLA_AVAILABLE: + xm.mark_step() + + if output_type == "latent": + image = latents + + else: + latents = self._unpack_latents(latents[:,:latents.shape[-2]-cond_latents.shape[-2],:], height, width, self.vae_scale_factor) + latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor + image = self.vae.decode(latents.to(dtype=self.vae.dtype), return_dict=False)[0] + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return FluxPipelineOutput(images=image) \ No newline at end of file diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 159d81add355..e69534ac6aa5 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -2955,3 +2955,18 @@ def from_config(cls, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) + + +class PhotoDoodlePipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) From 1331f56da5c9729b8faa52fe238fc3b616b7908f Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 28 May 2025 12:25:13 +0530 Subject: [PATCH 2/4] photodoodle added --- src/diffusers/utils/dummy_torch_and_transformers_objects.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index e69534ac6aa5..922a17ab1621 100644 --- 
a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -2955,6 +2955,7 @@ def from_config(cls, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) + class PhotoDoodlePipeline(metaclass=DummyObject): From 5c9814ce093c3b66cfd00d0b128aac6b2119ea19 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 28 May 2025 13:00:11 +0530 Subject: [PATCH 3/4] PhotoDoodle by Ameer --- src/diffusers/__init__.py | 3 + src/diffusers/pipelines/__init__.py | 1574 ++++++++++------- .../photodoodle/pipeline_photodoodle.py | 178 +- 3 files changed, 1146 insertions(+), 609 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 8c4ae36c5654..68dd3e69870e 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -38,6 +38,7 @@ "loaders": ["FromOriginalModelMixin"], "models": [], "pipelines": [], + "pipelines.photodoodle": ["PhotoDoodlePipeline"], "quantizers.quantization_config": [], "schedulers": [], "utils": [ @@ -340,6 +341,7 @@ "AnimateDiffControlNetPipeline", "AnimateDiffPAGPipeline", "AnimateDiffPipeline", + "PhotoDoodlePipeline", "AnimateDiffSDXLPipeline", "AnimateDiffSparseControlNetPipeline", "AnimateDiffVideoToVideoControlNetPipeline", @@ -927,6 +929,7 @@ AnimateDiffControlNetPipeline, AnimateDiffPAGPipeline, AnimateDiffPipeline, + PhotoDoodlePipeline, AnimateDiffSDXLPipeline, AnimateDiffSparseControlNetPipeline, AnimateDiffVideoToVideoControlNetPipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 4e20335b5f55..68dd3e69870e 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -1,401 +1,596 @@ +__version__ = "0.34.0.dev0" + from typing import TYPE_CHECKING -from ..utils import ( +from .utils import ( DIFFUSERS_SLOW_IMPORT, OptionalDependencyNotAvailable, _LazyModule, - get_objects_from_module, + is_accelerate_available, + is_bitsandbytes_available, is_flax_available, + is_gguf_available, is_k_diffusion_available, is_librosa_available, is_note_seq_available, is_onnx_available, is_opencv_available, + is_optimum_quanto_available, + is_scipy_available, is_sentencepiece_available, is_torch_available, - is_torch_npu_available, + is_torchao_available, + is_torchsde_available, is_transformers_available, ) -# These modules contain pipelines from multiple libraries/frameworks -_dummy_objects = {} +# Lazy Import based on +# https://github.com/huggingface/transformers/blob/main/src/transformers/__init__.py + +# When adding a new object to this init, please add it to `_import_structure`. The `_import_structure` is a dictionary submodule to list of object names, +# and is used to defer the actual importing for when the objects are requested. +# This way `import diffusers` provides the names in the namespace without actually importing anything (and especially none of the backends). 
+ _import_structure = { - "controlnet": [], - "controlnet_hunyuandit": [], - "controlnet_sd3": [], - "controlnet_xs": [], - "deprecated": [], - "latent_diffusion": [], - "ledits_pp": [], - "marigold": [], - "pag": [], - "photodoodle": [], - "stable_diffusion": [], - "stable_diffusion_xl": [], + "configuration_utils": ["ConfigMixin"], + "hooks": [], + "loaders": ["FromOriginalModelMixin"], + "models": [], + "pipelines": [], + "pipelines.photodoodle": ["PhotoDoodlePipeline"], + "quantizers.quantization_config": [], + "schedulers": [], + "utils": [ + "OptionalDependencyNotAvailable", + "is_flax_available", + "is_inflect_available", + "is_invisible_watermark_available", + "is_k_diffusion_available", + "is_k_diffusion_version", + "is_librosa_available", + "is_note_seq_available", + "is_onnx_available", + "is_scipy_available", + "is_torch_available", + "is_torchsde_available", + "is_transformers_available", + "is_transformers_version", + "is_unidecode_available", + "logging", + ], } try: - if not is_torch_available(): + if not is_torch_available() and not is_accelerate_available() and not is_bitsandbytes_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import dummy_pt_objects # noqa F403 + from .utils import dummy_bitsandbytes_objects - _dummy_objects.update(get_objects_from_module(dummy_pt_objects)) + _import_structure["utils.dummy_bitsandbytes_objects"] = [ + name for name in dir(dummy_bitsandbytes_objects) if not name.startswith("_") + ] else: - _import_structure["auto_pipeline"] = [ - "AutoPipelineForImage2Image", - "AutoPipelineForInpainting", - "AutoPipelineForText2Image", + _import_structure["quantizers.quantization_config"].append("BitsAndBytesConfig") + +try: + if not is_torch_available() and not is_accelerate_available() and not is_gguf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_gguf_objects + + _import_structure["utils.dummy_gguf_objects"] = [ + name for name in dir(dummy_gguf_objects) if not name.startswith("_") ] - _import_structure["consistency_models"] = ["ConsistencyModelPipeline"] - _import_structure["dance_diffusion"] = ["DanceDiffusionPipeline"] - _import_structure["ddim"] = ["DDIMPipeline"] - _import_structure["ddpm"] = ["DDPMPipeline"] - _import_structure["dit"] = ["DiTPipeline"] - _import_structure["latent_diffusion"].extend(["LDMSuperResolutionPipeline"]) - _import_structure["photodoodle"].extend(["PhotoDoodlePipeline"]) - _import_structure["pipeline_utils"] = [ - "AudioPipelineOutput", - "DiffusionPipeline", - "StableDiffusionMixin", - "ImagePipelineOutput", +else: + _import_structure["quantizers.quantization_config"].append("GGUFQuantizationConfig") + +try: + if not is_torch_available() and not is_accelerate_available() and not is_torchao_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torchao_objects + + _import_structure["utils.dummy_torchao_objects"] = [ + name for name in dir(dummy_torchao_objects) if not name.startswith("_") ] - _import_structure["deprecated"].extend( - [ - "PNDMPipeline", - "LDMPipeline", - "RePaintPipeline", - "ScoreSdeVePipeline", - "KarrasVePipeline", - ] - ) +else: + _import_structure["quantizers.quantization_config"].append("TorchAoConfig") + try: - if not (is_torch_available() and is_librosa_available()): + if not is_torch_available() and not is_accelerate_available() and not is_optimum_quanto_available(): raise 
OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import dummy_torch_and_librosa_objects # noqa F403 + from .utils import dummy_optimum_quanto_objects - _dummy_objects.update(get_objects_from_module(dummy_torch_and_librosa_objects)) + _import_structure["utils.dummy_optimum_quanto_objects"] = [ + name for name in dir(dummy_optimum_quanto_objects) if not name.startswith("_") + ] else: - _import_structure["deprecated"].extend(["AudioDiffusionPipeline", "Mel"]) + _import_structure["quantizers.quantization_config"].append("QuantoConfig") try: - if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + if not is_onnx_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import dummy_transformers_and_torch_and_note_seq_objects # noqa F403 + from .utils import dummy_onnx_objects # noqa F403 + + _import_structure["utils.dummy_onnx_objects"] = [ + name for name in dir(dummy_onnx_objects) if not name.startswith("_") + ] - _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects)) else: - _import_structure["deprecated"].extend( - [ - "MidiProcessor", - "SpectrogramDiffusionPipeline", - ] - ) + _import_structure["pipelines"].extend(["OnnxRuntimeModel"]) try: - if not (is_torch_available() and is_transformers_available()): + if not is_torch_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import dummy_torch_and_transformers_objects # noqa F403 + from .utils import dummy_pt_objects # noqa F403 + + _import_structure["utils.dummy_pt_objects"] = [name for name in dir(dummy_pt_objects) if not name.startswith("_")] - _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["deprecated"].extend( - [ - "VQDiffusionPipeline", - "AltDiffusionPipeline", - "AltDiffusionImg2ImgPipeline", - "CycleDiffusionPipeline", - "StableDiffusionInpaintPipelineLegacy", - "StableDiffusionPix2PixZeroPipeline", - "StableDiffusionParadigmsPipeline", - "StableDiffusionModelEditingPipeline", - "VersatileDiffusionDualGuidedPipeline", - "VersatileDiffusionImageVariationPipeline", - "VersatileDiffusionPipeline", - "VersatileDiffusionTextToImagePipeline", - ] - ) - _import_structure["allegro"] = ["AllegroPipeline"] - _import_structure["amused"] = ["AmusedImg2ImgPipeline", "AmusedInpaintPipeline", "AmusedPipeline"] - _import_structure["animatediff"] = [ - "AnimateDiffPipeline", - "AnimateDiffControlNetPipeline", - "AnimateDiffSDXLPipeline", - "AnimateDiffSparseControlNetPipeline", - "AnimateDiffVideoToVideoPipeline", - "AnimateDiffVideoToVideoControlNetPipeline", - ] - _import_structure["flux"] = [ - "FluxControlPipeline", - "FluxControlInpaintPipeline", - "FluxControlImg2ImgPipeline", - "FluxControlNetPipeline", - "FluxControlNetImg2ImgPipeline", - "FluxControlNetInpaintPipeline", - "FluxImg2ImgPipeline", - "FluxInpaintPipeline", - "FluxPipeline", - "FluxFillPipeline", - "FluxPriorReduxPipeline", - "ReduxImageEncoder", - ] - _import_structure["audioldm"] = ["AudioLDMPipeline"] - _import_structure["audioldm2"] = [ - "AudioLDM2Pipeline", - "AudioLDM2ProjectionModel", - "AudioLDM2UNet2DConditionModel", - ] - _import_structure["blip_diffusion"] = ["BlipDiffusionPipeline"] - _import_structure["cogvideo"] = [ - "CogVideoXPipeline", - "CogVideoXImageToVideoPipeline", - "CogVideoXVideoToVideoPipeline", - "CogVideoXFunControlPipeline", - ] - _import_structure["cogview3"] = 
["CogView3PlusPipeline"] - _import_structure["cogview4"] = ["CogView4Pipeline", "CogView4ControlPipeline"] - _import_structure["consisid"] = ["ConsisIDPipeline"] - _import_structure["cosmos"] = ["CosmosTextToWorldPipeline", "CosmosVideoToWorldPipeline"] - _import_structure["controlnet"].extend( - [ - "BlipDiffusionControlNetPipeline", - "StableDiffusionControlNetImg2ImgPipeline", - "StableDiffusionControlNetInpaintPipeline", - "StableDiffusionControlNetPipeline", - "StableDiffusionXLControlNetImg2ImgPipeline", - "StableDiffusionXLControlNetInpaintPipeline", - "StableDiffusionXLControlNetPipeline", - "StableDiffusionXLControlNetUnionPipeline", - "StableDiffusionXLControlNetUnionInpaintPipeline", - "StableDiffusionXLControlNetUnionImg2ImgPipeline", - ] - ) - _import_structure["pag"].extend( + _import_structure["hooks"].extend( [ - "StableDiffusionControlNetPAGInpaintPipeline", - "AnimateDiffPAGPipeline", - "KolorsPAGPipeline", - "HunyuanDiTPAGPipeline", - "StableDiffusion3PAGPipeline", - "StableDiffusion3PAGImg2ImgPipeline", - "StableDiffusionPAGPipeline", - "StableDiffusionPAGImg2ImgPipeline", - "StableDiffusionPAGInpaintPipeline", - "StableDiffusionControlNetPAGPipeline", - "StableDiffusionXLPAGPipeline", - "StableDiffusionXLPAGInpaintPipeline", - "StableDiffusionXLControlNetPAGImg2ImgPipeline", - "StableDiffusionXLControlNetPAGPipeline", - "StableDiffusionXLPAGImg2ImgPipeline", - "PixArtSigmaPAGPipeline", - "SanaPAGPipeline", + "FasterCacheConfig", + "HookRegistry", + "PyramidAttentionBroadcastConfig", + "apply_faster_cache", + "apply_pyramid_attention_broadcast", ] ) - _import_structure["controlnet_xs"].extend( + _import_structure["models"].extend( [ - "StableDiffusionControlNetXSPipeline", - "StableDiffusionXLControlNetXSPipeline", + "AllegroTransformer3DModel", + "AsymmetricAutoencoderKL", + "AuraFlowTransformer2DModel", + "AutoencoderDC", + "AutoencoderKL", + "AutoencoderKLAllegro", + "AutoencoderKLCogVideoX", + "AutoencoderKLCosmos", + "AutoencoderKLHunyuanVideo", + "AutoencoderKLLTXVideo", + "AutoencoderKLMagvit", + "AutoencoderKLMochi", + "AutoencoderKLTemporalDecoder", + "AutoencoderKLWan", + "AutoencoderOobleck", + "AutoencoderTiny", + "AutoModel", + "CacheMixin", + "CogVideoXTransformer3DModel", + "CogView3PlusTransformer2DModel", + "CogView4Transformer2DModel", + "ConsisIDTransformer3DModel", + "ConsistencyDecoderVAE", + "ControlNetModel", + "ControlNetUnionModel", + "ControlNetXSAdapter", + "CosmosTransformer3DModel", + "DiTTransformer2DModel", + "EasyAnimateTransformer3DModel", + "FluxControlNetModel", + "FluxMultiControlNetModel", + "FluxTransformer2DModel", + "HiDreamImageTransformer2DModel", + "HunyuanDiT2DControlNetModel", + "HunyuanDiT2DModel", + "HunyuanDiT2DMultiControlNetModel", + "HunyuanVideoFramepackTransformer3DModel", + "HunyuanVideoTransformer3DModel", + "I2VGenXLUNet", + "Kandinsky3UNet", + "LatteTransformer3DModel", + "LTXVideoTransformer3DModel", + "Lumina2Transformer2DModel", + "LuminaNextDiT2DModel", + "MochiTransformer3DModel", + "ModelMixin", + "MotionAdapter", + "MultiAdapter", + "MultiControlNetModel", + "OmniGenTransformer2DModel", + "PixArtTransformer2DModel", + "PriorTransformer", + "SanaControlNetModel", + "SanaTransformer2DModel", + "SD3ControlNetModel", + "SD3MultiControlNetModel", + "SD3Transformer2DModel", + "SparseControlNetModel", + "StableAudioDiTModel", + "StableCascadeUNet", + "T2IAdapter", + "T5FilmDecoder", + "Transformer2DModel", + "TransformerTemporalModel", + "UNet1DModel", + "UNet2DConditionModel", + "UNet2DModel", + 
"UNet3DConditionModel", + "UNetControlNetXSModel", + "UNetMotionModel", + "UNetSpatioTemporalConditionModel", + "UVit2DModel", + "VQModel", + "WanTransformer3DModel", ] ) - _import_structure["controlnet_hunyuandit"].extend( + _import_structure["optimization"] = [ + "get_constant_schedule", + "get_constant_schedule_with_warmup", + "get_cosine_schedule_with_warmup", + "get_cosine_with_hard_restarts_schedule_with_warmup", + "get_linear_schedule_with_warmup", + "get_polynomial_decay_schedule_with_warmup", + "get_scheduler", + ] + _import_structure["pipelines"].extend( [ - "HunyuanDiTControlNetPipeline", + "AudioPipelineOutput", + "AutoPipelineForImage2Image", + "AutoPipelineForInpainting", + "AutoPipelineForText2Image", + "ConsistencyModelPipeline", + "DanceDiffusionPipeline", + "DDIMPipeline", + "DDPMPipeline", + "DiffusionPipeline", + "DiTPipeline", + "ImagePipelineOutput", + "KarrasVePipeline", + "LDMPipeline", + "LDMSuperResolutionPipeline", + "PNDMPipeline", + "RePaintPipeline", + "ScoreSdeVePipeline", + "StableDiffusionMixin", ] ) - _import_structure["controlnet_sd3"].extend( + _import_structure["quantizers"] = ["DiffusersQuantizer"] + _import_structure["schedulers"].extend( [ - "StableDiffusion3ControlNetPipeline", - "StableDiffusion3ControlNetInpaintingPipeline", + "AmusedScheduler", + "CMStochasticIterativeScheduler", + "CogVideoXDDIMScheduler", + "CogVideoXDPMScheduler", + "DDIMInverseScheduler", + "DDIMParallelScheduler", + "DDIMScheduler", + "DDPMParallelScheduler", + "DDPMScheduler", + "DDPMWuerstchenScheduler", + "DEISMultistepScheduler", + "DPMSolverMultistepInverseScheduler", + "DPMSolverMultistepScheduler", + "DPMSolverSinglestepScheduler", + "EDMDPMSolverMultistepScheduler", + "EDMEulerScheduler", + "EulerAncestralDiscreteScheduler", + "EulerDiscreteScheduler", + "FlowMatchEulerDiscreteScheduler", + "FlowMatchHeunDiscreteScheduler", + "FlowMatchLCMScheduler", + "HeunDiscreteScheduler", + "IPNDMScheduler", + "KarrasVeScheduler", + "KDPM2AncestralDiscreteScheduler", + "KDPM2DiscreteScheduler", + "LCMScheduler", + "PNDMScheduler", + "RePaintScheduler", + "SASolverScheduler", + "SchedulerMixin", + "SCMScheduler", + "ScoreSdeVeScheduler", + "TCDScheduler", + "UnCLIPScheduler", + "UniPCMultistepScheduler", + "VQDiffusionScheduler", ] ) - _import_structure["deepfloyd_if"] = [ - "IFImg2ImgPipeline", - "IFImg2ImgSuperResolutionPipeline", - "IFInpaintingPipeline", - "IFInpaintingSuperResolutionPipeline", - "IFPipeline", - "IFSuperResolutionPipeline", - ] - _import_structure["easyanimate"] = [ - "EasyAnimatePipeline", - "EasyAnimateInpaintPipeline", - "EasyAnimateControlPipeline", - ] - _import_structure["hidream_image"] = ["HiDreamImagePipeline"] - _import_structure["hunyuandit"] = ["HunyuanDiTPipeline"] - _import_structure["hunyuan_video"] = [ - "HunyuanVideoPipeline", - "HunyuanSkyreelsImageToVideoPipeline", - "HunyuanVideoImageToVideoPipeline", - "HunyuanVideoFramepackPipeline", - ] - _import_structure["kandinsky"] = [ - "KandinskyCombinedPipeline", - "KandinskyImg2ImgCombinedPipeline", - "KandinskyImg2ImgPipeline", - "KandinskyInpaintCombinedPipeline", - "KandinskyInpaintPipeline", - "KandinskyPipeline", - "KandinskyPriorPipeline", - ] - _import_structure["kandinsky2_2"] = [ - "KandinskyV22CombinedPipeline", - "KandinskyV22ControlnetImg2ImgPipeline", - "KandinskyV22ControlnetPipeline", - "KandinskyV22Img2ImgCombinedPipeline", - "KandinskyV22Img2ImgPipeline", - "KandinskyV22InpaintCombinedPipeline", - "KandinskyV22InpaintPipeline", - "KandinskyV22Pipeline", - 
"KandinskyV22PriorEmb2EmbPipeline", - "KandinskyV22PriorPipeline", + _import_structure["training_utils"] = ["EMAModel"] + +try: + if not (is_torch_available() and is_scipy_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torch_and_scipy_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_scipy_objects"] = [ + name for name in dir(dummy_torch_and_scipy_objects) if not name.startswith("_") ] - _import_structure["kandinsky3"] = [ - "Kandinsky3Img2ImgPipeline", - "Kandinsky3Pipeline", + +else: + _import_structure["schedulers"].extend(["LMSDiscreteScheduler"]) + +try: + if not (is_torch_available() and is_torchsde_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torch_and_torchsde_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_torchsde_objects"] = [ + name for name in dir(dummy_torch_and_torchsde_objects) if not name.startswith("_") ] - _import_structure["latent_consistency_models"] = [ - "LatentConsistencyModelImg2ImgPipeline", - "LatentConsistencyModelPipeline", + +else: + _import_structure["schedulers"].extend(["CosineDPMSolverMultistepScheduler", "DPMSolverSDEScheduler"]) + +try: + if not (is_torch_available() and is_transformers_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torch_and_transformers_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_transformers_objects"] = [ + name for name in dir(dummy_torch_and_transformers_objects) if not name.startswith("_") ] - _import_structure["latent_diffusion"].extend(["LDMTextToImagePipeline"]) - _import_structure["ledits_pp"].extend( + +else: + _import_structure["pipelines"].extend( [ + "AllegroPipeline", + "AltDiffusionImg2ImgPipeline", + "AltDiffusionPipeline", + "AmusedImg2ImgPipeline", + "AmusedInpaintPipeline", + "AmusedPipeline", + "AnimateDiffControlNetPipeline", + "AnimateDiffPAGPipeline", + "AnimateDiffPipeline", + "PhotoDoodlePipeline", + "AnimateDiffSDXLPipeline", + "AnimateDiffSparseControlNetPipeline", + "AnimateDiffVideoToVideoControlNetPipeline", + "AnimateDiffVideoToVideoPipeline", + "AudioLDM2Pipeline", + "AudioLDM2ProjectionModel", + "AudioLDM2UNet2DConditionModel", + "AudioLDMPipeline", + "AuraFlowPipeline", + "BlipDiffusionControlNetPipeline", + "BlipDiffusionPipeline", + "CLIPImageProjection", + "CogVideoXFunControlPipeline", + "CogVideoXImageToVideoPipeline", + "CogVideoXPipeline", + "CogVideoXVideoToVideoPipeline", + "CogView3PlusPipeline", + "CogView4ControlPipeline", + "CogView4Pipeline", + "ConsisIDPipeline", + "CosmosTextToWorldPipeline", + "CosmosVideoToWorldPipeline", + "CycleDiffusionPipeline", + "EasyAnimateControlPipeline", + "EasyAnimateInpaintPipeline", + "EasyAnimatePipeline", + "FluxControlImg2ImgPipeline", + "FluxControlInpaintPipeline", + "FluxControlNetImg2ImgPipeline", + "FluxControlNetInpaintPipeline", + "FluxControlNetPipeline", + "FluxControlPipeline", + "FluxFillPipeline", + "FluxImg2ImgPipeline", + "FluxInpaintPipeline", + "FluxPipeline", + "FluxPriorReduxPipeline", + "HiDreamImagePipeline", + "HunyuanDiTControlNetPipeline", + "HunyuanDiTPAGPipeline", + "HunyuanDiTPipeline", + "HunyuanSkyreelsImageToVideoPipeline", + "HunyuanVideoFramepackPipeline", + "HunyuanVideoImageToVideoPipeline", + "HunyuanVideoPipeline", + "I2VGenXLPipeline", + "IFImg2ImgPipeline", + "IFImg2ImgSuperResolutionPipeline", + "IFInpaintingPipeline", + 
"IFInpaintingSuperResolutionPipeline", + "IFPipeline", + "IFSuperResolutionPipeline", + "ImageTextPipelineOutput", + "Kandinsky3Img2ImgPipeline", + "Kandinsky3Pipeline", + "KandinskyCombinedPipeline", + "KandinskyImg2ImgCombinedPipeline", + "KandinskyImg2ImgPipeline", + "KandinskyInpaintCombinedPipeline", + "KandinskyInpaintPipeline", + "KandinskyPipeline", + "KandinskyPriorPipeline", + "KandinskyV22CombinedPipeline", + "KandinskyV22ControlnetImg2ImgPipeline", + "KandinskyV22ControlnetPipeline", + "KandinskyV22Img2ImgCombinedPipeline", + "KandinskyV22Img2ImgPipeline", + "KandinskyV22InpaintCombinedPipeline", + "KandinskyV22InpaintPipeline", + "KandinskyV22Pipeline", + "KandinskyV22PriorEmb2EmbPipeline", + "KandinskyV22PriorPipeline", + "LatentConsistencyModelImg2ImgPipeline", + "LatentConsistencyModelPipeline", + "LattePipeline", + "LDMTextToImagePipeline", "LEditsPPPipelineStableDiffusion", "LEditsPPPipelineStableDiffusionXL", - ] - ) - _import_structure["latte"] = ["LattePipeline"] - _import_structure["ltx"] = [ - "LTXPipeline", - "LTXImageToVideoPipeline", - "LTXConditionPipeline", - "LTXLatentUpsamplePipeline", - ] - _import_structure["lumina"] = ["LuminaPipeline", "LuminaText2ImgPipeline"] - _import_structure["lumina2"] = ["Lumina2Pipeline", "Lumina2Text2ImgPipeline"] - _import_structure["marigold"].extend( - [ + "LTXConditionPipeline", + "LTXImageToVideoPipeline", + "LTXLatentUpsamplePipeline", + "LTXPipeline", + "Lumina2Pipeline", + "Lumina2Text2ImgPipeline", + "LuminaPipeline", + "LuminaText2ImgPipeline", "MarigoldDepthPipeline", "MarigoldIntrinsicsPipeline", "MarigoldNormalsPipeline", - ] - ) - _import_structure["mochi"] = ["MochiPipeline"] - _import_structure["musicldm"] = ["MusicLDMPipeline"] - _import_structure["omnigen"] = ["OmniGenPipeline"] - _import_structure["photodoodle"].extend(["PhotoDoodlePipeline"]) - _import_structure["visualcloze"] = ["VisualClozePipeline", "VisualClozeGenerationPipeline"] - _import_structure["paint_by_example"] = ["PaintByExamplePipeline"] - _import_structure["pia"] = ["PIAPipeline"] - _import_structure["pixart_alpha"] = ["PixArtAlphaPipeline", "PixArtSigmaPipeline"] - _import_structure["sana"] = [ - "SanaPipeline", - "SanaSprintPipeline", - "SanaControlNetPipeline", - "SanaSprintImg2ImgPipeline", - ] - _import_structure["semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"] - _import_structure["shap_e"] = ["ShapEImg2ImgPipeline", "ShapEPipeline"] - _import_structure["stable_audio"] = [ - "StableAudioProjectionModel", - "StableAudioPipeline", - ] - _import_structure["stable_cascade"] = [ - "StableCascadeCombinedPipeline", - "StableCascadeDecoderPipeline", - "StableCascadePriorPipeline", - ] - _import_structure["stable_diffusion"].extend( - [ - "CLIPImageProjection", + "MochiPipeline", + "MusicLDMPipeline", + "OmniGenPipeline", + "PaintByExamplePipeline", + "PIAPipeline", + "PixArtAlphaPipeline", + "PixArtSigmaPAGPipeline", + "PixArtSigmaPipeline", + "ReduxImageEncoder", + "SanaControlNetPipeline", + "SanaPAGPipeline", + "SanaPipeline", + "SanaSprintImg2ImgPipeline", + "SanaSprintPipeline", + "SemanticStableDiffusionPipeline", + "ShapEImg2ImgPipeline", + "ShapEPipeline", + "StableAudioPipeline", + "StableAudioProjectionModel", + "StableCascadeCombinedPipeline", + "StableCascadeDecoderPipeline", + "StableCascadePriorPipeline", + "StableDiffusion3ControlNetInpaintingPipeline", + "StableDiffusion3ControlNetPipeline", + "StableDiffusion3Img2ImgPipeline", + "StableDiffusion3InpaintPipeline", + "StableDiffusion3PAGImg2ImgPipeline", + 
"StableDiffusion3PAGImg2ImgPipeline", + "StableDiffusion3PAGPipeline", + "StableDiffusion3Pipeline", + "StableDiffusionAdapterPipeline", + "StableDiffusionAttendAndExcitePipeline", + "StableDiffusionControlNetImg2ImgPipeline", + "StableDiffusionControlNetInpaintPipeline", + "StableDiffusionControlNetPAGInpaintPipeline", + "StableDiffusionControlNetPAGPipeline", + "StableDiffusionControlNetPipeline", + "StableDiffusionControlNetXSPipeline", "StableDiffusionDepth2ImgPipeline", + "StableDiffusionDiffEditPipeline", + "StableDiffusionGLIGENPipeline", + "StableDiffusionGLIGENTextImagePipeline", "StableDiffusionImageVariationPipeline", "StableDiffusionImg2ImgPipeline", "StableDiffusionInpaintPipeline", + "StableDiffusionInpaintPipelineLegacy", "StableDiffusionInstructPix2PixPipeline", "StableDiffusionLatentUpscalePipeline", + "StableDiffusionLDM3DPipeline", + "StableDiffusionModelEditingPipeline", + "StableDiffusionPAGImg2ImgPipeline", + "StableDiffusionPAGInpaintPipeline", + "StableDiffusionPAGPipeline", + "StableDiffusionPanoramaPipeline", + "StableDiffusionParadigmsPipeline", "StableDiffusionPipeline", + "StableDiffusionPipelineSafe", + "StableDiffusionPix2PixZeroPipeline", + "StableDiffusionSAGPipeline", "StableDiffusionUpscalePipeline", - "StableUnCLIPImg2ImgPipeline", - "StableUnCLIPPipeline", - "StableDiffusionLDM3DPipeline", - ] - ) - _import_structure["aura_flow"] = ["AuraFlowPipeline"] - _import_structure["stable_diffusion_3"] = [ - "StableDiffusion3Pipeline", - "StableDiffusion3Img2ImgPipeline", - "StableDiffusion3InpaintPipeline", - ] - _import_structure["stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"] - _import_structure["stable_diffusion_safe"] = ["StableDiffusionPipelineSafe"] - _import_structure["stable_diffusion_sag"] = ["StableDiffusionSAGPipeline"] - _import_structure["stable_diffusion_gligen"] = [ - "StableDiffusionGLIGENPipeline", - "StableDiffusionGLIGENTextImagePipeline", - ] - _import_structure["stable_video_diffusion"] = ["StableVideoDiffusionPipeline"] - _import_structure["stable_diffusion_xl"].extend( - [ + "StableDiffusionXLAdapterPipeline", + "StableDiffusionXLControlNetImg2ImgPipeline", + "StableDiffusionXLControlNetInpaintPipeline", + "StableDiffusionXLControlNetPAGImg2ImgPipeline", + "StableDiffusionXLControlNetPAGPipeline", + "StableDiffusionXLControlNetPipeline", + "StableDiffusionXLControlNetUnionImg2ImgPipeline", + "StableDiffusionXLControlNetUnionInpaintPipeline", + "StableDiffusionXLControlNetUnionPipeline", + "StableDiffusionXLControlNetXSPipeline", "StableDiffusionXLImg2ImgPipeline", "StableDiffusionXLInpaintPipeline", "StableDiffusionXLInstructPix2PixPipeline", + "StableDiffusionXLPAGImg2ImgPipeline", + "StableDiffusionXLPAGInpaintPipeline", + "StableDiffusionXLPAGPipeline", "StableDiffusionXLPipeline", + "StableUnCLIPImg2ImgPipeline", + "StableUnCLIPPipeline", + "StableVideoDiffusionPipeline", + "TextToVideoSDPipeline", + "TextToVideoZeroPipeline", + "TextToVideoZeroSDXLPipeline", + "UnCLIPImageVariationPipeline", + "UnCLIPPipeline", + "UniDiffuserModel", + "UniDiffuserPipeline", + "UniDiffuserTextDecoder", + "VersatileDiffusionDualGuidedPipeline", + "VersatileDiffusionImageVariationPipeline", + "VersatileDiffusionPipeline", + "VersatileDiffusionTextToImagePipeline", + "VideoToVideoSDPipeline", + "VisualClozeGenerationPipeline", + "VisualClozePipeline", + "VQDiffusionPipeline", + "WanImageToVideoPipeline", + "WanPipeline", + "WanVideoToVideoPipeline", + "WuerstchenCombinedPipeline", + "WuerstchenDecoderPipeline", + 
"WuerstchenPriorPipeline", ] ) - _import_structure["stable_diffusion_diffedit"] = ["StableDiffusionDiffEditPipeline"] - _import_structure["stable_diffusion_ldm3d"] = ["StableDiffusionLDM3DPipeline"] - _import_structure["stable_diffusion_panorama"] = ["StableDiffusionPanoramaPipeline"] - _import_structure["t2i_adapter"] = [ - "StableDiffusionAdapterPipeline", - "StableDiffusionXLAdapterPipeline", - ] - _import_structure["text_to_video_synthesis"] = [ - "TextToVideoSDPipeline", - "TextToVideoZeroPipeline", - "TextToVideoZeroSDXLPipeline", - "VideoToVideoSDPipeline", - ] - _import_structure["i2vgen_xl"] = ["I2VGenXLPipeline"] - _import_structure["unclip"] = ["UnCLIPImageVariationPipeline", "UnCLIPPipeline"] - _import_structure["unidiffuser"] = [ - "ImageTextPipelineOutput", - "UniDiffuserModel", - "UniDiffuserPipeline", - "UniDiffuserTextDecoder", + +try: + if not (is_torch_available() and is_transformers_available() and is_opencv_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torch_and_transformers_and_opencv_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_transformers_and_opencv_objects"] = [ + name for name in dir(dummy_torch_and_transformers_and_opencv_objects) if not name.startswith("_") ] - _import_structure["wuerstchen"] = [ - "WuerstchenCombinedPipeline", - "WuerstchenDecoderPipeline", - "WuerstchenPriorPipeline", + +else: + _import_structure["pipelines"].extend(["ConsisIDPipeline"]) + +try: + if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torch_and_transformers_and_k_diffusion_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_transformers_and_k_diffusion_objects"] = [ + name for name in dir(dummy_torch_and_transformers_and_k_diffusion_objects) if not name.startswith("_") ] - _import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline", "WanVideoToVideoPipeline"] + +else: + _import_structure["pipelines"].extend(["StableDiffusionKDiffusionPipeline", "StableDiffusionXLKDiffusionPipeline"]) + try: - if not is_onnx_available(): + if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import dummy_onnx_objects # noqa F403 + from .utils import dummy_torch_and_transformers_and_sentencepiece_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_transformers_and_sentencepiece_objects"] = [ + name for name in dir(dummy_torch_and_transformers_and_sentencepiece_objects) if not name.startswith("_") + ] - _dummy_objects.update(get_objects_from_module(dummy_onnx_objects)) else: - _import_structure["onnx_utils"] = ["OnnxRuntimeModel"] + _import_structure["pipelines"].extend(["KolorsImg2ImgPipeline", "KolorsPAGPipeline", "KolorsPipeline"]) + try: if not (is_torch_available() and is_transformers_available() and is_onnx_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import dummy_torch_and_transformers_and_onnx_objects # noqa F403 + from .utils import dummy_torch_and_transformers_and_onnx_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_transformers_and_onnx_objects"] = [ + name for name in dir(dummy_torch_and_transformers_and_onnx_objects) if not name.startswith("_") + ] - 
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_onnx_objects)) else: - _import_structure["stable_diffusion"].extend( + _import_structure["pipelines"].extend( [ "OnnxStableDiffusionImg2ImgPipeline", "OnnxStableDiffusionInpaintPipeline", + "OnnxStableDiffusionInpaintPipelineLegacy", "OnnxStableDiffusionPipeline", "OnnxStableDiffusionUpscalePipeline", "StableDiffusionOnnxPipeline", @@ -403,195 +598,362 @@ ) try: - if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): + if not (is_torch_available() and is_librosa_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import ( - dummy_torch_and_transformers_and_k_diffusion_objects, - ) + from .utils import dummy_torch_and_librosa_objects # noqa F403 - _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_k_diffusion_objects)) -else: - _import_structure["stable_diffusion_k_diffusion"] = [ - "StableDiffusionKDiffusionPipeline", - "StableDiffusionXLKDiffusionPipeline", + _import_structure["utils.dummy_torch_and_librosa_objects"] = [ + name for name in dir(dummy_torch_and_librosa_objects) if not name.startswith("_") ] +else: + _import_structure["pipelines"].extend(["AudioDiffusionPipeline", "Mel"]) + try: - if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()): + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import ( - dummy_torch_and_transformers_and_sentencepiece_objects, - ) + from .utils import dummy_transformers_and_torch_and_note_seq_objects # noqa F403 - _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_sentencepiece_objects)) -else: - _import_structure["kolors"] = [ - "KolorsPipeline", - "KolorsImg2ImgPipeline", + _import_structure["utils.dummy_transformers_and_torch_and_note_seq_objects"] = [ + name for name in dir(dummy_transformers_and_torch_and_note_seq_objects) if not name.startswith("_") ] -try: - if not (is_torch_available() and is_transformers_available() and is_opencv_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ..utils import ( - dummy_torch_and_transformers_and_opencv_objects, - ) - _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_opencv_objects)) else: - _import_structure["consisid"] = ["ConsisIDPipeline"] + _import_structure["pipelines"].extend(["SpectrogramDiffusionPipeline"]) try: if not is_flax_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import dummy_flax_objects # noqa F403 + from .utils import dummy_flax_objects # noqa F403 + + _import_structure["utils.dummy_flax_objects"] = [ + name for name in dir(dummy_flax_objects) if not name.startswith("_") + ] + - _dummy_objects.update(get_objects_from_module(dummy_flax_objects)) else: - _import_structure["pipeline_flax_utils"] = ["FlaxDiffusionPipeline"] + _import_structure["models.controlnets.controlnet_flax"] = ["FlaxControlNetModel"] + _import_structure["models.modeling_flax_utils"] = ["FlaxModelMixin"] + _import_structure["models.unets.unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"] + _import_structure["models.vae_flax"] = ["FlaxAutoencoderKL"] + _import_structure["pipelines"].extend(["FlaxDiffusionPipeline"]) + _import_structure["schedulers"].extend( + [ + 
"FlaxDDIMScheduler", + "FlaxDDPMScheduler", + "FlaxDPMSolverMultistepScheduler", + "FlaxEulerDiscreteScheduler", + "FlaxKarrasVeScheduler", + "FlaxLMSDiscreteScheduler", + "FlaxPNDMScheduler", + "FlaxSchedulerMixin", + "FlaxScoreSdeVeScheduler", + ] + ) + + try: if not (is_flax_available() and is_transformers_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils import dummy_flax_and_transformers_objects # noqa F403 + from .utils import dummy_flax_and_transformers_objects # noqa F403 + + _import_structure["utils.dummy_flax_and_transformers_objects"] = [ + name for name in dir(dummy_flax_and_transformers_objects) if not name.startswith("_") + ] + - _dummy_objects.update(get_objects_from_module(dummy_flax_and_transformers_objects)) else: - _import_structure["controlnet"].extend(["FlaxStableDiffusionControlNetPipeline"]) - _import_structure["stable_diffusion"].extend( + _import_structure["pipelines"].extend( [ + "FlaxStableDiffusionControlNetPipeline", "FlaxStableDiffusionImg2ImgPipeline", "FlaxStableDiffusionInpaintPipeline", "FlaxStableDiffusionPipeline", - ] - ) - _import_structure["stable_diffusion_xl"].extend( - [ "FlaxStableDiffusionXLPipeline", ] ) +try: + if not (is_note_seq_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_note_seq_objects # noqa F403 + + _import_structure["utils.dummy_note_seq_objects"] = [ + name for name in dir(dummy_note_seq_objects) if not name.startswith("_") + ] + + +else: + _import_structure["pipelines"].extend(["MidiProcessor"]) + if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + from .configuration_utils import ConfigMixin + try: - if not is_torch_available(): + if not is_bitsandbytes_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_bitsandbytes_objects import * + else: + from .quantizers.quantization_config import BitsAndBytesConfig + + try: + if not is_gguf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_gguf_objects import * + else: + from .quantizers.quantization_config import GGUFQuantizationConfig + + try: + if not is_torchao_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torchao_objects import * + else: + from .quantizers.quantization_config import TorchAoConfig + + try: + if not is_optimum_quanto_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_pt_objects import * # noqa F403 + from .utils.dummy_optimum_quanto_objects import * + else: + from .quantizers.quantization_config import QuantoConfig + try: + if not is_onnx_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_onnx_objects import * # noqa F403 else: - from .auto_pipeline import ( + from .pipelines import OnnxRuntimeModel + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_pt_objects import * # noqa F403 + else: + from .hooks import ( + FasterCacheConfig, + HookRegistry, + PyramidAttentionBroadcastConfig, + apply_faster_cache, + apply_pyramid_attention_broadcast, + ) + from .models import ( + AllegroTransformer3DModel, + AsymmetricAutoencoderKL, + AuraFlowTransformer2DModel, + AutoencoderDC, + AutoencoderKL, + AutoencoderKLAllegro, + AutoencoderKLCogVideoX, + 
AutoencoderKLCosmos, + AutoencoderKLHunyuanVideo, + AutoencoderKLLTXVideo, + AutoencoderKLMagvit, + AutoencoderKLMochi, + AutoencoderKLTemporalDecoder, + AutoencoderKLWan, + AutoencoderOobleck, + AutoencoderTiny, + AutoModel, + CacheMixin, + CogVideoXTransformer3DModel, + CogView3PlusTransformer2DModel, + CogView4Transformer2DModel, + ConsisIDTransformer3DModel, + ConsistencyDecoderVAE, + ControlNetModel, + ControlNetUnionModel, + ControlNetXSAdapter, + CosmosTransformer3DModel, + DiTTransformer2DModel, + EasyAnimateTransformer3DModel, + FluxControlNetModel, + FluxMultiControlNetModel, + FluxTransformer2DModel, + HiDreamImageTransformer2DModel, + HunyuanDiT2DControlNetModel, + HunyuanDiT2DModel, + HunyuanDiT2DMultiControlNetModel, + HunyuanVideoFramepackTransformer3DModel, + HunyuanVideoTransformer3DModel, + I2VGenXLUNet, + Kandinsky3UNet, + LatteTransformer3DModel, + LTXVideoTransformer3DModel, + Lumina2Transformer2DModel, + LuminaNextDiT2DModel, + MochiTransformer3DModel, + ModelMixin, + MotionAdapter, + MultiAdapter, + MultiControlNetModel, + OmniGenTransformer2DModel, + PixArtTransformer2DModel, + PriorTransformer, + SanaControlNetModel, + SanaTransformer2DModel, + SD3ControlNetModel, + SD3MultiControlNetModel, + SD3Transformer2DModel, + SparseControlNetModel, + StableAudioDiTModel, + T2IAdapter, + T5FilmDecoder, + Transformer2DModel, + TransformerTemporalModel, + UNet1DModel, + UNet2DConditionModel, + UNet2DModel, + UNet3DConditionModel, + UNetControlNetXSModel, + UNetMotionModel, + UNetSpatioTemporalConditionModel, + UVit2DModel, + VQModel, + WanTransformer3DModel, + ) + from .optimization import ( + get_constant_schedule, + get_constant_schedule_with_warmup, + get_cosine_schedule_with_warmup, + get_cosine_with_hard_restarts_schedule_with_warmup, + get_linear_schedule_with_warmup, + get_polynomial_decay_schedule_with_warmup, + get_scheduler, + ) + from .pipelines import ( + AudioPipelineOutput, AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image, - ) - from .consistency_models import ConsistencyModelPipeline - from .dance_diffusion import DanceDiffusionPipeline - from .ddim import DDIMPipeline - from .ddpm import DDPMPipeline - from .deprecated import KarrasVePipeline, LDMPipeline, PNDMPipeline, RePaintPipeline, ScoreSdeVePipeline - from .dit import DiTPipeline - from .latent_diffusion import LDMSuperResolutionPipeline - from .photodoodle import PhotoDoodlePipeline - from .pipeline_utils import ( - AudioPipelineOutput, + BlipDiffusionControlNetPipeline, + BlipDiffusionPipeline, + CLIPImageProjection, + ConsistencyModelPipeline, + DanceDiffusionPipeline, + DDIMPipeline, + DDPMPipeline, DiffusionPipeline, + DiTPipeline, ImagePipelineOutput, + KarrasVePipeline, + LDMPipeline, + LDMSuperResolutionPipeline, + PNDMPipeline, + RePaintPipeline, + ScoreSdeVePipeline, StableDiffusionMixin, ) + from .quantizers import DiffusersQuantizer + from .schedulers import ( + AmusedScheduler, + CMStochasticIterativeScheduler, + CogVideoXDDIMScheduler, + CogVideoXDPMScheduler, + DDIMInverseScheduler, + DDIMParallelScheduler, + DDIMScheduler, + DDPMParallelScheduler, + DDPMScheduler, + DDPMWuerstchenScheduler, + DEISMultistepScheduler, + DPMSolverMultistepInverseScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EDMDPMSolverMultistepScheduler, + EDMEulerScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + FlowMatchEulerDiscreteScheduler, + FlowMatchHeunDiscreteScheduler, + FlowMatchLCMScheduler, + HeunDiscreteScheduler, + 
IPNDMScheduler, + KarrasVeScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LCMScheduler, + PNDMScheduler, + RePaintScheduler, + SASolverScheduler, + SchedulerMixin, + SCMScheduler, + ScoreSdeVeScheduler, + TCDScheduler, + UnCLIPScheduler, + UniPCMultistepScheduler, + VQDiffusionScheduler, + ) + from .training_utils import EMAModel try: - if not (is_torch_available() and is_librosa_available()): + if not (is_torch_available() and is_scipy_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_librosa_objects import * + from .utils.dummy_torch_and_scipy_objects import * # noqa F403 else: - from .deprecated import AudioDiffusionPipeline, Mel + from .schedulers import LMSDiscreteScheduler + + try: + if not (is_torch_available() and is_torchsde_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_torchsde_objects import * # noqa F403 + else: + from .schedulers import CosineDPMSolverMultistepScheduler, DPMSolverSDEScheduler try: if not (is_torch_available() and is_transformers_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_transformers_objects import * + from .utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .allegro import AllegroPipeline - from .amused import AmusedImg2ImgPipeline, AmusedInpaintPipeline, AmusedPipeline - from .animatediff import ( + from .pipelines import ( + AllegroPipeline, + AltDiffusionImg2ImgPipeline, + AltDiffusionPipeline, + AmusedImg2ImgPipeline, + AmusedInpaintPipeline, + AmusedPipeline, AnimateDiffControlNetPipeline, + AnimateDiffPAGPipeline, AnimateDiffPipeline, + PhotoDoodlePipeline, AnimateDiffSDXLPipeline, AnimateDiffSparseControlNetPipeline, AnimateDiffVideoToVideoControlNetPipeline, AnimateDiffVideoToVideoPipeline, - ) - from .audioldm import AudioLDMPipeline - from .audioldm2 import ( AudioLDM2Pipeline, AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel, - ) - from .aura_flow import AuraFlowPipeline - from .blip_diffusion import BlipDiffusionPipeline - from .cogvideo import ( + AudioLDMPipeline, + AuraFlowPipeline, + CLIPImageProjection, CogVideoXFunControlPipeline, CogVideoXImageToVideoPipeline, CogVideoXPipeline, CogVideoXVideoToVideoPipeline, - ) - from .cogview3 import CogView3PlusPipeline - from .cogview4 import CogView4ControlPipeline, CogView4Pipeline - from .controlnet import ( - BlipDiffusionControlNetPipeline, - StableDiffusionControlNetImg2ImgPipeline, - StableDiffusionControlNetInpaintPipeline, - StableDiffusionControlNetPipeline, - StableDiffusionXLControlNetImg2ImgPipeline, - StableDiffusionXLControlNetInpaintPipeline, - StableDiffusionXLControlNetPipeline, - StableDiffusionXLControlNetUnionImg2ImgPipeline, - StableDiffusionXLControlNetUnionInpaintPipeline, - StableDiffusionXLControlNetUnionPipeline, - ) - from .controlnet_hunyuandit import ( - HunyuanDiTControlNetPipeline, - ) - from .controlnet_sd3 import StableDiffusion3ControlNetInpaintingPipeline, StableDiffusion3ControlNetPipeline - from .controlnet_xs import ( - StableDiffusionControlNetXSPipeline, - StableDiffusionXLControlNetXSPipeline, - ) - from .cosmos import CosmosTextToWorldPipeline, CosmosVideoToWorldPipeline - from .deepfloyd_if import ( - IFImg2ImgPipeline, - IFImg2ImgSuperResolutionPipeline, - IFInpaintingPipeline, - IFInpaintingSuperResolutionPipeline, - IFPipeline, - IFSuperResolutionPipeline, - ) - from 
.deprecated import ( - AltDiffusionImg2ImgPipeline, - AltDiffusionPipeline, + CogView3PlusPipeline, + CogView4ControlPipeline, + CogView4Pipeline, + ConsisIDPipeline, + CosmosTextToWorldPipeline, + CosmosVideoToWorldPipeline, CycleDiffusionPipeline, - StableDiffusionInpaintPipelineLegacy, - StableDiffusionModelEditingPipeline, - StableDiffusionParadigmsPipeline, - StableDiffusionPix2PixZeroPipeline, - VersatileDiffusionDualGuidedPipeline, - VersatileDiffusionImageVariationPipeline, - VersatileDiffusionPipeline, - VersatileDiffusionTextToImagePipeline, - VQDiffusionPipeline, - ) - from .easyanimate import ( EasyAnimateControlPipeline, EasyAnimateInpaintPipeline, EasyAnimatePipeline, - ) - from .flux import ( FluxControlImg2ImgPipeline, FluxControlInpaintPipeline, FluxControlNetImg2ImgPipeline, @@ -603,18 +965,24 @@ FluxInpaintPipeline, FluxPipeline, FluxPriorReduxPipeline, - ReduxImageEncoder, - ) - from .hidream_image import HiDreamImagePipeline - from .hunyuan_video import ( + HiDreamImagePipeline, + HunyuanDiTControlNetPipeline, + HunyuanDiTPAGPipeline, + HunyuanDiTPipeline, HunyuanSkyreelsImageToVideoPipeline, HunyuanVideoFramepackPipeline, HunyuanVideoImageToVideoPipeline, HunyuanVideoPipeline, - ) - from .hunyuandit import HunyuanDiTPipeline - from .i2vgen_xl import I2VGenXLPipeline - from .kandinsky import ( + I2VGenXLPipeline, + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, + ImageTextPipelineOutput, + Kandinsky3Img2ImgPipeline, + Kandinsky3Pipeline, KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, KandinskyImg2ImgPipeline, @@ -622,8 +990,6 @@ KandinskyInpaintPipeline, KandinskyPipeline, KandinskyPriorPipeline, - ) - from .kandinsky2_2 import ( KandinskyV22CombinedPipeline, KandinskyV22ControlnetImg2ImgPipeline, KandinskyV22ControlnetPipeline, @@ -634,210 +1000,225 @@ KandinskyV22Pipeline, KandinskyV22PriorEmb2EmbPipeline, KandinskyV22PriorPipeline, - ) - from .kandinsky3 import ( - Kandinsky3Img2ImgPipeline, - Kandinsky3Pipeline, - ) - from .latent_consistency_models import ( LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline, - ) - from .latent_diffusion import LDMTextToImagePipeline - from .latte import LattePipeline - from .ledits_pp import ( - LEditsPPDiffusionPipelineOutput, - LEditsPPInversionPipelineOutput, + LattePipeline, + LDMTextToImagePipeline, LEditsPPPipelineStableDiffusion, LEditsPPPipelineStableDiffusionXL, - ) - from .ltx import LTXConditionPipeline, LTXImageToVideoPipeline, LTXLatentUpsamplePipeline, LTXPipeline - from .lumina import LuminaPipeline, LuminaText2ImgPipeline - from .lumina2 import Lumina2Pipeline, Lumina2Text2ImgPipeline - from .marigold import ( + LTXConditionPipeline, + LTXImageToVideoPipeline, + LTXLatentUpsamplePipeline, + LTXPipeline, + Lumina2Pipeline, + Lumina2Text2ImgPipeline, + LuminaPipeline, + LuminaText2ImgPipeline, MarigoldDepthPipeline, MarigoldIntrinsicsPipeline, MarigoldNormalsPipeline, - ) - from .mochi import MochiPipeline - from .musicldm import MusicLDMPipeline - from .omnigen import OmniGenPipeline - from .pag import ( - AnimateDiffPAGPipeline, - HunyuanDiTPAGPipeline, - KolorsPAGPipeline, + MochiPipeline, + MusicLDMPipeline, + OmniGenPipeline, + PaintByExamplePipeline, + PIAPipeline, + PixArtAlphaPipeline, PixArtSigmaPAGPipeline, + PixArtSigmaPipeline, + ReduxImageEncoder, + SanaControlNetPipeline, SanaPAGPipeline, + SanaPipeline, + SanaSprintImg2ImgPipeline, + SanaSprintPipeline, + 
SemanticStableDiffusionPipeline, + ShapEImg2ImgPipeline, + ShapEPipeline, + StableAudioPipeline, + StableAudioProjectionModel, + StableCascadeCombinedPipeline, + StableCascadeDecoderPipeline, + StableCascadePriorPipeline, + StableDiffusion3ControlNetInpaintingPipeline, + StableDiffusion3ControlNetPipeline, + StableDiffusion3Img2ImgPipeline, + StableDiffusion3InpaintPipeline, StableDiffusion3PAGImg2ImgPipeline, StableDiffusion3PAGPipeline, + StableDiffusion3Pipeline, + StableDiffusionAdapterPipeline, + StableDiffusionAttendAndExcitePipeline, + StableDiffusionControlNetImg2ImgPipeline, + StableDiffusionControlNetInpaintPipeline, StableDiffusionControlNetPAGInpaintPipeline, StableDiffusionControlNetPAGPipeline, - StableDiffusionPAGImg2ImgPipeline, - StableDiffusionPAGInpaintPipeline, - StableDiffusionPAGPipeline, - StableDiffusionXLControlNetPAGImg2ImgPipeline, - StableDiffusionXLControlNetPAGPipeline, - StableDiffusionXLPAGImg2ImgPipeline, - StableDiffusionXLPAGInpaintPipeline, - StableDiffusionXLPAGPipeline, - ) - from .paint_by_example import PaintByExamplePipeline - from .pia import PIAPipeline - from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline - from .sana import SanaControlNetPipeline, SanaPipeline, SanaSprintImg2ImgPipeline, SanaSprintPipeline - from .semantic_stable_diffusion import SemanticStableDiffusionPipeline - from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline - from .stable_audio import StableAudioPipeline, StableAudioProjectionModel - from .stable_cascade import ( - StableCascadeCombinedPipeline, - StableCascadeDecoderPipeline, - StableCascadePriorPipeline, - ) - from .stable_diffusion import ( - CLIPImageProjection, + StableDiffusionControlNetPipeline, + StableDiffusionControlNetXSPipeline, StableDiffusionDepth2ImgPipeline, + StableDiffusionDiffEditPipeline, + StableDiffusionGLIGENPipeline, + StableDiffusionGLIGENTextImagePipeline, StableDiffusionImageVariationPipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, + StableDiffusionInpaintPipelineLegacy, StableDiffusionInstructPix2PixPipeline, StableDiffusionLatentUpscalePipeline, + StableDiffusionLDM3DPipeline, + StableDiffusionModelEditingPipeline, + StableDiffusionPAGImg2ImgPipeline, + StableDiffusionPAGInpaintPipeline, + StableDiffusionPAGPipeline, + StableDiffusionPanoramaPipeline, + StableDiffusionParadigmsPipeline, StableDiffusionPipeline, + StableDiffusionPipelineSafe, + StableDiffusionPix2PixZeroPipeline, + StableDiffusionSAGPipeline, StableDiffusionUpscalePipeline, - StableUnCLIPImg2ImgPipeline, - StableUnCLIPPipeline, - ) - from .stable_diffusion_3 import ( - StableDiffusion3Img2ImgPipeline, - StableDiffusion3InpaintPipeline, - StableDiffusion3Pipeline, - ) - from .stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline - from .stable_diffusion_diffedit import StableDiffusionDiffEditPipeline - from .stable_diffusion_gligen import StableDiffusionGLIGENPipeline, StableDiffusionGLIGENTextImagePipeline - from .stable_diffusion_ldm3d import StableDiffusionLDM3DPipeline - from .stable_diffusion_panorama import StableDiffusionPanoramaPipeline - from .stable_diffusion_safe import StableDiffusionPipelineSafe - from .stable_diffusion_sag import StableDiffusionSAGPipeline - from .stable_diffusion_xl import ( + StableDiffusionXLAdapterPipeline, + StableDiffusionXLControlNetImg2ImgPipeline, + StableDiffusionXLControlNetInpaintPipeline, + StableDiffusionXLControlNetPAGImg2ImgPipeline, + StableDiffusionXLControlNetPAGPipeline, + StableDiffusionXLControlNetPipeline, 
+ StableDiffusionXLControlNetUnionImg2ImgPipeline, + StableDiffusionXLControlNetUnionInpaintPipeline, + StableDiffusionXLControlNetUnionPipeline, + StableDiffusionXLControlNetXSPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline, StableDiffusionXLInstructPix2PixPipeline, + StableDiffusionXLPAGImg2ImgPipeline, + StableDiffusionXLPAGInpaintPipeline, + StableDiffusionXLPAGPipeline, StableDiffusionXLPipeline, - ) - from .stable_video_diffusion import StableVideoDiffusionPipeline - from .t2i_adapter import ( - StableDiffusionAdapterPipeline, - StableDiffusionXLAdapterPipeline, - ) - from .text_to_video_synthesis import ( + StableUnCLIPImg2ImgPipeline, + StableUnCLIPPipeline, + StableVideoDiffusionPipeline, TextToVideoSDPipeline, TextToVideoZeroPipeline, TextToVideoZeroSDXLPipeline, - VideoToVideoSDPipeline, - ) - from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline - from .unidiffuser import ( - ImageTextPipelineOutput, + UnCLIPImageVariationPipeline, + UnCLIPPipeline, UniDiffuserModel, UniDiffuserPipeline, UniDiffuserTextDecoder, - ) - from .visualcloze import VisualClozeGenerationPipeline, VisualClozePipeline - from .wan import WanImageToVideoPipeline, WanPipeline, WanVideoToVideoPipeline - from .wuerstchen import ( + VersatileDiffusionDualGuidedPipeline, + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + VideoToVideoSDPipeline, + VisualClozeGenerationPipeline, + VisualClozePipeline, + VQDiffusionPipeline, + WanImageToVideoPipeline, + WanPipeline, + WanVideoToVideoPipeline, WuerstchenCombinedPipeline, WuerstchenDecoderPipeline, WuerstchenPriorPipeline, ) - try: - if not is_onnx_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - from ..utils.dummy_onnx_objects import * # noqa F403 - - else: - from .onnx_utils import OnnxRuntimeModel - - try: - if not (is_torch_available() and is_transformers_available() and is_onnx_available()): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_transformers_and_onnx_objects import * - else: - from .stable_diffusion import ( - OnnxStableDiffusionImg2ImgPipeline, - OnnxStableDiffusionInpaintPipeline, - OnnxStableDiffusionPipeline, - OnnxStableDiffusionUpscalePipeline, - StableDiffusionOnnxPipeline, - ) - - try: - if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_transformers_and_k_diffusion_objects import * - else: - from .stable_diffusion_k_diffusion import ( - StableDiffusionKDiffusionPipeline, - StableDiffusionXLKDiffusionPipeline, - ) - - try: - if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_transformers_and_sentencepiece_objects import * - else: - from .kolors import ( - KolorsImg2ImgPipeline, - KolorsPipeline, - ) - - try: - if not (is_torch_available() and is_transformers_available() and is_opencv_available()): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - from ..utils.dummy_torch_and_transformers_and_opencv_objects import * - else: - from .consisid import ConsisIDPipeline - - try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - 
from ..utils.dummy_flax_objects import * # noqa F403 - else: - from .pipeline_flax_utils import FlaxDiffusionPipeline - - try: - if not (is_flax_available() and is_transformers_available()): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - from ..utils.dummy_flax_and_transformers_objects import * - else: - from .controlnet import FlaxStableDiffusionControlNetPipeline - from .stable_diffusion import ( - FlaxStableDiffusionImg2ImgPipeline, - FlaxStableDiffusionInpaintPipeline, - FlaxStableDiffusionPipeline, - ) - from .stable_diffusion_xl import ( - FlaxStableDiffusionXLPipeline, - ) - - try: - if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - from ..utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403 - - else: - from .deprecated import ( - MidiProcessor, - SpectrogramDiffusionPipeline, - ) + try: + if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_transformers_and_k_diffusion_objects import * # noqa F403 + else: + from .pipelines import StableDiffusionKDiffusionPipeline, StableDiffusionXLKDiffusionPipeline + + try: + if not (is_torch_available() and is_transformers_available() and is_sentencepiece_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_transformers_and_sentencepiece_objects import * # noqa F403 + else: + from .pipelines import KolorsImg2ImgPipeline, KolorsPAGPipeline, KolorsPipeline + + try: + if not (is_torch_available() and is_transformers_available() and is_opencv_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_transformers_and_opencv_objects import * # noqa F403 + else: + from .pipelines import ConsisIDPipeline + + try: + if not (is_torch_available() and is_transformers_available() and is_onnx_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_transformers_and_onnx_objects import * # noqa F403 + else: + from .pipelines import ( + OnnxStableDiffusionImg2ImgPipeline, + OnnxStableDiffusionInpaintPipeline, + OnnxStableDiffusionInpaintPipelineLegacy, + OnnxStableDiffusionPipeline, + OnnxStableDiffusionUpscalePipeline, + StableDiffusionOnnxPipeline, + ) + + try: + if not (is_torch_available() and is_librosa_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_librosa_objects import * # noqa F403 + else: + from .pipelines import AudioDiffusionPipeline, Mel + + try: + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403 + else: + from .pipelines import SpectrogramDiffusionPipeline + + try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_flax_objects import * # noqa F403 + else: + from .models.controlnets.controlnet_flax import FlaxControlNetModel + from .models.modeling_flax_utils import FlaxModelMixin + from .models.unets.unet_2d_condition_flax import 
FlaxUNet2DConditionModel
+        from .models.vae_flax import FlaxAutoencoderKL
+        from .pipelines import FlaxDiffusionPipeline
+        from .schedulers import (
+            FlaxDDIMScheduler,
+            FlaxDDPMScheduler,
+            FlaxDPMSolverMultistepScheduler,
+            FlaxEulerDiscreteScheduler,
+            FlaxKarrasVeScheduler,
+            FlaxLMSDiscreteScheduler,
+            FlaxPNDMScheduler,
+            FlaxSchedulerMixin,
+            FlaxScoreSdeVeScheduler,
+        )
+
+    try:
+        if not (is_flax_available() and is_transformers_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_flax_and_transformers_objects import *  # noqa F403
+    else:
+        from .pipelines import (
+            FlaxStableDiffusionControlNetPipeline,
+            FlaxStableDiffusionImg2ImgPipeline,
+            FlaxStableDiffusionInpaintPipeline,
+            FlaxStableDiffusionPipeline,
+            FlaxStableDiffusionXLPipeline,
+        )
+
+    try:
+        if not (is_note_seq_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from .utils.dummy_note_seq_objects import *  # noqa F403
+    else:
+        from .pipelines import MidiProcessor

 else:
     import sys

@@ -847,6 +1228,5 @@
         globals()["__file__"],
         _import_structure,
         module_spec=__spec__,
+        extra_objects={"__version__": __version__},
     )
-    for name, value in _dummy_objects.items():
-        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/photodoodle/pipeline_photodoodle.py b/src/diffusers/pipelines/photodoodle/pipeline_photodoodle.py
index d7b46d9cb7bb..47ce0f34fd69 100644
--- a/src/diffusers/pipelines/photodoodle/pipeline_photodoodle.py
+++ b/src/diffusers/pipelines/photodoodle/pipeline_photodoodle.py
@@ -33,6 +33,36 @@
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import PhotoDoodlePipeline
+        >>> from diffusers.utils import load_image
+        >>> import torch
+
+        >>> pipeline = PhotoDoodlePipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
+        >>> pipeline = pipeline.to("cuda")
+        >>> # Load initial model weights
+        >>> pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle", weight_name="pretrain.safetensors")
+        >>> pipeline.fuse_lora()
+        >>> pipeline.unload_lora_weights()
+
+        >>> pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle", weight_name="sksmagiceffects.safetensors")
+
+        >>> # Generate image with text prompt and condition image
+        >>> prompt = "add a halo and wings for the cat by sksmagiceffects"
+        >>> condition_image = load_image("./sample.png")  # PIL Image
+        >>> height = 768
+        >>> width = 512
+        >>> # Prepare the input image
+        >>> condition_image = condition_image.resize((width, height)).convert("RGB")
+        >>> output = pipeline(prompt=prompt, condition_image=condition_image, height=height, width=width, num_inference_steps=28, guidance_scale=3.5)
+        >>> # Save the generated image
+        >>> output.images[0].save("photodoodle_results.png")
+        ```
+"""
+
+
 def calculate_shift(
     image_seq_len,
     base_seq_len: int = 256,
@@ -47,8 +77,8 @@ def calculate_shift(

 def prepare_latent_image_ids_2(height, width, device, dtype):
     latent_image_ids = torch.zeros(height//2, width//2, 3, device=device, dtype=dtype)
     latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height//2, device=device)[:, None]  # y coordinate
     latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width//2, device=device)[None, :]  # x coordinate
     return latent_image_ids

 def position_encoding_clone(batch_size, original_height, original_width, device, dtype):
@@ -61,6 +91,7 @@ def position_encoding_clone(batch_size, original_height, original_width, device,
     latent_image_ids = torch.concat([latent_image_ids, cond_latent_image_ids], dim=-2)
     return latent_image_ids

+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
 def retrieve_latents(
     encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
 ):
@@ -73,6 +104,8 @@ def retrieve_latents(
     else:
         raise AttributeError("Could not access latents of provided encoder_output")

+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 def retrieve_timesteps(
     scheduler,
     num_inference_steps: Optional[int] = None,
@@ -81,7 +114,29 @@ def retrieve_timesteps(
     sigmas: Optional[List[float]] = None,
     **kwargs,
 ):
-    """Retrieve timesteps from scheduler and handle custom timesteps/sigmas."""
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
     if timesteps is not None and sigmas is not None:
         raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
     if timesteps is not None:
@@ -109,12 +164,12 @@ def retrieve_timesteps(
     timesteps = scheduler.timesteps
     return timesteps, num_inference_steps

+
 class PhotoDoodlePipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
     r"""
-    PhotoDoodle pipeline for image generation.
+    The PhotoDoodle pipeline for image-conditioned text-to-image generation, built on Flux.

-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
-    implemented for all pipelines.
+    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/

     Args:
         transformer ([`FluxTransformer2DModel`]):
@@ -124,13 +179,17 @@ class PhotoDoodlePipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
         vae ([`AutoencoderKL`]):
             Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
         text_encoder ([`CLIPTextModel`]):
-            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
text_encoder_2 ([`T5EncoderModel`]): - Second frozen text encoder ([t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl)). + [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically + the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant. tokenizer (`CLIPTokenizer`): - A tokenizer for the text encoder. + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer). tokenizer_2 (`T5TokenizerFast`): - A tokenizer for the second text encoder. + Second Tokenizer of class + [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast). """ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae" @@ -266,7 +325,6 @@ def encode_prompt( lora_scale: Optional[float] = None, ): r""" - Encodes the prompt into text encoder hidden states. Args: prompt (`str` or `List[str]`, *optional*): @@ -334,6 +392,101 @@ def encode_prompt( return prompt_embeds, pooled_prompt_embeds, text_ids + # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image + def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): + if isinstance(generator, list): + image_latents = [ + retrieve_latents(self.vae.encode(image[i: i + 1]), generator=generator[i]) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor + + return image_latents + + def check_inputs( + self, + prompt, + prompt_2, + height, + width, + prompt_embeds=None, + pooled_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + max_sequence_length=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
+ ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + + if max_sequence_length is not None and max_sequence_length > 512: + raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}") + + @staticmethod + def _prepare_latent_image_ids(batch_size, height, width, device, dtype): + latent_image_ids = torch.zeros(height // 2, width // 2, 3) + latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None] + latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :] + latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape + latent_image_ids = latent_image_ids.reshape( + latent_image_id_height * latent_image_id_width, latent_image_id_channels + ) + return latent_image_ids.to(device=device, dtype=dtype) + + @staticmethod + def _pack_latents(latents, batch_size, num_channels_latents, height, width): + latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2) + latents = latents.permute(0, 2, 4, 1, 3, 5) + latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4) + return latents + + @staticmethod + def _unpack_latents(latents, height, width, vae_scale_factor): + batch_size, num_patches, channels = latents.shape + + height = height // vae_scale_factor + width = width // vae_scale_factor + + latents = latents.view(batch_size, height, width, channels // 4, 2, 2) + latents = latents.permute(0, 3, 1, 4, 2, 5) + + latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2) + + return latents + def enable_vae_slicing(self): r""" Enable sliced VAE decoding. 
When this option is enabled, the VAE will split the input tensor in slices to
@@ -403,6 +556,7 @@ def prepare_latents(
         else:
             image_latents = torch.cat([image_latents], dim=0)

+        # import pdb; pdb.set_trace()
         latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
         cond_latents = self._pack_latents(image_latents, batch_size, num_channels_latents, height, width)
@@ -610,4 +764,4 @@ def __call__(

         if not return_dict:
             return (image,)

-        return FluxPipelineOutput(images=image)
\ No newline at end of file
+        return FluxPipelineOutput(images=image)
\ No newline at end of file

From 442ad13d1853eabb3220707a2eafd648a30ac922 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Wed, 28 May 2025 13:04:15 +0530
Subject: [PATCH 4/4] PhotoDoodle by Ameer

---
 src/diffusers/pipelines/photodoodle/README.md | 66 -------------------
 1 file changed, 66 deletions(-)
 delete mode 100644 src/diffusers/pipelines/photodoodle/README.md

diff --git a/src/diffusers/pipelines/photodoodle/README.md b/src/diffusers/pipelines/photodoodle/README.md
deleted file mode 100644
index 67fcee3ae97f..000000000000
--- a/src/diffusers/pipelines/photodoodle/README.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# PhotoDoodle Pipeline
-
-The PhotoDoodle pipeline is designed for image generation with conditional image input. It uses a combination of text and image conditioning to generate high-quality images.
-
-## Model Architecture
-
-The pipeline uses the following components:
-
-1. **Transformer**: A FluxTransformer2DModel for denoising image latents
-2. **VAE**: An AutoencoderKL for encoding/decoding images
-3. **Text Encoders**:
-   - CLIP text encoder for initial text embedding
-   - T5 encoder for additional text understanding
-4. 
**Scheduler**: FlowMatchEulerDiscreteScheduler for the diffusion process - -## Usage - -```python -from diffusers import PhotoDoodlePipeline -import torch - -pipeline = PhotoDoodlePipeline.from_pretrained("black-forest-labs/FLUX.1-dev") -pipeline = pipeline.to("cuda") -# Load initial model weights -pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle", weight_name="pretrain.safetensors") -pipeline.fuse_lora() -pipeline.unload_lora_weights() - -pipeline.load_lora_weights("nicolaus-huang/PhotoDoodle",weight_name="sksmagiceffects.safetensors") - -# Generate image with text prompt and condition image -prompt = "add a halo and wings for the cat by sksmagiceffects" -condition_image = load_image("path/to/condition.jpg") # PIL Image -output = pipeline( - prompt=prompt, - condition_image=condition_image, - num_inference_steps=28, - guidance_scale=3.5 -) - -# Save the generated image -output.images[0].save("generated_image.png") -``` - -## Parameters - -- `prompt`: Text prompt for image generation -- `prompt_2`: Optional secondary prompt for T5 encoder -- `condition_image`: Input image for conditioning -- `height`: Output image height (default: 512) -- `width`: Output image width (default: 512) -- `num_inference_steps`: Number of denoising steps (default: 28) -- `guidance_scale`: Classifier-free guidance scale (default: 3.5) -- `num_images_per_prompt`: Number of images to generate per prompt -- `generator`: Random number generator for reproducibility -- `output_type`: Output format ("pil", "latent", or "pt") - -## Features - -- Dual text encoder architecture (CLIP + T5) -- Image conditioning support -- Position encoding for better spatial understanding -- Support for LoRA fine-tuning -- VAE slicing and tiling for memory efficiency -- Progress bar during generation -- Callback support for step-by-step monitoring \ No newline at end of file
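For orientation, the `_pack_latents` / `_unpack_latents` helpers added in `pipeline_photodoodle.py` above turn a spatial latent of shape `(batch, channels, height, width)` into the sequence of 2x2-patch tokens `(batch, (height/2)*(width/2), 4*channels)` that the Flux transformer consumes, and back. The snippet below is a minimal, self-contained sketch of that round trip using standalone copies of those two helpers; the tensor sizes and the `vae_scale_factor = 16` value are illustrative assumptions for the demo, not values taken from the patch.

```python
import torch


def pack_latents(latents, batch_size, num_channels_latents, height, width):
    # (B, C, H, W) -> (B, (H/2)*(W/2), 4*C): each 2x2 spatial patch becomes one token.
    latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
    latents = latents.permute(0, 2, 4, 1, 3, 5)
    return latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)


def unpack_latents(latents, height, width, vae_scale_factor):
    # Inverse of pack_latents; height and width are the target image size in pixels.
    batch_size, num_patches, channels = latents.shape
    height = height // vae_scale_factor  # patch-grid height
    width = width // vae_scale_factor  # patch-grid width
    latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
    latents = latents.permute(0, 3, 1, 4, 2, 5)
    return latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)


# Illustrative sizes: a 512x512 image with an 8x VAE downsample gives a 64x64 latent,
# so the token grid is 32x32 and the effective pixel-to-grid scale is 16.
batch, channels, lat_h, lat_w = 1, 16, 64, 64
x = torch.randn(batch, channels, lat_h, lat_w)

tokens = pack_latents(x, batch, channels, lat_h, lat_w)  # (1, 1024, 64)
restored = unpack_latents(tokens, 512, 512, vae_scale_factor=16)  # (1, 16, 64, 64)

assert tokens.shape == (1, 32 * 32, 16 * 4)
assert torch.equal(restored, x)  # the packing is an exact, lossless rearrangement
```

In the pipeline diff, both the noise latents and the VAE-encoded condition image are packed with `_pack_latents`, and `position_encoding_clone` concatenates positional ids for the two token blocks, which appears to be how the condition image is presented to the transformer as an extra block of tokens.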