Commits
60 commits
17e2a42
npu attention enable ulysses
TmacAaron Nov 1, 2025
07ea078
[Modular]z-image (#12808)
yiyixuxu Dec 9, 2025
8b4722d
Fix Qwen Edit Plus modular for multi-image input (#12601)
sayakpaul Dec 9, 2025
be3c2a0
[WIP] Add Flux2 modular (#12763)
DN6 Dec 10, 2025
6708f5c
[docs] improve distributed inference cp docs. (#12810)
sayakpaul Dec 10, 2025
10e820a
post release 0.36.0 (#12804)
sayakpaul Dec 11, 2025
0eac64c
Update distributed_inference.md to correct syntax (#12827)
sayakpaul Dec 11, 2025
1567243
[lora] Remove lora docs unneeded and add " # Copied from ..." (#12824)
sayakpaul Dec 12, 2025
17c0e79
support CP in native flash attention (#12829)
sywangyi Dec 12, 2025
b8a4cba
[qwen-image] edit 2511 support (#12839)
naykun Dec 15, 2025
0c1ccc0
fix pytest tests/pipelines/pixart_sigma/test_pixart.py::PixArtSigmaPi…
sywangyi Dec 15, 2025
5851928
Support for control-lora (#10686)
lavinal712 Dec 15, 2025
a748a83
Add support for LongCat-Image (#12828)
junqiangwu Dec 15, 2025
5e48f46
fix the prefix_token_len bug (#12845)
junqiangwu Dec 16, 2025
87f7d11
extend TorchAoTest::test_model_memory_usage to other platform (#12768)
sywangyi Dec 17, 2025
f9c1e61
Qwen Image Layered Support (#12853)
naykun Dec 17, 2025
55463f7
Z-Image-Turbo ControlNet (#12792)
hlky Dec 17, 2025
b530968
Cosmos Predict2.5 Base: inference pipeline, scheduler & chkpt convers…
miguelmartin75 Dec 19, 2025
f7753b1
more update in modular (#12560)
yiyixuxu Dec 19, 2025
262ce19
Feature: Add Mambo-G Guidance as Guider (#12862)
MatrixTeam-AI Dec 19, 2025
0c4f6c9
Add `OvisImagePipeline` in `AUTO_TEXT2IMAGE_PIPELINES_MAPPING` (#12876)
alvarobartt Dec 22, 2025
973a077
Cosmos Predict2.5 14b Conversion (#12863)
miguelmartin75 Dec 22, 2025
52766e6
Use `T5Tokenizer` instead of `MT5Tokenizer` (removed in Transformers …
alvarobartt Dec 23, 2025
f6b6a71
Add z-image-omni-base implementation (#12857)
RuoyiDu Dec 24, 2025
1cdb872
fix torchao quantizer for new torchao versions (#12901)
vkuzo Dec 30, 2025
208cda8
fix Qwen Image Transformer single file loading mapping function to be…
mbalabanski Jan 2, 2026
4737806
Z-Image-Turbo from_single_file fix (#12888)
hlky Jan 2, 2026
d0ae34d
chore: fix dev version in setup.py (#12904)
DefTruth Jan 5, 2026
5ffb658
Community Pipeline: Add z-image differential img2img (#12882)
r4inm4ker Jan 5, 2026
0da1aa9
Fix typo in src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict…
miguelmartin75 Jan 6, 2026
3138e37
Fix wan 2.1 i2v context parallel (#12909)
DefTruth Jan 6, 2026
7c6d314
fix the use of device_map in CP docs (#12902)
sayakpaul Jan 6, 2026
b6098ca
[core] remove unneeded autoencoder methods when subclassing from `Aut…
sayakpaul Jan 6, 2026
88ffb00
Detect 2.0 vs 2.1 ZImageControlNetModel (#12861)
hlky Jan 6, 2026
db37140
Refactor environment variable assignments in workflow (#12916)
paulinebm Jan 6, 2026
e46354d
Add codeQL workflow (#12917)
paulinebm Jan 6, 2026
417f6b2
Delete .github/workflows/codeql.yml
paulinebm Jan 6, 2026
9b5a244
CodeQL workflow for security analysis
paulinebm Jan 6, 2026
41a6e86
Check for attention mask in backends that don't support it (#12892)
dxqb Jan 6, 2026
ade1059
[Flux.1] improve pos embed for ascend npu by computing on npu (#12897)
zhangtao0408 Jan 6, 2026
98479a9
LTX Video 0.9.8 long multi prompt (#12614)
yaoqih Jan 7, 2026
b4be29b
Add FSDP option for Flux2 (#12860)
leisuzz Jan 7, 2026
a033e7f
clean the format
TmacAaron Jan 7, 2026
8f30bff
Add transformer cache context for SkyReels-V2 pipelines & Update docs…
tolgacangoz Jan 7, 2026
961b9b2
[docs] fix torchao typo. (#12883)
sayakpaul Jan 7, 2026
6fb4c99
Update wan.md to remove unneeded hfoptions (#12890)
sayakpaul Jan 7, 2026
9fb6b89
Improve docstrings and type hints in scheduling_edm_euler.py (#12871)
delmalih Jan 7, 2026
dab000e
[Modular] Video for Mellon (#12924)
asomoza Jan 7, 2026
c10bdd9
Add LTX 2.0 Video Pipelines (#12915)
dg845 Jan 8, 2026
79c1107
Merge branch 'main' into npu_ulysses
TmacAaron Jan 8, 2026
8600b4c
Add environment variables to checkout step (#12927)
paulinebm Jan 8, 2026
002e7ef
register _native_npu_attention to _supports_context_parallel
TmacAaron Jan 8, 2026
9a5e827
change npu_fusion_attention's input_layout to BSND to eliminate redun…
TmacAaron Jan 8, 2026
51ba43c
Merge branch 'main' into npu_ulysses
sayakpaul Jan 8, 2026
8780c4a
Update format
TmacAaron Jan 8, 2026
b1f06b7
Improve docstrings and type hints in scheduling_consistency_decoder.p…
delmalih Jan 8, 2026
8b9f817
Fix: Remove hardcoded CUDA autocast in Kandinsky 5 to fix import warn…
adi776borate Jan 8, 2026
a812c87
Upgrade GitHub Actions for Node 24 compatibility (#12865)
salmanmkc Jan 9, 2026
91e5134
fix the warning torch_dtype is deprecated (#12841)
msdsm Jan 9, 2026
10dec67
Merge branch 'main' into npu_ulysses
sayakpaul Jan 9, 2026
115 changes: 99 additions & 16 deletions src/diffusers/models/attention_dispatch.py
@@ -893,6 +893,72 @@ def _sage_attention_backward_op(
raise NotImplementedError("Backward pass is not implemented for Sage attention.")


def _npu_attention_forward_op(
ctx: torch.autograd.function.FunctionCtx,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
dropout_p: float = 0.0,
is_causal: bool = False,
scale: Optional[float] = None,
enable_gqa: bool = False,
return_lse: bool = False,
_save_ctx: bool = True,
_parallel_config: Optional["ParallelConfig"] = None,
):
if return_lse:
raise ValueError("NPU attention backend does not support setting `return_lse=True`.")

# `npu_fusion_attention` with input_layout="BNSD" expects contiguous
# (batch, num_heads, seq_len, head_dim) tensors, so transpose from
# (batch, seq_len, num_heads, head_dim) and force contiguity before the call.
query = query.transpose(1, 2).contiguous()
key = key.transpose(1, 2).contiguous()
value = value.transpose(1, 2).contiguous()

out = npu_fusion_attention(
query,
key,
value,
query.size(1), # num_heads
input_layout="BNSD",
pse=None,
scale=1.0 / math.sqrt(query.shape[-1]) if scale is None else scale,
pre_tockens=65536,
next_tockens=65536,
keep_prob=1.0 - dropout_p,
sync=False,
inner_precise=0,
)[0]

# The backward pass is not implemented for this backend yet (see
# `_npu_attention_backward_op`), so nothing is saved to `ctx` even when `_save_ctx` is True.

out = out.transpose(1, 2).contiguous()
return out


# Backward is not implemented for the NPU fusion attention path yet.
def _npu_attention_backward_op(
ctx: torch.autograd.function.FunctionCtx,
grad_out: torch.Tensor,
*args,
**kwargs,
):
raise NotImplementedError("Backward pass is not implemented for Npu Fusion Attention.")


# ===== Context parallel =====


Expand Down Expand Up @@ -1722,22 +1788,39 @@ def _native_npu_attention(
) -> torch.Tensor:
if return_lse:
raise ValueError("NPU attention backend does not support setting `return_lse=True`.")
query, key, value = (x.transpose(1, 2).contiguous() for x in (query, key, value))
out = npu_fusion_attention(
query,
key,
value,
query.size(1), # num_heads
input_layout="BNSD",
pse=None,
scale=1.0 / math.sqrt(query.shape[-1]) if scale is None else scale,
pre_tockens=65536,
next_tockens=65536,
keep_prob=1.0 - dropout_p,
sync=False,
inner_precise=0,
)[0]
out = out.transpose(1, 2).contiguous()
if _parallel_config is None:
query, key, value = (x.transpose(1, 2).contiguous() for x in (query, key, value))
out = npu_fusion_attention(
query,
key,
value,
query.size(1), # num_heads
input_layout="BNSD",
# input_layout="BSND",
pse=None,
scale=1.0 / math.sqrt(query.shape[-1]) if scale is None else scale,
pre_tockens=65536,
next_tockens=65536,
keep_prob=1.0 - dropout_p,
sync=False,
inner_precise=0,
)[0]
out = out.transpose(1, 2).contiguous()
else:
out = _templated_context_parallel_attention(
query,
key,
value,
None,
dropout_p,
None,
scale,
None,
return_lse,
forward_op=_npu_attention_forward_op,
backward_op=_npu_attention_backward_op,
_parallel_config=_parallel_config,
)
return out


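To make the new context-parallel branch more concrete, below is a single-process simulation of the Ulysses-style sequence parallelism this PR targets (per the "npu attention enable ulysses" commits). It is a conceptual sketch only, not the `_templated_context_parallel_attention` implementation: plain tensor chunk/cat operations stand in for the distributed all-to-all collectives, and SDPA stands in for the NPU kernel.

```python
# Conceptual sketch of Ulysses-style context parallelism: each "rank" starts with a
# sequence shard (B, S/P, N, D); an all-to-all regroups data so each rank holds the
# full sequence for N/P heads, attention runs locally, and a second all-to-all
# restores the sequence sharding.
import torch
import torch.nn.functional as F

B, S, N, D, P = 2, 16, 8, 64, 4  # batch, seq, heads, head_dim, simulated world size
q = torch.randn(B, S, N, D)

# Reference: full self-attention on one device (q = k = v for brevity).
ref = F.scaled_dot_product_attention(
    q.transpose(1, 2), q.transpose(1, 2), q.transpose(1, 2)
).transpose(1, 2)

# Shard the sequence across "ranks".
seq_shards = list(q.chunk(P, dim=1))  # P tensors of shape (B, S/P, N, D)

# All-to-all #1: sequence-sharded -> head-sharded (each rank gets full S, N/P heads).
head_shards = [
    torch.cat([shard.chunk(P, dim=2)[rank] for shard in seq_shards], dim=1)
    for rank in range(P)
]

# Local attention per "rank" over its head shard.
out_head_shards = [
    F.scaled_dot_product_attention(
        x.transpose(1, 2), x.transpose(1, 2), x.transpose(1, 2)
    ).transpose(1, 2)
    for x in head_shards
]

# All-to-all #2: head-sharded -> sequence-sharded, then gather for comparison.
out_seq_shards = [
    torch.cat([x.chunk(P, dim=1)[rank] for x in out_head_shards], dim=2)
    for rank in range(P)
]
out = torch.cat(out_seq_shards, dim=1)

print(torch.allclose(out, ref, atol=1e-5))  # True: sharded result matches full attention
```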