Merged

99 commits
a8c5c54
upgrade activation_func to transformers v4.54
wcrzlh Aug 18, 2025
c05a707
feat(transformers): upgrade attn_mask/rope to 4.54
wcrzlh Aug 19, 2025
84b3ece
feat(transformers): upgrade modeling_layers to 4.54
wcrzlh Aug 19, 2025
6b94c2d
feat(transformers): upgrade cache_utils to 4.54
wcrzlh Aug 19, 2025
2f70121
feat(transformers): upgrade modeling_utils to v4.54
wcrzlh Aug 20, 2025
44ad424
feat(transformers): upgrade generation/utils to v4.54
wcrzlh Aug 20, 2025
17da5c7
feat(transformers): add ernie4.5 for validation
wcrzlh Aug 21, 2025
fd769b1
fix get_type_hints problem
wcrzlh Aug 21, 2025
37fe594
fix get_type_hints problem
wcrzlh Aug 21, 2025
0874e77
fix get_type_hints problem
wcrzlh Aug 21, 2025
94fb78b
fix metadata.get keyerror
wcrzlh Aug 21, 2025
bf69ef9
fix masking_utils alignment
wcrzlh Aug 21, 2025
a1be89c
fix generation/utils logic
wcrzlh Aug 21, 2025
b3334ac
fix get_output_embedding override bug
wcrzlh Aug 21, 2025
833419c
fix __init_subclass__ bug
wcrzlh Aug 21, 2025
c532a9a
suplement checkpoint_conversion_mapping
wcrzlh Aug 22, 2025
1ac2f72
feat(transformers): upgrade beam search to v4.54
wcrzlh Aug 22, 2025
375e6ab
feat(transformers): upgrade candidate_generator to v4.54
wcrzlh Aug 25, 2025
25033c1
feat(transformers): upgrade logits_process/stopping_criteria to v4.54
wcrzlh Aug 25, 2025
252f4aa
pre-commit
wcrzlh Aug 25, 2025
02834b0
pre-commit
wcrzlh Aug 25, 2025
65e8256
update backbone_utils
wtomin Aug 20, 2025
913cd3c
update generic
wtomin Aug 20, 2025
a0c9dc9
remove add_model_info_to_auto_map & update feature_extraction_utils.py
wtomin Aug 20, 2025
2a517d8
remove add_model_info_to_auto_map & update image_processing_base.py
wtomin Aug 20, 2025
8d29722
remove add_model_info_to_auto_map & update processing_utils.py
wtomin Aug 20, 2025
8a90ca6
remove add_model_info_to_auto_map & update video_utils.py
wtomin Aug 20, 2025
dcb98ac
tokenization_utils.py update
wtomin Aug 20, 2025
5120977
add_model_info_to_custom_pipelines
wtomin Aug 20, 2025
9a18655
update tokenization_utils_base.py
wtomin Aug 20, 2025
509a308
update image_transforms.py
wtomin Aug 20, 2025
33ed2be
update video_utils.py and image_utils.py
wtomin Aug 20, 2025
0d8142c
update image_utils.py & image_processing_utils_fast.py
wtomin Aug 20, 2025
991a783
update integration sdpa_attention.py
wtomin Aug 22, 2025
ccb0897
update mask_utils.py
wtomin Aug 22, 2025
00f2ba3
update modeling_flash_attention_utils.py
wtomin Aug 22, 2025
d0b34fb
update modeling_outputs.py
wtomin Aug 22, 2025
f75b06a
fix pre-commit errors
wtomin Aug 22, 2025
f9ea8ce
fix pre-commit errors
wtomin Aug 22, 2025
03635c5
rebase
wcrzlh Aug 25, 2025
7bed7a1
add modeling_layers.py from cui yushi
wtomin Aug 22, 2025
61b4f5c
fix import in transformers
wtomin Aug 22, 2025
9205ca2
Merge branch 'transformers_4.54_base' into transformer-v4.54.1
wtomin Aug 25, 2025
3e3f452
rm tokenization_utils.py and tokenization_utils_base.py
wtomin Aug 25, 2025
91609b9
resize stacked images one by one
wtomin Aug 25, 2025
ffd3377
remove torchvision decoders
wtomin Aug 25, 2025
b38bf63
fix get_default_dtype bug
wcrzlh Aug 25, 2025
f32b7cb
load module dynamically from mindone/transformers
wtomin Aug 25, 2025
2cb578b
not support FA
wtomin Aug 25, 2025
7ad706d
Merge pull request #2 from wtomin/transformer-v4.54.1
wcrzlh Aug 25, 2025
9457ebc
add video_processing_utils
wcrzlh Aug 25, 2025
32031d0
fix import error/add audio_utils/fix processor bug/attn_implementatio…
wtomin Aug 25, 2025
294d153
fix attn_implementation configuration bug
wcrzlh Aug 25, 2025
a44b0f6
Fix attn_implementation
wtomin Aug 26, 2025
ba674bc
fix fa bug/key_renaming_mapping bug
wcrzlh Aug 26, 2025
3ab17b0
pre-commit
wcrzlh Aug 26, 2025
ee91d87
upgrade modeling_utils/save_pretrained to transformersv4.54
wcrzlh Aug 26, 2025
ff82ffb
refactor fa part
wcrzlh Aug 26, 2025
58e07d6
Fix some model's UT
wtomin Aug 27, 2025
ab125b4
revert _support_dynamic_input to _support_jit
wcrzlh Aug 27, 2025
226bd0e
fix class name mismatch in generation/utils
wcrzlh Aug 27, 2025
d156ca6
fix pa error/delete unused fa part
wcrzlh Aug 27, 2025
fe3304b
remove unused part
wcrzlh Aug 27, 2025
934520f
generation/utils ops-->mint
wcrzlh Aug 27, 2025
4aab9fa
copyright/pre-commit
wcrzlh Aug 27, 2025
d104c56
fix bugs
wcrzlh Aug 27, 2025
ba0a8eb
supplement activation api
wcrzlh Aug 28, 2025
9e36ba8
reformat
wcrzlh Aug 28, 2025
738d9bb
remove losskwargs
wtomin Aug 28, 2025
c80e2fd
fix disable_grouping bug in image processing
wcrzlh Aug 28, 2025
10ec00b
fix attn_implementation setting in modeling_utils/from_pretrained
wcrzlh Aug 28, 2025
a813cf9
fix attn_implementation setting in modeling_utils/from_pretrained
wcrzlh Aug 28, 2025
cdebac0
fix modeling_utils/from_config mindspore_dtype setting, generation/ut…
wcrzlh Sep 11, 2025
7a20fe1
feat(transformers): add qwen3_vl/qwen3_vl_moe model
wcrzlh Sep 17, 2025
4079e6f
fix moe precision bug
wcrzlh Sep 18, 2025
c1cde3a
fix qwen3_vl moe memory bugs
wcrzlh Sep 19, 2025
721d0a3
supplement zero3 model weight shard for moe part
wcrzlh Sep 23, 2025
e43f3dd
fix qwen3_vl_moe precision bug
wcrzlh Sep 23, 2025
51515b9
fix qwen3_vl_moe precision bug
wcrzlh Sep 23, 2025
25c8110
fix moe part shard bug
wcrzlh Sep 24, 2025
9650f4f
pre-commit
wcrzlh Sep 24, 2025
3771434
reformat
wcrzlh Sep 24, 2025
2d5f9e7
Merge pull request #1310 from wcrzlh/qwen3_vl
vigo999 Sep 24, 2025
fed7ffc
fix(transformers): fix typos in qwen3_vl docs
wcrzlh Sep 24, 2025
f2b56bf
Merge pull request #1311 from wcrzlh/qwen3_vl
vigo999 Sep 24, 2025
3c81df8
feat(transformers): add processor for qwen3_vl (#1326)
wcrzlh Sep 28, 2025
6e6361a
fix(transformers): supplement condition of taking model as processor
wcrzlh Oct 9, 2025
e72e032
fix(transformers): reformat generation/utils
wcrzlh Oct 14, 2025
dcdec6c
fix(transformers): supplement candidate generator
wcrzlh Oct 14, 2025
47ca032
fix(transformers): supplement logits processor
wcrzlh Oct 15, 2025
c6df7fd
feat(transformers): add assisted_generation/dola_generation/contrasiv…
wcrzlh Oct 16, 2025
11ba44c
rebase
wcrzlh Oct 22, 2025
18d35f6
reformat
wcrzlh Oct 22, 2025
8b75291
fix import bug
wcrzlh Oct 22, 2025
9fd221f
fix ut bug
wcrzlh Oct 24, 2025
46639e0
update pyproject.toml
wcrzlh Oct 24, 2025
aa0e7b0
pre-commit
wcrzlh Oct 24, 2025
b11c421
reformat
wcrzlh Oct 24, 2025
bd76d4c
update loss_type
wcrzlh Oct 25, 2025
83 changes: 83 additions & 0 deletions examples/transformers/qwen3_vl/README.md
@@ -0,0 +1,83 @@
# Qwen3-VL series

## Introduction
[Qwen3-VL](https://huggingface.co/papers/2502.13923) is a multimodal vision-language model series, encompassing both dense and MoE variants as well as Instruct and Thinking versions. Building upon its predecessors, Qwen3-VL delivers significant improvements in visual understanding while maintaining strong pure-text capabilities. Key architectural advancements include: an enhanced MRoPE with an interleaved layout for better spatial-temporal modeling; DeepStack integration to effectively leverage multi-level features from the Vision Transformer (ViT); and improved video understanding through text-based time alignment, evolving from T-RoPE to text timestamp alignment for more precise temporal grounding. These innovations collectively enable Qwen3-VL to achieve superior performance on complex multimodal tasks.
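To make the interleaved layout concrete, here is a minimal sketch contrasting the sectioned MRoPE layout of earlier Qwen-VL models with an interleaved one. The channel count and the 6/5/5 split are illustrative assumptions, not the model's actual configuration:

```python
# Minimal sketch of MRoPE frequency-channel layouts; `dim` and the t/h/w split
# are illustrative assumptions, not Qwen3-VL's actual configuration.
dim = 16  # number of rotary frequency channels (illustrative)

# Sectioned (earlier Qwen-VL): contiguous channel blocks per axis.
sectioned = ["t"] * 6 + ["h"] * 5 + ["w"] * 5

# Interleaved (Qwen3-VL): axes alternate channel by channel.
interleaved = [("t", "h", "w")[i % 3] for i in range(dim)]

print("sectioned:  ", " ".join(sectioned))    # t t t t t t h h h h h w w w w w
print("interleaved:", " ".join(interleaved))  # t h w t h w ...
```

The interleaved layout spreads temporal, height, and width positions across the full frequency spectrum instead of confining each axis to one contiguous band.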

## Get Started

### Requirements
| mindspore | ascend driver | firmware | CANN toolkit/kernel |
|-----------|----------------|----------------|--------------------|
| 2.6.0 | 24.1.RC3.b080 | 7.5.T11.0.B088 | 8.1.RC1 |

### Installation
```bash
git clone https://github.com/mindspore-lab/mindone.git -b hf-transformers-4.54
cd mindone
pip install -e .
cd ..

# install transformers from source because Qwen3-VL support (transformers v4.57.dev.0) has not been released yet
git clone https://github.com/huggingface/transformers.git
cd transformers
git reset --hard d0af4269ec260b9c4aeeda24c346a469e44799e1
pip install -e .
cd ..

cd mindone/examples/transformers/qwen3_vl
```
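
To confirm that the source build is active, you can check the reported version (the exact dev version string below is an assumption and may differ):

```bash
# should print a development version, e.g. 4.57.0.dev0
python -c "import transformers; print(transformers.__version__)"
```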

## Quick Start

Here is a usage example of Qwen3-VL-4B-Instruct. You can run it with the following command:

```bash
# for Qwen3-VL-4B-Instruct inference
python generate_qwen3_vl.py \
    --model_name "Qwen/Qwen3-VL-4B-Instruct" \
    --image "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" \
    --prompt "Describe this image."
```

```bash
# for Qwen3-VL-30B-A3B-Instruct inference
msrun --worker_num=2 --local_worker_num=2 --master_port=8118 \
--log_dir=msrun_log --join=True --cluster_time_out=300 \
generate_qwen3_vl_moe.py \
--model_name "Qwen/Qwen3-VL-30B-A3B-Instruct" \
--image "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" \
--prompt "Describe this image." \
```

Image:
![sample image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg)

Prompt: Describe this image.

Qwen3-VL-4B Outputs:
```
['Of course, here is detailed description of the image provided.\n\n
This is a close-up photograph of a Pallas\'s cat ($Felis$, $manul$),
an endangered wild feline species native to Central Aisa.
...
**Appearance:** It has a stocky and robust build with short legs
and a large head relative to its body size. Its fur is thick and dense,
appearing somewhat fluffy or "matted,", which is characteristic']
```

Qwen3-VL-30B Outputs:
```
['Of course, here is detailed description of the image provided.\n\n
This is a dynamic and charming photograph of a Palla's cat (also known as a manul) in a snowy enviroment.
...
"Appearance:" The cat has a very distinctive apperance, characterized by its stocky, low-slung body and exceptionally
thick, dense fur. This coat is a mix of brownish"]
```

`model_name` and `image` can be replaced with your local paths. Give it a try with various images and prompts🤗🤗.
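
For example, with hypothetical local paths (placeholders only; adjust to your environment):

```bash
# --model_name and --image below are illustrative local paths
python generate_qwen3_vl.py \
    --model_name /path/to/Qwen3-VL-4B-Instruct \
    --image ./samples/cat.jpeg \
    --prompt "Describe this image."
```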

## Inference Speed
| model name | mindspore version | precision | cards | attention type | tokens/s |
|:------------------------------:|:-----------------:|:----------:|:-----:|:--------------:|:----------:|
| Qwen/Qwen3-VL-4B-Instruct | 2.6.0 | bf16 | 1 | flash_attn | 1.35 |
| Qwen/Qwen3-VL-30B-A3B-Instruct | 2.6.0 | bf16 | 2 | flash_attn | 0.5 |
79 changes: 79 additions & 0 deletions examples/transformers/qwen3_vl/generate_qwen3_vl.py
@@ -0,0 +1,79 @@
import argparse

import numpy as np

import mindspore as ms

from mindone.transformers import AutoProcessor, Qwen3VLForConditionalGeneration


def generate(args):
model = Qwen3VLForConditionalGeneration.from_pretrained(
args.model_name,
mindspore_dtype=ms.bfloat16,
attn_implementation=args.attn_implementation,
)

processor = AutoProcessor.from_pretrained(
args.model_name,
use_fast=False,
)

messages = [
{
"role": "user",
"content": [
{
"type": "image",
"url": args.image,
},
{
"type": "text",
"text": args.prompt,
},
],
}
]

inputs = processor.apply_chat_template(
messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="np"
)

# convert input to Tensor
for key, value in inputs.items():
if isinstance(value, np.ndarray):
inputs[key] = ms.tensor(value)
elif isinstance(value, list):
inputs[key] = ms.Tensor(value)

generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Qwen3VL demo.")

parser.add_argument("--prompt", type=str, default="Describe this image.")
parser.add_argument(
"--image",
type=str,
default="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
)
parser.add_argument(
"--model_name", type=str, default="Qwen/Qwen3-VL-4B-Instruct", help="Path to the pre-trained model."
)
parser.add_argument(
"--attn_implementation",
type=str,
default="flash_attention_2",
choices=["flash_attention_2", "eager"],
)

# Parse the arguments
args = parser.parse_args()

generate(args)
91 changes: 91 additions & 0 deletions examples/transformers/qwen3_vl/generate_qwen3_vl_moe.py
@@ -0,0 +1,91 @@
import argparse
from functools import partial

import numpy as np

import mindspore as ms
import mindspore.mint.distributed as dist
from mindspore.communication import GlobalComm

from mindone.trainers.zero import prepare_network
from mindone.transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration


def generate(args):
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
args.model_name,
mindspore_dtype=ms.bfloat16,
attn_implementation=args.attn_implementation,
)

# use zero3 parallel
shard_fn = partial(prepare_network, zero_stage=3, optimizer_parallel_group=GlobalComm.WORLD_COMM_GROUP)
model = shard_fn(model)

processor = AutoProcessor.from_pretrained(
args.model_name,
use_fast=False,
)

messages = [
{
"role": "user",
"content": [
{
"type": "image",
"url": args.image,
},
{
"type": "text",
"text": args.prompt,
},
],
}
]

inputs = processor.apply_chat_template(
messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="np"
)

# convert input to Tensor
for key, value in inputs.items():
if isinstance(value, np.ndarray):
inputs[key] = ms.tensor(value)
elif isinstance(value, list):
inputs[key] = ms.Tensor(value)

generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Qwen3VLMoE demo.")

parser.add_argument("--prompt", type=str, default="Describe this image.")
parser.add_argument(
"--image",
type=str,
default="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
)
parser.add_argument(
"--model_name", type=str, default="Qwen/Qwen3-VL-30B-A3B-Instruct", help="Path to the pre-trained model."
)
parser.add_argument(
"--attn_implementation",
type=str,
default="flash_attention_2",
choices=["flash_attention_2", "eager"],
)

# Parse the arguments
args = parser.parse_args()

# set up card communication
dist.init_process_group(backend="hccl")
ms.set_auto_parallel_context(parallel_mode="data_parallel")

generate(args)
3 changes: 3 additions & 0 deletions mindone/models/modules/parallel/__init__.py
@@ -2,6 +2,7 @@

from .conv import Conv1d, Conv2d, Conv3d, Mint_Conv2d, Mint_Conv3d
from .dense import Dense, Linear
from .moe_text_experts import MoeTextExperts

# {Original MindSpore Cell: New Cell in ZeRO3}
PARALLEL_MODULES = {
@@ -14,4 +15,6 @@
mint.nn.Linear: Linear,
}

SPECIAL_CASE_FOR_PARALLEL_MODULES = {nn.Cell: MoeTextExperts}

__all__ = ["Conv1d", "Conv2d", "Conv3d", "Mint_Conv2d", "Mint_Conv3d", "Dense", "Linear"]
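
For context, here is a hypothetical sketch of how these mappings might be consumed when preparing a network for ZeRO3. This is not mindone's actual `prepare_network` logic; `replace_cells` and the `is_moe_experts` predicate are invented names for illustration:

```python
from mindspore import nn

# Hypothetical traversal (NOT the real prepare_network): wrap known cell types
# via PARALLEL_MODULES, fall back to the loosely-keyed special case, and
# recurse into everything else.
def replace_cells(network: nn.Cell, zero_stage: int, group: str) -> nn.Cell:
    for name, cell in network.name_cells().items():
        wrapper_cls = PARALLEL_MODULES.get(type(cell))
        if wrapper_cls is None and getattr(cell, "is_moe_experts", False):
            # the special-case map is keyed on nn.Cell itself, so a predicate
            # (here an invented attribute) must pick out qualifying cells
            wrapper_cls = SPECIAL_CASE_FOR_PARALLEL_MODULES[nn.Cell]
        if wrapper_cls is not None:
            network.insert_child_to_cell(name, wrapper_cls(cell, zero_stage, group))
        else:
            replace_cells(cell, zero_stage, group)  # recurse into children
    return network
```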
70 changes: 70 additions & 0 deletions mindone/models/modules/parallel/moe_text_experts.py
@@ -0,0 +1,70 @@
from typing import Literal, Optional

from mindspore import Tensor
from mindspore import dtype as mstype
from mindspore import mint, nn
from mindspore.communication import get_group_size, get_rank
from mindspore.communication.management import GlobalComm
from mindspore.context import ParallelMode
from mindspore.parallel._utils import _get_parallel_mode

from .param_wrapper import ZeroParamWrapper


class MoeTextExperts(nn.Cell):
def __init__(
self,
net: nn.Cell,
zero_stage: Literal[0, 1, 2, 3] = 0,
optimizer_parallel_group: str = GlobalComm.WORLD_COMM_GROUP,
cell_type: Optional[mstype.Type] = None,
):
super().__init__(auto_prefix=False)
self.net = net
self.set_param_wrapper(zero_stage, optimizer_parallel_group, cell_type)

def set_param_wrapper(self, zero_stage, optimizer_parallel_group, cell_type=None):
self.param_wrapper_gate_up_proj = nn.Identity()
self.param_wrapper_down_proj = nn.Identity()
if zero_stage == 3:
# Init parallel settings
is_parallel = _get_parallel_mode() == ParallelMode.DATA_PARALLEL
op_group_size = get_group_size(optimizer_parallel_group) if is_parallel else 1
op_rank_id = get_rank(optimizer_parallel_group) if is_parallel else 0
self.op_group_size = op_group_size
self.op_rank_id = op_rank_id
self.param_wrapper_gate_up_proj = ZeroParamWrapper(
self.net.gate_up_proj, zero_stage, optimizer_parallel_group, cell_type
)
if self.param_wrapper_gate_up_proj.need_rewrite:
self.net.gate_up_proj.assign_value(
Tensor.from_numpy(
self.net.gate_up_proj.numpy().reshape(op_group_size, -1, *self.net.gate_up_proj.shape[1:])[
op_rank_id
]
)
)
self.param_wrapper_down_proj = ZeroParamWrapper(
self.net.down_proj, zero_stage, optimizer_parallel_group, cell_type
)
if self.param_wrapper_down_proj.need_rewrite:
self.net.down_proj.assign_value(
Tensor.from_numpy(
self.net.down_proj.numpy().reshape(op_group_size, -1, *self.net.down_proj.shape[1:])[op_rank_id]
)
)

def construct(self, hidden_states, routing_weights, router_indices):
batch_size = hidden_states.shape[0]
hidden_states = hidden_states.reshape(-1, self.net.hidden_size) # (num_tokens, hidden_size)

hidden_states = hidden_states.repeat(self.net.num_experts, 1)
hidden_states = hidden_states.view(self.net.num_experts, -1, self.net.hidden_size)

gate_up = mint.bmm(hidden_states, self.param_wrapper_gate_up_proj(self.net.gate_up_proj))
gate, up = gate_up.chunk(2, dim=-1) # not supported for DTensors
next_states = mint.bmm((up * self.net.act_fn(gate)), self.param_wrapper_down_proj(self.net.down_proj))
next_states = next_states.reshape(self.net.num_experts, batch_size, -1, self.net.hidden_size)
next_states = next_states * routing_weights.swapaxes(0, 1).view(self.net.num_experts, batch_size, -1)[..., None]
next_states = next_states.sum(dim=0)
return next_states
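
To make the shapes in `construct` concrete, here is a numpy sketch of the same dense-MoE computation under assumed sizes (all numbers illustrative; the activation is assumed to be SiLU): every token is run through every expert, and the expert outputs are then mixed by the routing weights.

```python
import numpy as np

# numpy sketch of the MoeTextExperts.construct math; sizes are illustrative
E, B, S, H, I = 4, 2, 3, 8, 16  # experts, batch, seq, hidden, intermediate

x = np.random.randn(B * S, H)            # flattened tokens, (num_tokens, hidden)
gate_up = np.random.randn(E, H, 2 * I)   # per-expert fused gate/up weights
down = np.random.randn(E, I, H)          # per-expert down projection
routing = np.random.rand(E, B * S)       # routing weight of each expert per token

h = np.broadcast_to(x, (E, B * S, H))    # replicate every token for every expert
gu = h @ gate_up                         # (E, num_tokens, 2I), batched matmul
gate, up = gu[..., :I], gu[..., I:]      # the chunk(2, dim=-1) split
act = up * gate / (1.0 + np.exp(-gate))  # up * SiLU(gate), assumed act_fn
out = (act @ down) * routing[..., None]  # (E, num_tokens, H), weighted per expert
y = out.sum(axis=0).reshape(B, S, H)     # mix experts, restore (batch, seq, hidden)
print(y.shape)  # (2, 3, 8)
```

Under ZeRO3 (the `need_rewrite` branch above), each rank keeps only its dim-0 slice of `gate_up_proj` and `down_proj`, and the `ZeroParamWrapper` calls in `construct` reassemble the full weights when they are needed.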