Merged
Changes from 101 commits
Commits (155)
11d759c
only test
zRzRzRzRzRzRzR Dec 17, 2025
10fc39e
Merge branch 'huggingface:main' into cogview
zRzRzRzRzRzRzR Dec 22, 2025
413d2f4
Merge branch 'huggingface:main' into cogview
zRzRzRzRzRzRzR Dec 23, 2025
cd9956c
update
zRzRzRzRzRzRzR Dec 24, 2025
8e83ee7
use mrope
zRzRzRzRzRzRzR Dec 24, 2025
faaf33d
Merge remote-tracking branch 'upstream/main' into cogview
zRzRzRzRzRzRzR Dec 25, 2025
e5bd08e
new kind of impl
zRzRzRzRzRzRzR Dec 25, 2025
ba28d91
1
zRzRzRzRzRzRzR Dec 25, 2025
a136820
with vision?
zRzRzRzRzRzRzR Dec 26, 2025
ea57064
draft projector
zRzRzRzRzRzRzR Dec 27, 2025
e9f15a8
2
zRzRzRzRzRzRzR Dec 27, 2025
931c643
change vit shape
zRzRzRzRzRzRzR Dec 27, 2025
5873a98
use new config
zRzRzRzRzRzRzR Dec 27, 2025
58ada24
no tie
zRzRzRzRzRzRzR Dec 27, 2025
d3b4108
1
zRzRzRzRzRzRzR Dec 27, 2025
d66a0ac
use video token again
zRzRzRzRzRzRzR Dec 27, 2025
a39cf88
1
zRzRzRzRzRzRzR Dec 28, 2025
92a2322
remove video
zRzRzRzRzRzRzR Dec 28, 2025
67a59cf
Update modeling_glm_image.py
zRzRzRzRzRzRzR Dec 28, 2025
1da6998
1
zRzRzRzRzRzRzR Dec 30, 2025
cac0dc7
update
zRzRzRzRzRzRzR Dec 30, 2025
52aeace
Update modeling_glm_image.py
zRzRzRzRzRzRzR Dec 31, 2025
4e1eed3
update for test working
zRzRzRzRzRzRzR Dec 31, 2025
b4613d6
2
zRzRzRzRzRzRzR Jan 2, 2026
724275b
Delete modeling_siglip_tokenizer.py
zRzRzRzRzRzRzR Jan 2, 2026
8eceb91
1
zRzRzRzRzRzRzR Jan 2, 2026
da0d493
Delete modeling_siglip_tokenizer.py
zRzRzRzRzRzRzR Jan 2, 2026
67403d2
draft of vq
zRzRzRzRzRzRzR Jan 2, 2026
6f3c0c3
3
zRzRzRzRzRzRzR Jan 2, 2026
cff9919
2
zRzRzRzRzRzRzR Jan 2, 2026
dd71e05
testing
zRzRzRzRzRzRzR Jan 2, 2026
14db6fc
tes1
zRzRzRzRzRzRzR Jan 2, 2026
087cf3f
2
zRzRzRzRzRzRzR Jan 2, 2026
0b5360d
1
zRzRzRzRzRzRzR Jan 2, 2026
dd10578
12
zRzRzRzRzRzRzR Jan 2, 2026
bb4276b
using interpolate_pos_encoding
zRzRzRzRzRzRzR Jan 2, 2026
6c75bd3
vit prepare!
zRzRzRzRzRzRzR Jan 2, 2026
e0884b8
add processor
zRzRzRzRzRzRzR Jan 3, 2026
3d48d31
Delete modeling_siglip_flux_zh.py
zRzRzRzRzRzRzR Jan 3, 2026
fcdfdfc
2
zRzRzRzRzRzRzR Jan 3, 2026
7c34f14
input change
zRzRzRzRzRzRzR Jan 3, 2026
feb2bcb
add doc
zRzRzRzRzRzRzR Jan 3, 2026
d8823a2
Update glm_image.md
zRzRzRzRzRzRzR Jan 3, 2026
1f13301
bilinear
zRzRzRzRzRzRzR Jan 3, 2026
08a0078
using Qwen processing for multi image
zRzRzRzRzRzRzR Jan 3, 2026
5b2b3d9
update
zRzRzRzRzRzRzR Jan 4, 2026
3566f18
1
zRzRzRzRzRzRzR Jan 4, 2026
34738f5
4
zRzRzRzRzRzRzR Jan 4, 2026
9f4fea8
4
zRzRzRzRzRzRzR Jan 4, 2026
63edc1b
work
zRzRzRzRzRzRzR Jan 4, 2026
4361681
add fast processor
zRzRzRzRzRzRzR Jan 4, 2026
a7737b1
Update image_processing_auto.py
zRzRzRzRzRzRzR Jan 4, 2026
19daabf
GlmImageVQVAEResnetBlock
zRzRzRzRzRzRzR Jan 4, 2026
91bbfbb
2
zRzRzRzRzRzRzR Jan 4, 2026
27970c9
2
zRzRzRzRzRzRzR Jan 4, 2026
c853d12
using with new position
zRzRzRzRzRzRzR Jan 4, 2026
dc8e246
2
zRzRzRzRzRzRzR Jan 4, 2026
4b660e0
update
zRzRzRzRzRzRzR Jan 4, 2026
a5db1f0
1
zRzRzRzRzRzRzR Jan 4, 2026
d27b79f
preprocessing
zRzRzRzRzRzRzR Jan 4, 2026
1878f3b
2
zRzRzRzRzRzRzR Jan 4, 2026
577b923
for multi image
zRzRzRzRzRzRzR Jan 4, 2026
cd8d78f
2
zRzRzRzRzRzRzR Jan 4, 2026
a689905
for new decode
zRzRzRzRzRzRzR Jan 5, 2026
6c8b1ee
format
zRzRzRzRzRzRzR Jan 5, 2026
8cc46ed
doc
zRzRzRzRzRzRzR Jan 5, 2026
0bb1610
Merge branch 'main' into cogview
zRzRzRzRzRzRzR Jan 5, 2026
29afb44
1
zRzRzRzRzRzRzR Jan 5, 2026
899c3fc
using right patch_size
zRzRzRzRzRzRzR Jan 5, 2026
ea58b59
fix copy?
zRzRzRzRzRzRzR Jan 5, 2026
9e678ed
add para
zRzRzRzRzRzRzR Jan 5, 2026
f4ebfec
update
zRzRzRzRzRzRzR Jan 5, 2026
e3604b5
image token
zRzRzRzRzRzRzR Jan 5, 2026
fb07e1e
not working for fix_and_overwrite
zRzRzRzRzRzRzR Jan 5, 2026
3024962
remove indentation
zRzRzRzRzRzRzR Jan 5, 2026
1c940da
remove resnet
zRzRzRzRzRzRzR Jan 5, 2026
e67e0fa
add
zRzRzRzRzRzRzR Jan 5, 2026
b179db8
fix
zRzRzRzRzRzRzR Jan 5, 2026
7312ed2
temporal_patch_size remove
zRzRzRzRzRzRzR Jan 5, 2026
31623f9
support processor
zRzRzRzRzRzRzR Jan 5, 2026
042249a
update for some test
zRzRzRzRzRzRzR Jan 5, 2026
93ee4ca
Merge branch 'main' into cogview
zRzRzRzRzRzRzR Jan 5, 2026
7a3b6de
2
zRzRzRzRzRzRzR Jan 5, 2026
8394eb1
Merge branch 'cogview' of github.com:zRzRzRzRzRzRzR/transformers into…
zRzRzRzRzRzRzR Jan 5, 2026
40c9b65
update1
zRzRzRzRzRzRzR Jan 5, 2026
0f5ed53
Update modular_glm_image.py
zRzRzRzRzRzRzR Jan 5, 2026
4e0784e
update2
zRzRzRzRzRzRzR Jan 5, 2026
147daaf
update 2
zRzRzRzRzRzRzR Jan 5, 2026
19fcd6f
3
zRzRzRzRzRzRzR Jan 5, 2026
07d1942
4
zRzRzRzRzRzRzR Jan 5, 2026
58453a7
rebase init weight
zRzRzRzRzRzRzR Jan 5, 2026
13bc79f
check_docstrings
zRzRzRzRzRzRzR Jan 5, 2026
761bd87
fix some generation tests
zucchini-nlp Jan 6, 2026
091c0a0
skip the rest of tests
zucchini-nlp Jan 6, 2026
f309dee
add get_image_tokens
zRzRzRzRzRzRzR Jan 8, 2026
6591895
unused code
zucchini-nlp Jan 8, 2026
25ffbd0
Merge branch 'main' into cogview
zRzRzRzRzRzRzR Jan 8, 2026
9ba7540
update for main change?
zRzRzRzRzRzRzR Jan 8, 2026
68e0e15
using main typo
zRzRzRzRzRzRzR Jan 8, 2026
ff63ba0
fix FA2
zucchini-nlp Jan 8, 2026
2c1034a
update doc
zRzRzRzRzRzRzR Jan 8, 2026
b0393da
push rope index update
zucchini-nlp Jan 8, 2026
31151d3
GlmImageTextRotaryEmbedding
zRzRzRzRzRzRzR Jan 8, 2026
300234b
Delete test.png
zRzRzRzRzRzRzR Jan 8, 2026
941f875
1
zRzRzRzRzRzRzR Jan 8, 2026
3cb5c54
update
zRzRzRzRzRzRzR Jan 8, 2026
1c73033
3
zRzRzRzRzRzRzR Jan 8, 2026
6cf7ebb
Update modular_glm_image.py
zRzRzRzRzRzRzR Jan 8, 2026
df6d359
Update modular_glm_image.py
zRzRzRzRzRzRzR Jan 8, 2026
7063523
1
zRzRzRzRzRzRzR Jan 8, 2026
80629be
simply modular
zRzRzRzRzRzRzR Jan 9, 2026
998021a
Update modular_glm_image.py
zRzRzRzRzRzRzR Jan 9, 2026
28baf48
doc update
zRzRzRzRzRzRzR Jan 9, 2026
2b7884b
Update glmasr.md
zRzRzRzRzRzRzR Jan 9, 2026
1b2b63b
Merge branch 'main' into cogview
zRzRzRzRzRzRzR Jan 9, 2026
00a8e12
update attn
zRzRzRzRzRzRzR Jan 9, 2026
16f77aa
make position ids shape correct but needs checking values with mult i…
zucchini-nlp Jan 9, 2026
9ef5286
revert
zRzRzRzRzRzRzR Jan 9, 2026
3405107
revert
zRzRzRzRzRzRzR Jan 9, 2026
8184713
update
zRzRzRzRzRzRzR Jan 9, 2026
8a37eeb
1
zRzRzRzRzRzRzR Jan 9, 2026
526a960
1
zRzRzRzRzRzRzR Jan 9, 2026
53f6a01
2
zRzRzRzRzRzRzR Jan 9, 2026
8092122
must add device change
zRzRzRzRzRzRzR Jan 9, 2026
4b4380e
1
zRzRzRzRzRzRzR Jan 9, 2026
0886080
update
zRzRzRzRzRzRzR Jan 9, 2026
5ec417e
using llama type
zRzRzRzRzRzRzR Jan 9, 2026
fa50824
2
zRzRzRzRzRzRzR Jan 9, 2026
4c511ba
Update modular_glm_image.py
zRzRzRzRzRzRzR Jan 9, 2026
4d86dc0
models can't run, fix
zucchini-nlp Jan 9, 2026
33bd7a9
position ids, second try. Should work now
zucchini-nlp Jan 9, 2026
90e7768
Update modular_glm_image.py
zRzRzRzRzRzRzR Jan 10, 2026
f334e99
remove
zRzRzRzRzRzRzR Jan 12, 2026
b93a714
move prompt expand inside processing
zucchini-nlp Jan 12, 2026
f2e9ff4
typos and tiny fixes
zucchini-nlp Jan 12, 2026
bf95580
make it runnable with example script
zucchini-nlp Jan 12, 2026
c8c723b
nit: let's follow standard API
zucchini-nlp Jan 12, 2026
238d6db
using right
zRzRzRzRzRzRzR Jan 12, 2026
d55151e
Merge branch 'cogview' of github.com:zRzRzRzRzRzRzR/transformers into…
zRzRzRzRzRzRzR Jan 12, 2026
ac9cee1
update doc
zRzRzRzRzRzRzR Jan 12, 2026
74a467d
update
zRzRzRzRzRzRzR Jan 12, 2026
82c0530
update
zRzRzRzRzRzRzR Jan 12, 2026
fe7650d
resolution changed
zRzRzRzRzRzRzR Jan 12, 2026
fc582db
udate
zRzRzRzRzRzRzR Jan 12, 2026
9468522
1
zRzRzRzRzRzRzR Jan 12, 2026
34eae52
Merge branch 'main' into cogview
zRzRzRzRzRzRzR Jan 12, 2026
e27fd18
2
zRzRzRzRzRzRzR Jan 12, 2026
2d84676
3
zRzRzRzRzRzRzR Jan 12, 2026
a137785
Update check_repo.py
zRzRzRzRzRzRzR Jan 12, 2026
d750318
skip/overwrite tests
zucchini-nlp Jan 12, 2026
ef3af15
Merge branch 'main' into cogview
zucchini-nlp Jan 12, 2026
05510d6
Merge branch 'main' into cogview
sayakpaul Jan 13, 2026
0be1887
swap h and w in position ids!
zucchini-nlp Jan 13, 2026
8b3336f
Merge branch 'main' into cogview
zucchini-nlp Jan 13, 2026
d4350b4
require read token does not exist anymore. Wait, why is that not fixe…
zucchini-nlp Jan 13, 2026
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
@@ -519,6 +519,8 @@
title: glm4
- local: model_doc/glm4_moe
title: glm4_moe
- local: model_doc/glm_image
title: GlmImage
- local: model_doc/openai-gpt
title: GPT
- local: model_doc/gpt_neo
247 changes: 247 additions & 0 deletions docs/source/en/model_doc/glm_image.md
@@ -0,0 +1,247 @@
<!--Copyright 2025 the HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer.
-->
*This model was released on {release_date} and added to Hugging Face Transformers on 2026-01-10.*

# GlmImage

## Overview

GLM-Image is an image generation model that adopts a hybrid autoregressive + diffusion decoder architecture, effectively pushing the upper bound of visual fidelity and fine-grained detail. In general image generation quality it is on par with industry-standard LDM-based approaches, while demonstrating significant advantages in knowledge-intensive image generation scenarios.

Model architecture: a hybrid autoregressive + diffusion decoder design:

+ Autoregressive generator: a 9B-parameter model initialized from [GLM-4-9B-0414](https://huggingface.co/zai-org/GLM-4-9B-0414), with an expanded vocabulary to incorporate visual tokens. The model first generates a compact encoding of approximately 256 tokens, then expands to 1K–4K tokens, corresponding to 1K–2K high-resolution image outputs.
+ Diffusion decoder: a 7B-parameter decoder based on a single-stream DiT architecture for latent-space image decoding. It is equipped with a Glyph Encoder text module, significantly improving accurate text rendering within images.

Post-training with decoupled reinforcement learning: the model introduces a fine-grained, modular feedback strategy using the GRPO algorithm, substantially enhancing both semantic understanding and visual detail quality.

+ Autoregressive module: provides low-frequency feedback signals focused on aesthetics and semantic alignment, improving instruction following and artistic expressiveness.
+ Decoder module: delivers high-frequency feedback targeting detail fidelity and text accuracy, resulting in highly realistic textures, lighting, and color reproduction, as well as more precise text rendering.

GLM-Image supports both text-to-image and image-to-image generation within a single model:

+ Text-to-image: generates high-detail images from textual descriptions, with particularly strong performance in information-dense scenarios.
+ Image-to-image: supports a wide range of tasks, including image editing, style transfer, multi-subject consistency, and identity-preserving generation for people and objects.

`GlmImageForConditionalGeneration` is the autoregressive part of the GLM-Image model; for the full image generation pipeline, see the [GLM-Image pipeline in diffusers](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/glm_image).

This model was contributed by [Raushan Turganbay](https://huggingface.co/RaushanTurganbay) and [Yuxuan Zhang](https://huggingface.co/ZHANGYUXUAN-zR).

## Usage examples

The examples below show how to use GLM-Image, with or without image input, to generate the vision tokens consumed by the DiT decoder.

### Text-to-Image Generation

```python
from transformers import GlmImageForConditionalGeneration, AutoProcessor
import torch
import re
from math import sqrt

# Load model and processor
model_id = "zai-org/GLM-Image"
model = GlmImageForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_id, use_fast=True)


def parse_shape_info(prompt: str) -> tuple[str, int, int, int, int]:
    """Parse image dimensions and expand shape tokens for two-stage generation."""
    match = re.search(r'<sop>(\d+)\s+(\d+)<eop>', prompt)
    if match is None:
        raise ValueError("Prompt must contain a shape token such as '<sop>36 24<eop>'")
    token_h, token_w = int(match.group(1)), int(match.group(2))
    ratio = token_h / token_w
    # The preview grid keeps the target aspect ratio at roughly 16 x 16 = 256 tokens,
    # matching the compact encoding stage described in the overview.
    prev_token_h = int(sqrt(ratio) * 16)
    prev_token_w = int(sqrt(1 / ratio) * 16)

    old_shape = f'<sop>{token_h} {token_w}<eop>'
    new_shape = f'<sop>{token_h} {token_w}<eop><sop>{prev_token_h} {prev_token_w}<eop>'
    expanded_prompt = prompt.replace(old_shape, new_shape)

    return expanded_prompt, token_h, token_w, prev_token_h, prev_token_w
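
# Worked example: for '<sop>36 24<eop>' the final grid is 36 x 24 = 864 tokens, and
# the preview grid is int(sqrt(1.5) * 16) x int(sqrt(2/3) * 16) = 19 x 13 = 247 tokens,
# close to the ~256-token compact encoding mentioned in the overview.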
```

> **Review thread on `parse_shape_info`:**
>
> **Member:** can be part of processor's call imo, rather than asking users to compute h/w every time
>
> **Contributor (Author):** This only takes effect during text-to-image generation. Additionally, the second generated `{token_h} {token_w} {prev_token_h} {prev_token_w}` also needs to be fed into the tokenizer as text.
>
> **Member:** we can still do in processor imo. If it's only text-to-image, we check if `images is None` and then `self.apply_text_only_processing(text)`. After that we can pass it to tokenizer

```python


# Text-to-Image Generation
prompt = "A cute cartoon-style text design featuring the word 'Taro' in clean, bright white rounded letters with a soft, hand-drawn feel. The background is a gentle taro purple with a misty gradient effect, decorated with small stars, hearts, and bubble elements. The overall atmosphere is light and sweet, with soft lighting like afternoon sunshine casting a warm glow from the upper left.<sop>36 24<eop>"

prompt, token_h, token_w, prev_h, prev_w = parse_shape_info(prompt)
print(f"Large image: {token_h} x {token_w} = {token_h * token_w} tokens")
print(f"Small image: {prev_h} x {prev_w} = {prev_h * prev_w} tokens")

messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]

inputs = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
)

# Build the image grid for two-stage generation (target image grid and small preview grid)
inputs["image_grid_thw"] = torch.tensor([
[1, token_h, token_w],
[1, prev_h, prev_w],
])

# Calculate generation parameters
small_image_tokens = prev_h * prev_w
large_image_tokens = token_h * token_w
max_new_tokens = small_image_tokens + large_image_tokens + 1
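# The +1 leaves room for the end-of-sequence token; per the review thread further below,
# the model's output includes eos/bos in addition to the image tokens.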

inputs = inputs.to(model.device)

# Generate image tokens
outputs = model.generate(
**inputs,
max_new_tokens=max_new_tokens,
do_sample=True
)

# Extract large image tokens (skip small image tokens)
input_length = inputs["input_ids"].shape[-1]
generated_tokens = outputs[0][input_length:]
large_image_tokens_ids = generated_tokens[small_image_tokens:small_image_tokens + large_image_tokens].tolist()
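# For the 36 x 24 example: skip the first 247 preview tokens, keep the next 864.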

print(f"Total generated tokens: {len(outputs[0]) - input_length}")
print(f"Large image tokens: {len(large_image_tokens_ids)}")
```
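
The generated ids are not pixels; they index the visual vocabulary and are decoded into an image by the diffusion pipeline linked above. As a minimal sketch (not part of the official pipeline, and assuming generation ran for the full token budget), the flat id list can be arranged into the 2D grid it represents before hand-off:

```python
import torch

# Arrange the flat token ids into the (token_h, token_w) grid they represent.
token_grid = torch.tensor(large_image_tokens_ids).reshape(token_h, token_w)
print(token_grid.shape)  # torch.Size([36, 24]) for the example prompt above
```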

### Image-to-Image Generation

To adapt the text-to-image script for image-to-image generation, only the prompt and the input sections need to change:

```python
# Image-to-Image Generation

prompt = "Transform this image into a watercolor painting style with soft, flowing brushstrokes and pastel colors.<sop>36 24<eop>"

prompt, token_h, token_w, prev_h, prev_w = parse_shape_info(prompt)
print(f"Large image: {token_h} x {token_w} = {token_h * token_w} tokens")
print(f"Small image: {prev_h} x {prev_w} = {prev_h * prev_w} tokens")

# Path to the input image; the processor loads it from this URL/path
image_path = "input.png"  # Replace with your image path

messages = [
{
"role": "user",
"content": [
{"type": "image", "url": image_path},
{"type": "text", "text": prompt},
],
}
]

inputs = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
)

# Get existing image grid from input image and append target image dimensions
existing_grid = inputs.get("image_grid_thw")
inputs["image_grid_thw"] = torch.cat([
existing_grid,
torch.tensor([[1, token_h, token_w]])
], dim=0)

# For image-to-image, only generate large image tokens (no small preview needed)
large_image_tokens = token_h * token_w
max_new_tokens = large_image_tokens + 1

inputs = inputs.to(model.device)

# Generate image tokens
outputs = model.generate(
**inputs,
max_new_tokens=max_new_tokens,
do_sample=True
)

# Extract generated image tokens
input_length = inputs["input_ids"].shape[-1]
generated_tokens = outputs[0][input_length:]
large_image_tokens_ids = generated_tokens[:large_image_tokens].tolist()

print(f"Total generated tokens: {len(outputs[0]) - input_length}")
print(f"Large image tokens: {len(large_image_tokens_ids)}")
```
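
The overview also mentions multi-image tasks such as multi-subject consistency. Below is a rough, untested sketch of passing two reference images, assuming the processor emits one `image_grid_thw` row per input image (as the Qwen-style multi-image processing mentioned in the commit history suggests); the file names are hypothetical:

```python
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "subject.png"},    # hypothetical input paths
            {"type": "image", "url": "reference.png"},
            {"type": "text", "text": prompt},           # prompt parsed as in the examples above
        ],
    }
]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
)
# Append the target grid after the per-image rows, as in the single-image example.
inputs["image_grid_thw"] = torch.cat(
    [inputs["image_grid_thw"], torch.tensor([[1, token_h, token_w]])], dim=0
)
```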

## GlmImageConfig

[[autodoc]] GlmImageConfig

## GlmImageVisionConfig

[[autodoc]] GlmImageVisionConfig

## GlmImageTextConfig

[[autodoc]] GlmImageTextConfig

## GlmImageVQVAEConfig

[[autodoc]] GlmImageVQVAEConfig

## GlmImageImageProcessor

[[autodoc]] GlmImageImageProcessor
- preprocess

## GlmImageImageProcessorFast

[[autodoc]] GlmImageImageProcessorFast
- preprocess

## GlmImageProcessor

[[autodoc]] GlmImageProcessor

## GlmImageVisionModel

[[autodoc]] GlmImageVisionModel
- forward

## GlmImageTextModel

[[autodoc]] GlmImageTextModel
- forward

## GlmImageVQVAE

[[autodoc]] GlmImageVQVAE
- forward

## GlmImageModel

[[autodoc]] GlmImageModel
- forward

## GlmImageForConditionalGeneration

[[autodoc]] GlmImageForConditionalGeneration
- forward
1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
@@ -154,6 +154,7 @@
from .glm4v import *
from .glm4v_moe import *
from .glm46v import *
from .glm_image import *
from .glmasr import *
from .glpn import *
from .got_ocr2 import *
11 changes: 11 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
@@ -179,6 +179,10 @@
("glm4v_moe_vision", "Glm4vMoeVisionConfig"),
("glm4v_text", "Glm4vTextConfig"),
("glm4v_vision", "Glm4vVisionConfig"),
("glm_image", "GlmImageConfig"),
("glm_image_text", "GlmImageTextConfig"),
("glm_image_vision", "GlmImageVisionConfig"),
("glm_image_vqmodel", "GlmImageVQVAEConfig"),
("glmasr", "GlmAsrConfig"),
("glmasr_encoder", "GlmAsrEncoderConfig"),
("glpn", "GLPNConfig"),
@@ -633,6 +637,10 @@
("glm4v_moe_vision", "Glm4vMoeVisionModel"),
("glm4v_text", "GLM4V"),
("glm4v_vision", "Glm4vVisionModel"),
("glm_image", "GlmImage"),
("glm_image_text", "GlmImage"),
("glm_image_vision", "GlmImageVisionModel"),
("glm_image_vqmodel", "GlmImageVQVAE"),
("glmasr", "GLM-ASR"),
("glmasr_encoder", "GLM-ASR Encoder"),
("glpn", "GLPN"),
@@ -976,6 +984,9 @@
("glm4v_moe_vision", "glm4v_moe"),
("glm4v_text", "glm4v"),
("glm4v_moe_text", "glm4v_moe"),
("glm_image_vision", "glm_image"),
("glm_image_vqmodel", "glm_image"),
("glm_image_text", "glm_image"),
("glmasr_encoder", "glmasr"),
("grounding-dino", "grounding_dino"),
("mm-grounding-dino", "mm_grounding_dino"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/image_processing_auto.py
@@ -109,6 +109,7 @@
("git", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
("glm46v", ("Glm46VImageProcessor", "Glm46VImageProcessorFast")),
("glm4v", ("Glm4vImageProcessor", "Glm4vImageProcessorFast")),
("glm_image", ("GlmImageImageProcessor", "GlmImageImageProcessorFast")),
("glpn", ("GLPNImageProcessor", "GLPNImageProcessorFast")),
("got_ocr2", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),
("grounding-dino", ("GroundingDinoImageProcessor", "GroundingDinoImageProcessorFast")),
5 changes: 5 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -182,6 +182,10 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("glm4v_moe_vision", "Glm4vMoeVisionModel"),
("glm4v_text", "Glm4vTextModel"),
("glm4v_vision", "Glm4vVisionModel"),
("glm_image", "GlmImageModel"),
("glm_image_text", "GlmImageTextModel"),
("glm_image_vision", "GlmImageVisionModel"),
("glm_image_vqmodel", "GlmImageVQVAE"),
("glmasr", "GlmAsrForConditionalGeneration"),
("glmasr_encoder", "GlmAsrEncoder"),
("glpn", "GLPNModel"),
@@ -1022,6 +1026,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
("glm46v", "Glm46VForConditionalGeneration"),
("glm4v", "Glm4vForConditionalGeneration"),
("glm4v_moe", "Glm4vMoeForConditionalGeneration"),
("glm_image", "GlmImageForConditionalGeneration"),
> **Review thread on the `glm_image` auto-map entry:**
>
> **Member:** it can't generate text, only image no? in that case it's better suited under Multimodal LM mapping
>
> **Contributor (Author):** This model generates image tokens, but it is indeed a decode-only model with eos and bos in the output. Should it be placed here?
>
> **Member (@zucchini-nlp, Jan 6, 2026):** now that I saw that it is part of a diffusion pipe, it's hard to categorize it anywhere. The model output isn't final and must be decoded further in the diffusion part. I think we just don't add it in the auto-map in that case, the model can't be used as a standalone and we have no mapping for these types of models
>
> **Contributor (Author):** So, we're removing it here, right?
>
> **Member:** yeah, let's remove
("got_ocr2", "GotOcr2ForConditionalGeneration"),
("idefics", "IdeficsForVisionText2Text"),
("idefics2", "Idefics2ForConditionalGeneration"),
1 change: 1 addition & 0 deletions src/transformers/models/auto/processing_auto.py
@@ -78,6 +78,7 @@
("glm46v", "Glm46VProcessor"),
("glm4v", "Glm4vProcessor"),
("glm4v_moe", "Glm4vProcessor"),
("glm_image", "Glm4vProcessor"),
("glmasr", "GlmAsrProcessor"),
("got_ocr2", "GotOcr2Processor"),
("granite_speech", "GraniteSpeechProcessor"),
8 changes: 8 additions & 0 deletions src/transformers/models/auto/tokenization_auto.py
@@ -130,6 +130,14 @@
("gemma3n", "GemmaTokenizer" if is_tokenizers_available() else None),
("gemma3n_text", "GemmaTokenizer" if is_tokenizers_available() else None),
("git", "BertTokenizer" if is_tokenizers_available() else None),
("glm", "TokenizersBackend" if is_tokenizers_available() else None),
("glm4", "TokenizersBackend" if is_tokenizers_available() else None),
("glm4_moe", "TokenizersBackend" if is_tokenizers_available() else None),
("glm4v", "TokenizersBackend" if is_tokenizers_available() else None),
("glm4v_moe", "TokenizersBackend" if is_tokenizers_available() else None),
("glm_image", "TokenizersBackend" if is_tokenizers_available() else None),
("glmasr", "TokenizersBackend" if is_tokenizers_available() else None),
("got_ocr2", "TokenizersBackend" if is_tokenizers_available() else None),
("gpt-sw3", "GPTSw3Tokenizer" if is_sentencepiece_available() else None),
("gpt2", "GPT2Tokenizer" if is_tokenizers_available() else None),
("gpt_bigcode", "GPT2Tokenizer" if is_tokenizers_available() else None),
2 changes: 1 addition & 1 deletion src/transformers/models/glm4v/modeling_glm4v.py
@@ -915,7 +915,7 @@ def forward(
layer_outputs = decoder_layer(
hidden_states,
attention_mask=causal_mask,
position_ids=position_ids,
position_ids=text_position_ids,
past_key_values=past_key_values,
cache_position=cache_position,
position_embeddings=position_embeddings,
2 changes: 1 addition & 1 deletion src/transformers/models/glm4v/modular_glm4v.py
@@ -922,7 +922,7 @@ def forward(
layer_outputs = decoder_layer(
hidden_states,
attention_mask=causal_mask,
position_ids=position_ids,
position_ids=text_position_ids,
past_key_values=past_key_values,
cache_position=cache_position,
position_embeddings=position_embeddings,
31 changes: 31 additions & 0 deletions src/transformers/models/glm_image/__init__.py
@@ -0,0 +1,31 @@
# Copyright 2025 the HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
from .configuration_glm_image import *
from .image_processing_glm_image import *
from .image_processing_glm_image_fast import *
from .modeling_glm_image import *
from .processing_glm_image import *
else:
import sys

_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)