GLM-4.6V-Flash does not work #203

@gaynetdinov

Description

Despite the recently updated README, GLM-4.6V-Flash does not actually work.

My devices

$ docker exec -it llm-scaler-vllm bash

:: initializing oneAPI environment ...
   bash: BASH_VERSION = 5.2.21(1)-release
   args: Using "$@" for setvars.sh arguments: --force
:: ccl -- latest
:: compiler -- latest
:: debugger -- latest
:: dev-utilities -- latest
:: dnnl -- latest
:: dpcpp-ct -- latest
:: dpl -- latest
:: mkl -- latest
:: mpi -- latest
:: pti -- latest
:: tbb -- latest
:: umf -- latest
:: oneAPI environment initialized ::

root@7b3a1edbd93a:/llm# sycl-ls
[level_zero:gpu][level_zero:0] Intel(R) oneAPI Unified Runtime over Level-Zero, Intel(R) Graphics [0xe211] 20.1.0 [1.13.35563+7]
[level_zero:gpu][level_zero:1] Intel(R) oneAPI Unified Runtime over Level-Zero, Intel(R) UHD Graphics 770 12.2.0 [1.13.35563+7]
[opencl:cpu][opencl:0] Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i5-12500 OpenCL 3.0 (Build 0) [2025.19.4.0.18_160000.xmain-hotfix]
[opencl:gpu][opencl:1] Intel(R) OpenCL Graphics, Intel(R) Graphics [0xe211] OpenCL 3.0 NEO  [25.40.35563.7]
[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) UHD Graphics 770 OpenCL 3.0 NEO  [25.40.35563.7]

My Dockerfile

FROM intel/llm-scaler-vllm:1.2

# Required for GLM-4.6V / GLM-4.6V-Flash processor support
RUN python3 -m pip install "transformers==5.0.0rc0"

Built with:

docker build --no-cache -t llm-scaler-vllm:glm46v .
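
To confirm the pin actually lands in the built image, I check the installed transformers version (a quick sanity check against the image tag built above; --entrypoint bypasses whatever entrypoint the base image sets):

docker run --rm --entrypoint python3 llm-scaler-vllm:glm46v -c "import transformers; print(transformers.__version__)"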

docker-compose

services:
  llm-scaler-vllm:
    restart: unless-stopped
    image: llm-scaler-vllm:glm46v
    container_name: llm-scaler-vllm
    privileged: true
    devices:
      - /dev/dri:/dev/dri
    shm_size: "32g"

    volumes:
      - /home/damirca/.cache/huggingface:/root/.cache/huggingface
    environment:
      - HF_TOKEN=<...>
      - HUGGINGFACE_HUB_CACHE=/root/.cache/huggingface/hub
      - VLLM_API_KEY=<...>

    ports:
      - "9000:8000"

    entrypoint:
      - python3
      - -m
      - vllm.entrypoints.openai.api_server
    command:
      - --model
      - zai-org/GLM-4.6V-Flash
      - --dtype
      - float16
      - --gpu-memory-util
      - "0.95"
      - --enforce-eager
      - --max-model-len
      - "8192"
      - --block-size
      - "64"
      - --max-num-batched-tokens
      - "8192"
      - --no-enable-prefix-caching

  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open-webui
    restart: unless-stopped
    ports:
      - "5080:8080"            
    volumes:
      - open-webui-data:/app/backend/data
    environment:
      TZ: "Europe/Berlin"
      VLLM_DISABLE_TOKENIZER_GROUP: 1

volumes:
  open-webui-data:
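
For reference, once the server is up I would reach it through the mapped port like this (the key placeholder mirrors VLLM_API_KEY from the compose file; /v1/models is the standard vLLM OpenAI-compatible endpoint):

curl -s http://localhost:9000/v1/models -H "Authorization: Bearer <VLLM_API_KEY>"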

The error when I start the container

llm-scaler-vllm  | (APIServer pid=1) INFO 12-23 09:12:38 [api_server.py:1896] vLLM API server version 0.10.3.dev0+g01efc7ef7.d20251125
llm-scaler-vllm  | (APIServer pid=1) INFO 12-23 09:12:38 [utils.py:328] non-default args: {'model': 'zai-org/GLM-4.6V-Flash', 'dtype': 'float16', 'max_model_len': 8192, 'enforce_eager': True, 'block_size': 64, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False, 'max_num_batched_tokens': 8192}
llm-scaler-vllm  | (APIServer pid=1) 2025-12-23 09:12:38,616 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/zai-org/GLM-4.6V-Flash/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"
llm-scaler-vllm  | (APIServer pid=1) 2025-12-23 09:12:38,640 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/zai-org/GLM-4.6V-Flash/411bb4d77144a3f03accbf4b780f5acb8b7cde4e/config.json "HTTP/1.1 200 OK"
llm-scaler-vllm  | (APIServer pid=1) 2025-12-23 09:12:38,768 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/zai-org/GLM-4.6V-Flash/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"
llm-scaler-vllm  | (APIServer pid=1) 2025-12-23 09:12:38,791 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/zai-org/GLM-4.6V-Flash/411bb4d77144a3f03accbf4b780f5acb8b7cde4e/config.json "HTTP/1.1 200 OK"
open-webui       | WARNI [langchain_community.utils.user_agent] USER_AGENT environment variable not set, consider setting it to identify your requests.
llm-scaler-vllm  | (APIServer pid=1) 2025-12-23 09:12:39,113 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/zai-org/GLM-4.6V-Flash/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"
llm-scaler-vllm  | (APIServer pid=1) 2025-12-23 09:12:39,137 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/zai-org/GLM-4.6V-Flash/411bb4d77144a3f03accbf4b780f5acb8b7cde4e/config.json "HTTP/1.1 200 OK"
llm-scaler-vllm  | (APIServer pid=1) Unrecognized keys in `rope_parameters` for 'rope_type'='default': {'partial_rotary_factor', 'mrope_section'}
llm-scaler-vllm  | (APIServer pid=1) 2025-12-23 09:12:39,277 - httpx - INFO - HTTP Request: GET https://huggingface.co/api/models/zai-org/GLM-4.6V-Flash/tree/main?recursive=true&expand=false "HTTP/1.1 200 OK"
llm-scaler-vllm  | (APIServer pid=1) 2025-12-23 09:12:39,414 - httpx - INFO - HTTP Request: GET https://huggingface.co/api/models/zai-org/GLM-4.6V-Flash/tree/main?recursive=true&expand=false "HTTP/1.1 200 OK"
llm-scaler-vllm  | (APIServer pid=1) 2025-12-23 09:12:39,547 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/zai-org/GLM-4.6V-Flash/resolve/main/processor_config.json "HTTP/1.1 404 Not Found"
llm-scaler-vllm  | (APIServer pid=1) 2025-12-23 09:12:39,685 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/zai-org/GLM-4.6V-Flash/resolve/main/preprocessor_config.json "HTTP/1.1 307 Temporary Redirect"
llm-scaler-vllm  | (APIServer pid=1) 2025-12-23 09:12:39,709 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/zai-org/GLM-4.6V-Flash/411bb4d77144a3f03accbf4b780f5acb8b7cde4e/preprocessor_config.json "HTTP/1.1 200 OK"
open-webui       |
open-webui       |  ██████╗ ██████╗ ███████╗███╗   ██╗    ██╗    ██╗███████╗██████╗ ██╗   ██╗██╗
open-webui       | ██╔═══██╗██╔══██╗██╔════╝████╗  ██║    ██║    ██║██╔════╝██╔══██╗██║   ██║██║
open-webui       | ██║   ██║██████╔╝█████╗  ██╔██╗ ██║    ██║ █╗ ██║█████╗  ██████╔╝██║   ██║██║
open-webui       | ██║   ██║██╔═══╝ ██╔══╝  ██║╚██╗██║    ██║███╗██║██╔══╝  ██╔══██╗██║   ██║██║
open-webui       | ╚██████╔╝██║     ███████╗██║ ╚████║    ╚███╔███╔╝███████╗██████╔╝╚██████╔╝██║
open-webui       |  ╚═════╝ ╚═╝     ╚══════╝╚═╝  ╚═══╝     ╚══╝╚══╝ ╚══════╝╚═════╝  ╚═════╝ ╚═╝
open-webui       |
open-webui       |
open-webui       | v0.6.41 - building the best AI user interface.
open-webui       |
open-webui       | https://github.com/open-webui/open-webui
open-webui       |
Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 351477.99it/s]
open-webui       | INFO:     Started server process [1]
open-webui       | INFO:     Waiting for application startup.
open-webui       | 2025-12-23 10:12:42.231 | INFO     | open_webui.utils.logger:start_logger:162 - GLOBAL_LOG_LEVEL: INFO
open-webui       | 2025-12-23 10:12:42.231 | INFO     | open_webui.main:lifespan:579 - Installing external dependencies of functions and tools...
open-webui       | 2025-12-23 10:12:42.238 | INFO     | open_webui.utils.plugin:install_frontmatter_requirements:283 - No requirements found in frontmatter.
llm-scaler-vllm  | (APIServer pid=1) INFO 12-23 09:12:45 [__init__.py:743] Resolved architecture: Glm4vForConditionalGeneration
llm-scaler-vllm  | (APIServer pid=1) WARNING 12-23 09:12:45 [_logger.py:68] Casting torch.bfloat16 to torch.float16.
llm-scaler-vllm  | (APIServer pid=1) `torch_dtype` is deprecated! Use `dtype` instead!
llm-scaler-vllm  | (APIServer pid=1) INFO 12-23 09:12:45 [__init__.py:1816] Using max model len 8192
llm-scaler-vllm  | (APIServer pid=1) Traceback (most recent call last):
llm-scaler-vllm  | (APIServer pid=1)   File "<frozen runpy>", line 198, in _run_module_as_main
llm-scaler-vllm  | (APIServer pid=1)   File "<frozen runpy>", line 88, in _run_code
llm-scaler-vllm  | (APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/entrypoints/openai/api_server.py", line 2011, in <module>
llm-scaler-vllm  | (APIServer pid=1)     uvloop.run(run_server(args))
llm-scaler-vllm  | (APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run
llm-scaler-vllm  | (APIServer pid=1)     return __asyncio.run(
llm-scaler-vllm  | (APIServer pid=1)            ^^^^^^^^^^^^^^
llm-scaler-vllm  | (APIServer pid=1)   File "/usr/lib/python3.12/asyncio/runners.py", line 194, in run
llm-scaler-vllm  | (APIServer pid=1)     return runner.run(main)
llm-scaler-vllm  | (APIServer pid=1)            ^^^^^^^^^^^^^^^^
llm-scaler-vllm  | (APIServer pid=1)   File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
llm-scaler-vllm  | (APIServer pid=1)     return self._loop.run_until_complete(task)
llm-scaler-vllm  | (APIServer pid=1)            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
llm-scaler-vllm  | (APIServer pid=1)   File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
llm-scaler-vllm  | (APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper
llm-scaler-vllm  | (APIServer pid=1)     return await main
llm-scaler-vllm  | (APIServer pid=1)            ^^^^^^^^^^
llm-scaler-vllm  | (APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/entrypoints/openai/api_server.py", line 1941, in run_server
llm-scaler-vllm  | (APIServer pid=1)     await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
llm-scaler-vllm  | (APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/entrypoints/openai/api_server.py", line 1961, in run_server_worker
llm-scaler-vllm  | (APIServer pid=1)     async with build_async_engine_client(
llm-scaler-vllm  | (APIServer pid=1)   File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
llm-scaler-vllm  | (APIServer pid=1)     return await anext(self.gen)
llm-scaler-vllm  | (APIServer pid=1)            ^^^^^^^^^^^^^^^^^^^^^
llm-scaler-vllm  | (APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/entrypoints/openai/api_server.py", line 179, in build_async_engine_client
llm-scaler-vllm  | (APIServer pid=1)     async with build_async_engine_client_from_engine_args(
llm-scaler-vllm  | (APIServer pid=1)   File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
llm-scaler-vllm  | (APIServer pid=1)     return await anext(self.gen)
llm-scaler-vllm  | (APIServer pid=1)            ^^^^^^^^^^^^^^^^^^^^^
llm-scaler-vllm  | (APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/entrypoints/openai/api_server.py", line 205, in build_async_engine_client_from_engine_args
llm-scaler-vllm  | (APIServer pid=1)     vllm_config = engine_args.create_engine_config(usage_context=usage_context)
llm-scaler-vllm  | (APIServer pid=1)                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
llm-scaler-vllm  | (APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/engine/arg_utils.py", line 1337, in create_engine_config
llm-scaler-vllm  | (APIServer pid=1)     speculative_config = self.create_speculative_config(
llm-scaler-vllm  | (APIServer pid=1)                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
llm-scaler-vllm  | (APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/engine/arg_utils.py", line 1064, in create_speculative_config
llm-scaler-vllm  | (APIServer pid=1)     from vllm.transformers_utils.configs.speculators.base import (
llm-scaler-vllm  | (APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/transformers_utils/configs/__init__.py", line 28, in <module>
llm-scaler-vllm  | (APIServer pid=1)     from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
llm-scaler-vllm  | (APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/transformers_utils/configs/qwen3_next.py", line 21, in <module>
llm-scaler-vllm  | (APIServer pid=1)     from transformers.modeling_rope_utils import rope_config_validation
llm-scaler-vllm  | (APIServer pid=1) ImportError: cannot import name 'rope_config_validation' from 'transformers.modeling_rope_utils' (/usr/local/lib/python3.12/dist-packages/transformers/modeling_rope_utils.py)
llm-scaler-vllm exited with code 1 (restarting)
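
The failing import can be reproduced in isolation against the same image, which points at the transformers==5.0.0rc0 pin no longer providing the symbol this vLLM build expects:

docker run --rm --entrypoint python3 llm-scaler-vllm:glm46v -c "from transformers.modeling_rope_utils import rope_config_validation"

This raises the same ImportError as in the log above.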
