Despite the recently updated README, GLM-4.6V-Flash does not actually work.
My devices
$ docker exec -it llm-scaler-vllm bash
:: initializing oneAPI environment ...
bash: BASH_VERSION = 5.2.21(1)-release
args: Using "$@" for setvars.sh arguments: --force
:: ccl -- latest
:: compiler -- latest
:: debugger -- latest
:: dev-utilities -- latest
:: dnnl -- latest
:: dpcpp-ct -- latest
:: dpl -- latest
:: mkl -- latest
:: mpi -- latest
:: pti -- latest
:: tbb -- latest
:: umf -- latest
:: oneAPI environment initialized ::
root@7b3a1edbd93a:/llm# sycl-ls
[level_zero:gpu][level_zero:0] Intel(R) oneAPI Unified Runtime over Level-Zero, Intel(R) Graphics [0xe211] 20.1.0 [1.13.35563+7]
[level_zero:gpu][level_zero:1] Intel(R) oneAPI Unified Runtime over Level-Zero, Intel(R) UHD Graphics 770 12.2.0 [1.13.35563+7]
[opencl:cpu][opencl:0] Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i5-12500 OpenCL 3.0 (Build 0) [2025.19.4.0.18_160000.xmain-hotfix]
[opencl:gpu][opencl:1] Intel(R) OpenCL Graphics, Intel(R) Graphics [0xe211] OpenCL 3.0 NEO [25.40.35563.7]
[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) UHD Graphics 770 OpenCL 3.0 NEO [25.40.35563.7]
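As an extra sanity check I also confirm the discrete GPU is visible to PyTorch from inside the container (a quick sketch, assuming the image's torch build exposes the XPU backend; device indices may differ on other setups):

docker exec -it llm-scaler-vllm python3 -c "import torch; print(torch.xpu.is_available(), torch.xpu.device_count())"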
My Dockerfile
FROM intel/llm-scaler-vllm:1.2
# Required for GLM-4.6V / GLM-4.6V-Flash processor support
RUN python3 -m pip install "transformers==5.0.0rc0"
Built with:
docker build --no-cache -t llm-scaler-vllm:glm46v .
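To rule out a stale layer I verify the transformers pin inside the built image (a minimal check, with the image's entrypoint overridden so only python3 runs):

docker run --rm --entrypoint python3 llm-scaler-vllm:glm46v -c "import transformers; print(transformers.__version__)"

This should print the pinned 5.0.0rc0 if the RUN layer was built correctly.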
docker-compose
services:
  llm-scaler-vllm:
    restart: unless-stopped
    image: llm-scaler-vllm:glm46v
    container_name: llm-scaler-vllm
    privileged: true
    devices:
      - /dev/dri:/dev/dri
    shm_size: "32g"
    volumes:
      - /home/damirca/.cache/huggingface:/root/.cache/huggingface
    environment:
      - HF_TOKEN=<...>
      - HUGGINGFACE_HUB_CACHE=/root/.cache/huggingface/hub
      - VLLM_API_KEY=<...>
    ports:
      - "9000:8000"
    entrypoint:
      - python3
      - -m
      - vllm.entrypoints.openai.api_server
    command:
      - --model
      - zai-org/GLM-4.6V-Flash
      - --dtype
      - float16
      - --gpu-memory-util
      - "0.95"
      - --enforce-eager
      - --max-model-len
      - "8192"
      - --block-size
      - "64"
      - --max-num-batched-tokens
      - "8192"
      - --no-enable-prefix-caching

  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open-webui
    restart: unless-stopped
    ports:
      - "5080:8080"
    volumes:
      - open-webui-data:/app/backend/data
    environment:
      TZ: "Europe/Berlin"
      VLLM_DISABLE_TOKENIZER_GROUP: 1

volumes:
  open-webui-data:
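For reference, once the API server is up I would only hit it through the mapped port, roughly like this (<...> is the same VLLM_API_KEY placeholder as in the compose file):

curl http://localhost:9000/v1/models -H "Authorization: Bearer <...>"

It never gets that far, though; the container dies during startup as shown below.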
The error when I start the container
llm-scaler-vllm | (APIServer pid=1) INFO 12-23 09:12:38 [api_server.py:1896] vLLM API server version 0.10.3.dev0+g01efc7ef7.d20251125
llm-scaler-vllm | (APIServer pid=1) INFO 12-23 09:12:38 [utils.py:328] non-default args: {'model': 'zai-org/GLM-4.6V-Flash', 'dtype': 'float16', 'max_model_len': 8192, 'enforce_eager': True, 'block_size': 64, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False, 'max_num_batched_tokens': 8192}
llm-scaler-vllm | (APIServer pid=1) 2025-12-23 09:12:38,616 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/zai-org/GLM-4.6V-Flash/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"
llm-scaler-vllm | (APIServer pid=1) 2025-12-23 09:12:38,640 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/zai-org/GLM-4.6V-Flash/411bb4d77144a3f03accbf4b780f5acb8b7cde4e/config.json "HTTP/1.1 200 OK"
llm-scaler-vllm | (APIServer pid=1) 2025-12-23 09:12:38,768 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/zai-org/GLM-4.6V-Flash/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"
llm-scaler-vllm | (APIServer pid=1) 2025-12-23 09:12:38,791 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/zai-org/GLM-4.6V-Flash/411bb4d77144a3f03accbf4b780f5acb8b7cde4e/config.json "HTTP/1.1 200 OK"
open-webui | WARNI [langchain_community.utils.user_agent] USER_AGENT environment variable not set, consider setting it to identify your requests.
llm-scaler-vllm | (APIServer pid=1) 2025-12-23 09:12:39,113 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/zai-org/GLM-4.6V-Flash/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"
llm-scaler-vllm | (APIServer pid=1) 2025-12-23 09:12:39,137 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/zai-org/GLM-4.6V-Flash/411bb4d77144a3f03accbf4b780f5acb8b7cde4e/config.json "HTTP/1.1 200 OK"
llm-scaler-vllm | (APIServer pid=1) Unrecognized keys in `rope_parameters` for 'rope_type'='default': {'partial_rotary_factor', 'mrope_section'}
llm-scaler-vllm | (APIServer pid=1) 2025-12-23 09:12:39,277 - httpx - INFO - HTTP Request: GET https://huggingface.co/api/models/zai-org/GLM-4.6V-Flash/tree/main?recursive=true&expand=false "HTTP/1.1 200 OK"
llm-scaler-vllm | (APIServer pid=1) 2025-12-23 09:12:39,414 - httpx - INFO - HTTP Request: GET https://huggingface.co/api/models/zai-org/GLM-4.6V-Flash/tree/main?recursive=true&expand=false "HTTP/1.1 200 OK"
llm-scaler-vllm | (APIServer pid=1) 2025-12-23 09:12:39,547 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/zai-org/GLM-4.6V-Flash/resolve/main/processor_config.json "HTTP/1.1 404 Not Found"
llm-scaler-vllm | (APIServer pid=1) 2025-12-23 09:12:39,685 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/zai-org/GLM-4.6V-Flash/resolve/main/preprocessor_config.json "HTTP/1.1 307 Temporary Redirect"
llm-scaler-vllm | (APIServer pid=1) 2025-12-23 09:12:39,709 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/zai-org/GLM-4.6V-Flash/411bb4d77144a3f03accbf4b780f5acb8b7cde4e/preprocessor_config.json "HTTP/1.1 200 OK"
open-webui |
open-webui | ██████╗ ██████╗ ███████╗███╗ ██╗ ██╗ ██╗███████╗██████╗ ██╗ ██╗██╗
open-webui | ██╔═══██╗██╔══██╗██╔════╝████╗ ██║ ██║ ██║██╔════╝██╔══██╗██║ ██║██║
open-webui | ██║ ██║██████╔╝█████╗ ██╔██╗ ██║ ██║ █╗ ██║█████╗ ██████╔╝██║ ██║██║
open-webui | ██║ ██║██╔═══╝ ██╔══╝ ██║╚██╗██║ ██║███╗██║██╔══╝ ██╔══██╗██║ ██║██║
open-webui | ╚██████╔╝██║ ███████╗██║ ╚████║ ╚███╔███╔╝███████╗██████╔╝╚██████╔╝██║
open-webui | ╚═════╝ ╚═╝ ╚══════╝╚═╝ ╚═══╝ ╚══╝╚══╝ ╚══════╝╚═════╝ ╚═════╝ ╚═╝
open-webui |
open-webui |
open-webui | v0.6.41 - building the best AI user interface.
open-webui |
open-webui | https://github.com/open-webui/open-webui
open-webui |
Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 351477.99it/s]
open-webui | INFO: Started server process [1]
open-webui | INFO: Waiting for application startup.
open-webui | 2025-12-23 10:12:42.231 | INFO | open_webui.utils.logger:start_logger:162 - GLOBAL_LOG_LEVEL: INFO
open-webui | 2025-12-23 10:12:42.231 | INFO | open_webui.main:lifespan:579 - Installing external dependencies of functions and tools...
open-webui | 2025-12-23 10:12:42.238 | INFO | open_webui.utils.plugin:install_frontmatter_requirements:283 - No requirements found in frontmatter.
llm-scaler-vllm | (APIServer pid=1) INFO 12-23 09:12:45 [__init__.py:743] Resolved architecture: Glm4vForConditionalGeneration
llm-scaler-vllm | (APIServer pid=1) WARNING 12-23 09:12:45 [_logger.py:68] Casting torch.bfloat16 to torch.float16.
llm-scaler-vllm | (APIServer pid=1) `torch_dtype` is deprecated! Use `dtype` instead!
llm-scaler-vllm | (APIServer pid=1) INFO 12-23 09:12:45 [__init__.py:1816] Using max model len 8192
llm-scaler-vllm | (APIServer pid=1) Traceback (most recent call last):
llm-scaler-vllm | (APIServer pid=1) File "<frozen runpy>", line 198, in _run_module_as_main
llm-scaler-vllm | (APIServer pid=1) File "<frozen runpy>", line 88, in _run_code
llm-scaler-vllm | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/entrypoints/openai/api_server.py", line 2011, in <module>
llm-scaler-vllm | (APIServer pid=1) uvloop.run(run_server(args))
llm-scaler-vllm | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run
llm-scaler-vllm | (APIServer pid=1) return __asyncio.run(
llm-scaler-vllm | (APIServer pid=1) ^^^^^^^^^^^^^^
llm-scaler-vllm | (APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 194, in run
llm-scaler-vllm | (APIServer pid=1) return runner.run(main)
llm-scaler-vllm | (APIServer pid=1) ^^^^^^^^^^^^^^^^
llm-scaler-vllm | (APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
llm-scaler-vllm | (APIServer pid=1) return self._loop.run_until_complete(task)
llm-scaler-vllm | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
llm-scaler-vllm | (APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
llm-scaler-vllm | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper
llm-scaler-vllm | (APIServer pid=1) return await main
llm-scaler-vllm | (APIServer pid=1) ^^^^^^^^^^
llm-scaler-vllm | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/entrypoints/openai/api_server.py", line 1941, in run_server
llm-scaler-vllm | (APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
llm-scaler-vllm | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/entrypoints/openai/api_server.py", line 1961, in run_server_worker
llm-scaler-vllm | (APIServer pid=1) async with build_async_engine_client(
llm-scaler-vllm | (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
llm-scaler-vllm | (APIServer pid=1) return await anext(self.gen)
llm-scaler-vllm | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
llm-scaler-vllm | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/entrypoints/openai/api_server.py", line 179, in build_async_engine_client
llm-scaler-vllm | (APIServer pid=1) async with build_async_engine_client_from_engine_args(
llm-scaler-vllm | (APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
llm-scaler-vllm | (APIServer pid=1) return await anext(self.gen)
llm-scaler-vllm | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^
llm-scaler-vllm | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/entrypoints/openai/api_server.py", line 205, in build_async_engine_client_from_engine_args
llm-scaler-vllm | (APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context)
llm-scaler-vllm | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
llm-scaler-vllm | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/engine/arg_utils.py", line 1337, in create_engine_config
llm-scaler-vllm | (APIServer pid=1) speculative_config = self.create_speculative_config(
llm-scaler-vllm | (APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
llm-scaler-vllm | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/engine/arg_utils.py", line 1064, in create_speculative_config
llm-scaler-vllm | (APIServer pid=1) from vllm.transformers_utils.configs.speculators.base import (
llm-scaler-vllm | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/transformers_utils/configs/__init__.py", line 28, in <module>
llm-scaler-vllm | (APIServer pid=1) from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
llm-scaler-vllm | (APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm-0.10.3.dev0+g01efc7ef7.d20251125.xpu-py3.12-linux-x86_64.egg/vllm/transformers_utils/configs/qwen3_next.py", line 21, in <module>
llm-scaler-vllm | (APIServer pid=1) from transformers.modeling_rope_utils import rope_config_validation
llm-scaler-vllm | (APIServer pid=1) ImportError: cannot import name 'rope_config_validation' from 'transformers.modeling_rope_utils' (/usr/local/lib/python3.12/dist-packages/transformers/modeling_rope_utils.py)
llm-scaler-vllm exited with code 1 (restarting)
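So the bundled vLLM (0.10.3.dev) still does from transformers.modeling_rope_utils import rope_config_validation (via transformers_utils/configs/qwen3_next.py), and that symbol apparently no longer exists in the transformers==5.0.0rc0 the README tells me to install. The failure reproduces without starting the server at all (a minimal check against the built image, entrypoint overridden as before):

docker run --rm --entrypoint python3 llm-scaler-vllm:glm46v -c "from transformers.modeling_rope_utils import rope_config_validation"

which fails with the same ImportError. Either the image needs a vLLM build that works with the transformers 5.x API, or the README should pin a transformers release that still exports rope_config_validation; I am not sure which is intended.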