move qwen2vision model to mlx_clip

wnma3mz · Jan 28, 2025 · d57e23e
1 parent 016f8a9
commit d57e23e
Show file tree

Hide file tree

Showing 10 changed files with 16 additions and 357 deletions.
diff --git a/.gitignore b/.gitignore
@@ -15,4 +15,5 @@ build/
 mpi_hosts
 run_mpirun.sh
 test.py
-test_mpi4.py
+test_mpi4.py
+src/mlx-clip
diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@
 
 1. install dependencies
 
-- for mlx (macos arm):   `pip install -e ".[mlx]"`
+- for mlx (macos arm):  `pip install -e ".[mlx]" && pip install -r requirements/mlx.txt`
 - for nvidia: `pip install -e ".[torch]"`
 
 2. run server

diff --git a/benchmarks/run_async_requests.py b/benchmarks/run_async_requests.py
@@ -90,5 +90,5 @@ async def main(messages_list: List[List[Dict[str, Any]]]):
 
 
 if __name__ == "__main__":
-    asyncio.run(main(llm_message()))
-    # asyncio.run(main(mllm_message()))
+    # asyncio.run(main(llm_message()))
+    asyncio.run(main(mllm_message()))
diff --git a/examples/run_engine.py b/examples/run_engine.py
@@ -18,7 +18,7 @@ def parse_args():
         choices=["AUTO", "TORCH", "VLLM", "XFormers"],
         help="Attention backend if backend is TORCH",
     )
-    parser.add_argument("--model_path", type=str, default="mlx-community/Llama-3.2-1B-Instruct-4bit")
+    parser.add_argument("--model_path", type=str, default="Qwen/Qwen2-VL-2B-Instruct")
     return parser.parse_args()
 
 
@@ -103,43 +103,12 @@ def mllm_message():
 async def llm_generate(args, messages):
     engine = init_engine(args.model_path)
     await engine.start()
-    messages = [{"role": "user", "content": "Hello, how are you?"}]
-    # messages = [
-    #     {
-    #         "role": "system",
-    #         "content": "Given the following conversation, relevant context, and a follow up question, reply with an answer to the current question the user is asking. Return only your response to the question given the above information following the users instructions as needed.",
-    #     },
-    #     {"role": "user", "content": "hello"},
-    # ]
     openai_serving_chat = OpenAIServing(engine, args)
 
-    # for _ in range(3):
     request = ChatCompletionRequest(model="test", messages=messages, max_tokens=100)
     response = await openai_serving_chat.create_chat_completion(request, None)
     print(response)
 
-    messages = [
-        {"role": "user", "content": "Hello, how are you?"},
-        {
-            "role": "assistant",
-            "content": "Hello! I'm Qwen, a large language model created by Alibaba Cloud. I'm here to assist you with any questions or tasks you might have. How can I help you today?",
-        },
-        {"role": "user", "content": "今天天气怎么样？"},
-    ]
-    # messages = [
-    #     {
-    #         "role": "system",
-    #         "content": "Given the following conversation, relevant context, and a follow up question, reply with an answer to the current question the user is asking. Return only your response to the question given the above information following the users instructions as needed.",
-    #     },
-    #     {"role": "user", "content": "hello"},
-    #     {"role": "assistant", "content": "Hello! How can I assist you today?"},
-    #     {"role": "user", "content": "今年天气怎么样"},
-    # ]
-    for _ in range(3):
-        request = ChatCompletionRequest(model="test", messages=messages, max_tokens=100)
-        response = await openai_serving_chat.create_chat_completion(request, None)
-        print(response)
-
 
 async def image_generate(args):
     prompt = "germanic romanticism painting of an obscure winter forest in a geocore landscape. Ambient landscape lighting, heavy shading, crystal night sky, stunning stars, topography"
@@ -162,6 +131,6 @@ async def image_generate(args):
 
 if __name__ == "__main__":
     args = parse_args()
-    asyncio.run(llm_generate(args, llm_message()))
-    # asyncio.run(llm_generate(args, mllm_message()))
+    # asyncio.run(llm_generate(args, llm_message()))
+    asyncio.run(llm_generate(args, mllm_message()))
     # asyncio.run(image_generate(args))
diff --git a/examples/run_single_server.sh b/examples/run_single_server.sh
@@ -2,8 +2,8 @@
 MODEL_PATH=Qwen/Qwen2-VL-2B-Instruct
 # MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
 # MODEL_PATH=mlx-community/Qwen2.5-0.5B-Instruct-bf16
-MODEL_PATH=~/Documents/models--Qwen2.5-0.5B-Instruct-4bit
-MODEL_PATH=wnma3mz/DeepSeek-R1-Distill-Qwen-7B-4bit
-MODEL_PATH=~/Documents/DeepSeek-R1-Distill-Qwen-7B-4bit
+# MODEL_PATH=~/Documents/models--Qwen2.5-0.5B-Instruct-4bit
+# MODEL_PATH=wnma3mz/DeepSeek-R1-Distill-Qwen-7B-4bit
+# MODEL_PATH=~/Documents/DeepSeek-R1-Distill-Qwen-7B-4bit
 
 tllm.server --model_path $MODEL_PATH --is_local --hostname localhost --client_size 1
diff --git a/requirements/mlx.txt b/requirements/mlx.txt
@@ -1,2 +1,3 @@
 mlx
-mlx_lm==0.19.2
+mlx_lm==0.19.2
+-e git+https://github.com/wnma3mz/mlx_clip.git#egg=mlx_clip
diff --git a/tllm/models/mlx/qwen_vl/qwen_vl.py → tllm/models/mlx/qwen_vl.py b/tllm/models/mlx/qwen_vl/qwen_vl.py → tllm/models/mlx/qwen_vl.py
@@ -3,98 +3,12 @@
 import mlx.core as mx
 import mlx.nn as nn
 import numpy as np
-from transformers import AutoConfig, AutoProcessor
+from transformers import AutoProcessor
 
 from tllm import DTYPE
 from tllm.models.mlx.helper import quantization_func
-from tllm.models.mlx.qwen_vl.layers import (
-    PatchEmbed,
-    PatchMerger,
-    VisionMlp,
-    VisionRotaryEmbedding,
-    VisionSdpaAttention,
-)
-
-
-class Qwen2VLVisionBlock(nn.Module):
-    def __init__(self, config: AutoConfig) -> None:
-        super().__init__()
-        self.norm1 = nn.LayerNorm(config.embed_dim, eps=1e-6)
-        self.norm2 = nn.LayerNorm(config.embed_dim, eps=1e-6)
-        mlp_hidden_dim = int(config.embed_dim * config.mlp_ratio)
-
-        self.attn = VisionSdpaAttention(config.embed_dim, num_heads=config.num_heads)
-        self.mlp = VisionMlp(dim=config.embed_dim, hidden_dim=mlp_hidden_dim, hidden_act=config.hidden_act)
-
-    def __call__(self, hidden_states, cu_seqlens, rotary_pos_emb) -> mx.array:
-        hidden_states = hidden_states + self.attn(
-            self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
-        )
-        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
-        return hidden_states
-
-
-class Qwen2VisionModel(nn.Module):
-    def __init__(self, config) -> None:
-        super().__init__()
-        self.spatial_merge_size = config.spatial_merge_size
-
-        self.patch_embed = PatchEmbed(
-            patch_size=config.patch_size,
-            temporal_patch_size=config.temporal_patch_size,
-            in_channels=config.in_channels,
-            embed_dim=config.embed_dim,
-        )
-
-        head_dim = config.embed_dim // config.num_heads
-        self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
-        self.blocks = [Qwen2VLVisionBlock(config) for _ in range(config.depth)]
-        self.merger = PatchMerger(
-            dim=config.hidden_size, context_dim=config.embed_dim, spatial_merge_size=config.spatial_merge_size
-        )
-
-    def rot_pos_emb(self, grid_thw):
-        pos_ids = []
-
-        for thw in grid_thw:
-            t, h, w = thw.tolist()
-            hpos_ids = mx.repeat(mx.expand_dims(mx.arange(h), axis=1), w, axis=1)
-            hpos_ids = hpos_ids.reshape(
-                h // self.spatial_merge_size,
-                self.spatial_merge_size,
-                w // self.spatial_merge_size,
-                self.spatial_merge_size,
-            )
-            hpos_ids = hpos_ids.transpose(0, 2, 1, 3)
-            hpos_ids = hpos_ids.flatten()
-
-            wpos_ids = mx.repeat(mx.expand_dims(mx.arange(w), axis=0), h, axis=0)
-            wpos_ids = wpos_ids.reshape(
-                h // self.spatial_merge_size,
-                self.spatial_merge_size,
-                w // self.spatial_merge_size,
-                self.spatial_merge_size,
-            )
-            wpos_ids = wpos_ids.transpose(0, 2, 1, 3)
-            wpos_ids = wpos_ids.flatten()
-            pos_ids.append(mx.repeat(mx.stack([hpos_ids, wpos_ids], axis=-1), t, axis=1))
-        pos_ids = mx.concatenate(pos_ids, axis=0)
-        max_grid_size = grid_thw[:, 1:].max()
-        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
-        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
-        return rotary_pos_emb
-
-    def __call__(self, hidden_states: mx.array, grid_thw: mx.array) -> mx.array:
-        hidden_states = self.patch_embed(hidden_states)
-        rotary_pos_emb = self.rot_pos_emb(grid_thw)
-
-        repeated = mx.repeat(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0])
-        cu_seqlens = mx.cumsum(repeated)
-        cu_seqlens = mx.pad(cu_seqlens, pad_width=(1, 0)).tolist()
-        for blk in self.blocks:
-            hidden_states = blk(hidden_states.astype(DTYPE), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)
-        return self.merger(hidden_states)
 
+from mlx_clip.models.qwen2vision.qwen2vision_model import Qwen2VisionModel
 
 class MLXQwen2VLForConditionalGeneration(nn.Module):
     def __init__(self, config):

diff --git a/tllm/models/mlx/qwen_vl/clip.py b/tllm/models/mlx/qwen_vl/clip.py