Skip to content

Commit

Permalink
move qwen2vision model to mlx_clip
Browse files Browse the repository at this point in the history
  • Loading branch information
wnma3mz committed Jan 28, 2025
1 parent 016f8a9 commit d57e23e
Show file tree
Hide file tree
Showing 10 changed files with 16 additions and 357 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ build/
mpi_hosts
run_mpirun.sh
test.py
test_mpi4.py
test_mpi4.py
src/mlx-clip
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

1. install dependencies

- for mlx (macos arm): `pip install -e ".[mlx]"`
- for mlx (macOS on Apple silicon): `pip install -e ".[mlx]" && pip install -r requirements/mlx.txt`
- for NVIDIA GPUs: `pip install -e ".[torch]"`

2. run server
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/run_async_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,5 +90,5 @@ async def main(messages_list: List[List[Dict[str, Any]]]):


if __name__ == "__main__":
asyncio.run(main(llm_message()))
# asyncio.run(main(mllm_message()))
# asyncio.run(main(llm_message()))
asyncio.run(main(mllm_message()))
37 changes: 3 additions & 34 deletions examples/run_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def parse_args():
choices=["AUTO", "TORCH", "VLLM", "XFormers"],
help="Attention backend if backend is TORCH",
)
parser.add_argument("--model_path", type=str, default="mlx-community/Llama-3.2-1B-Instruct-4bit")
parser.add_argument("--model_path", type=str, default="Qwen/Qwen2-VL-2B-Instruct")
return parser.parse_args()


Expand Down Expand Up @@ -103,43 +103,12 @@ def mllm_message():
async def llm_generate(args, messages):
engine = init_engine(args.model_path)
await engine.start()
messages = [{"role": "user", "content": "Hello, how are you?"}]
# messages = [
# {
# "role": "system",
# "content": "Given the following conversation, relevant context, and a follow up question, reply with an answer to the current question the user is asking. Return only your response to the question given the above information following the users instructions as needed.",
# },
# {"role": "user", "content": "hello"},
# ]
openai_serving_chat = OpenAIServing(engine, args)

# for _ in range(3):
request = ChatCompletionRequest(model="test", messages=messages, max_tokens=100)
response = await openai_serving_chat.create_chat_completion(request, None)
print(response)

messages = [
{"role": "user", "content": "Hello, how are you?"},
{
"role": "assistant",
"content": "Hello! I'm Qwen, a large language model created by Alibaba Cloud. I'm here to assist you with any questions or tasks you might have. How can I help you today?",
},
{"role": "user", "content": "今天天气怎么样?"},
]
# messages = [
# {
# "role": "system",
# "content": "Given the following conversation, relevant context, and a follow up question, reply with an answer to the current question the user is asking. Return only your response to the question given the above information following the users instructions as needed.",
# },
# {"role": "user", "content": "hello"},
# {"role": "assistant", "content": "Hello! How can I assist you today?"},
# {"role": "user", "content": "今年天气怎么样"},
# ]
for _ in range(3):
request = ChatCompletionRequest(model="test", messages=messages, max_tokens=100)
response = await openai_serving_chat.create_chat_completion(request, None)
print(response)


async def image_generate(args):
prompt = "germanic romanticism painting of an obscure winter forest in a geocore landscape. Ambient landscape lighting, heavy shading, crystal night sky, stunning stars, topography"
Expand All @@ -162,6 +131,6 @@ async def image_generate(args):

if __name__ == "__main__":
args = parse_args()
asyncio.run(llm_generate(args, llm_message()))
# asyncio.run(llm_generate(args, mllm_message()))
# asyncio.run(llm_generate(args, llm_message()))
asyncio.run(llm_generate(args, mllm_message()))
# asyncio.run(image_generate(args))
6 changes: 3 additions & 3 deletions examples/run_single_server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
MODEL_PATH=Qwen/Qwen2-VL-2B-Instruct
# MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
# MODEL_PATH=mlx-community/Qwen2.5-0.5B-Instruct-bf16
MODEL_PATH=~/Documents/models--Qwen2.5-0.5B-Instruct-4bit
MODEL_PATH=wnma3mz/DeepSeek-R1-Distill-Qwen-7B-4bit
MODEL_PATH=~/Documents/DeepSeek-R1-Distill-Qwen-7B-4bit
# MODEL_PATH=~/Documents/models--Qwen2.5-0.5B-Instruct-4bit
# MODEL_PATH=wnma3mz/DeepSeek-R1-Distill-Qwen-7B-4bit
# MODEL_PATH=~/Documents/DeepSeek-R1-Distill-Qwen-7B-4bit

tllm.server --model_path $MODEL_PATH --is_local --hostname localhost --client_size 1
3 changes: 2 additions & 1 deletion requirements/mlx.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
mlx
mlx_lm==0.19.2
mlx_lm==0.19.2
-e git+https://github.com/wnma3mz/mlx_clip.git#egg=mlx_clip
90 changes: 2 additions & 88 deletions tllm/models/mlx/qwen_vl/qwen_vl.py → tllm/models/mlx/qwen_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,98 +3,12 @@
import mlx.core as mx
import mlx.nn as nn
import numpy as np
from transformers import AutoConfig, AutoProcessor
from transformers import AutoProcessor

from tllm import DTYPE
from tllm.models.mlx.helper import quantization_func
from tllm.models.mlx.qwen_vl.layers import (
PatchEmbed,
PatchMerger,
VisionMlp,
VisionRotaryEmbedding,
VisionSdpaAttention,
)


class Qwen2VLVisionBlock(nn.Module):
    """One residual block of the vision tower: LayerNorm -> attention, LayerNorm -> MLP.

    Both sub-layers are applied in pre-norm form and added back onto the
    running hidden states.
    """

    def __init__(self, config: AutoConfig) -> None:
        """Build the two norms, the attention layer and the MLP from ``config``.

        Reads ``embed_dim``, ``mlp_ratio``, ``num_heads`` and ``hidden_act``
        from ``config``.
        """
        super().__init__()
        embed_dim = config.embed_dim
        self.norm1 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.norm2 = nn.LayerNorm(embed_dim, eps=1e-6)

        self.attn = VisionSdpaAttention(embed_dim, num_heads=config.num_heads)
        self.mlp = VisionMlp(
            dim=embed_dim,
            hidden_dim=int(embed_dim * config.mlp_ratio),
            hidden_act=config.hidden_act,
        )

    def __call__(self, hidden_states, cu_seqlens, rotary_pos_emb) -> mx.array:
        """Apply the attention and MLP sub-layers, each with a residual connection.

        ``cu_seqlens`` and ``rotary_pos_emb`` are forwarded unchanged to the
        attention layer.
        """
        attn_out = self.attn(
            self.norm1(hidden_states),
            cu_seqlens=cu_seqlens,
            rotary_pos_emb=rotary_pos_emb,
        )
        after_attn = hidden_states + attn_out
        return after_attn + self.mlp(self.norm2(after_attn))


class Qwen2VisionModel(nn.Module):
    """Qwen2-VL vision encoder: patch embedding, a stack of vision blocks, and a patch merger.

    ``config`` must provide ``spatial_merge_size``, ``patch_size``,
    ``temporal_patch_size``, ``in_channels``, ``embed_dim``, ``num_heads``,
    ``depth`` and ``hidden_size``.
    """

    def __init__(self, config) -> None:
        super().__init__()
        self.spatial_merge_size = config.spatial_merge_size

        self.patch_embed = PatchEmbed(
            patch_size=config.patch_size,
            temporal_patch_size=config.temporal_patch_size,
            in_channels=config.in_channels,
            embed_dim=config.embed_dim,
        )

        head_dim = config.embed_dim // config.num_heads
        self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
        self.blocks = [Qwen2VLVisionBlock(config) for _ in range(config.depth)]
        self.merger = PatchMerger(
            dim=config.hidden_size, context_dim=config.embed_dim, spatial_merge_size=config.spatial_merge_size
        )

    def _group_by_merge_window(self, ids, h, w):
        """Reorder an ``(h, w)`` index grid so every merge window is contiguous, then flatten it."""
        m = self.spatial_merge_size
        return ids.reshape(h // m, m, w // m, m).transpose(0, 2, 1, 3).flatten()

    def rot_pos_emb(self, grid_thw):
        """Return the rotary embedding for every patch described by ``grid_thw``.

        Each row of ``grid_thw`` is unpacked as ``(t, h, w)``. For every row a
        (row index, column index) pair is produced per spatial position,
        ordered by merge window, and that ``h * w`` sequence is repeated once
        per temporal step ``t``. The pairs index into a rotary table sized by
        the largest ``h``/``w`` found in ``grid_thw``.
        """
        pos_ids = []

        for thw in grid_thw:
            t, h, w = thw.tolist()
            # Row index of every cell in the h x w grid.
            hpos_ids = mx.repeat(mx.expand_dims(mx.arange(h), axis=1), w, axis=1)
            # Column index of every cell in the h x w grid.
            wpos_ids = mx.repeat(mx.expand_dims(mx.arange(w), axis=0), h, axis=0)

            hw_ids = mx.stack(
                [
                    self._group_by_merge_window(hpos_ids, h, w),
                    self._group_by_merge_window(wpos_ids, h, w),
                ],
                axis=-1,
            )
            # Tile the (h*w, 2) table along the rows so each temporal step gets
            # its own copy. Repeating along axis=1 (as before) would instead
            # widen the table to (h*w, 2*t) and break the lookup for t > 1.
            pos_ids.append(mx.tile(hw_ids, [t, 1]))
        pos_ids = mx.concatenate(pos_ids, axis=0)
        max_grid_size = grid_thw[:, 1:].max()
        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        return rotary_pos_emb

    def __call__(self, hidden_states: mx.array, grid_thw: mx.array) -> mx.array:
        """Embed the patches, run them through every vision block, and merge them.

        Returns the output of ``self.merger`` applied to the final hidden states.
        """
        hidden_states = self.patch_embed(hidden_states)
        rotary_pos_emb = self.rot_pos_emb(grid_thw)

        # Cumulative sequence boundaries: one segment of h*w patches per temporal step.
        # NOTE(review): `repeats` here is an array (grid_thw[:, 0]); confirm mx.repeat
        # accepts that when grid_thw has more than one row.
        repeated = mx.repeat(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0])
        cu_seqlens = mx.cumsum(repeated)
        # Prepend a leading 0 and hand the boundaries over as a plain Python list.
        cu_seqlens = mx.pad(cu_seqlens, pad_width=(1, 0)).tolist()
        for blk in self.blocks:
            # Inputs are cast to the project-wide DTYPE before every block.
            hidden_states = blk(hidden_states.astype(DTYPE), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)
        return self.merger(hidden_states)

from mlx_clip.models.qwen2vision.qwen2vision_model import Qwen2VisionModel

class MLXQwen2VLForConditionalGeneration(nn.Module):
def __init__(self, config):
Expand Down
53 changes: 0 additions & 53 deletions tllm/models/mlx/qwen_vl/clip.py

This file was deleted.

Loading

0 comments on commit d57e23e

Please sign in to comment.