diff --git a/.gitignore b/.gitignore
index fd4dae7..c911dc1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,4 +15,5 @@ build/
 mpi_hosts
 run_mpirun.sh
 test.py
-test_mpi4.py
\ No newline at end of file
+test_mpi4.py
+src/mlx-clip
\ No newline at end of file
diff --git a/README.md b/README.md
index 79dfb33..6d180c4 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 
 1. install dependencies
 
-- for mlx (macos arm): `pip install -e ".[mlx]"`
+- for mlx (macos arm): `pip install -e ".[mlx]" && pip install -r requirements/mlx.txt`
 - for nvidia: `pip install -e ".[torch]"`
 
 2. run server
diff --git a/benchmarks/run_async_requests.py b/benchmarks/run_async_requests.py
index b2b5d23..1e1d6d2 100644
--- a/benchmarks/run_async_requests.py
+++ b/benchmarks/run_async_requests.py
@@ -90,5 +90,5 @@ async def main(messages_list: List[List[Dict[str, Any]]]):
 
 
 if __name__ == "__main__":
-    asyncio.run(main(llm_message()))
-    # asyncio.run(main(mllm_message()))
+    # asyncio.run(main(llm_message()))
+    asyncio.run(main(mllm_message()))
diff --git a/examples/run_engine.py b/examples/run_engine.py
index 9fea04b..42775ff 100644
--- a/examples/run_engine.py
+++ b/examples/run_engine.py
@@ -18,7 +18,7 @@ def parse_args():
         choices=["AUTO", "TORCH", "VLLM", "XFormers"],
         help="Attention backend if backend is TORCH",
     )
-    parser.add_argument("--model_path", type=str, default="mlx-community/Llama-3.2-1B-Instruct-4bit")
+    parser.add_argument("--model_path", type=str, default="Qwen/Qwen2-VL-2B-Instruct")
     return parser.parse_args()
 
 
@@ -103,43 +103,12 @@ def mllm_message():
 
 async def llm_generate(args, messages):
     engine = init_engine(args.model_path)
     await engine.start()
-    messages = [{"role": "user", "content": "Hello, how are you?"}]
-    # messages = [
-    #     {
-    #         "role": "system",
-    #         "content": "Given the following conversation, relevant context, and a follow up question, reply with an answer to the current question the user is asking. Return only your response to the question given the above information following the users instructions as needed.",
-    #     },
-    #     {"role": "user", "content": "hello"},
-    # ]
     openai_serving_chat = OpenAIServing(engine, args)
-    # for _ in range(3):
     request = ChatCompletionRequest(model="test", messages=messages, max_tokens=100)
     response = await openai_serving_chat.create_chat_completion(request, None)
     print(response)
-    messages = [
-        {"role": "user", "content": "Hello, how are you?"},
-        {
-            "role": "assistant",
-            "content": "Hello! I'm Qwen, a large language model created by Alibaba Cloud. I'm here to assist you with any questions or tasks you might have. How can I help you today?",
-        },
-        {"role": "user", "content": "今天天气怎么样?"},
-    ]
-    # messages = [
-    #     {
-    #         "role": "system",
-    #         "content": "Given the following conversation, relevant context, and a follow up question, reply with an answer to the current question the user is asking. Return only your response to the question given the above information following the users instructions as needed.",
-    #     },
-    #     {"role": "user", "content": "hello"},
-    #     {"role": "assistant", "content": "Hello! How can I assist you today?"},
-    #     {"role": "user", "content": "今年天气怎么样"},
-    # ]
-    for _ in range(3):
-        request = ChatCompletionRequest(model="test", messages=messages, max_tokens=100)
-        response = await openai_serving_chat.create_chat_completion(request, None)
-        print(response)
-
 
 
 async def image_generate(args):
     prompt = "germanic romanticism painting of an obscure winter forest in a geocore landscape. Ambient landscape lighting, heavy shading, crystal night sky, stunning stars, topography"
@@ -162,6 +131,6 @@ async def image_generate(args):
 
 if __name__ == "__main__":
     args = parse_args()
-    asyncio.run(llm_generate(args, llm_message()))
-    # asyncio.run(llm_generate(args, mllm_message()))
+    # asyncio.run(llm_generate(args, llm_message()))
+    asyncio.run(llm_generate(args, mllm_message()))
     # asyncio.run(image_generate(args))
diff --git a/examples/run_single_server.sh b/examples/run_single_server.sh
index e7b456c..95148f7 100644
--- a/examples/run_single_server.sh
+++ b/examples/run_single_server.sh
@@ -2,8 +2,8 @@
 MODEL_PATH=Qwen/Qwen2-VL-2B-Instruct
 # MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
 # MODEL_PATH=mlx-community/Qwen2.5-0.5B-Instruct-bf16
-MODEL_PATH=~/Documents/models--Qwen2.5-0.5B-Instruct-4bit
-MODEL_PATH=wnma3mz/DeepSeek-R1-Distill-Qwen-7B-4bit
-MODEL_PATH=~/Documents/DeepSeek-R1-Distill-Qwen-7B-4bit
+# MODEL_PATH=~/Documents/models--Qwen2.5-0.5B-Instruct-4bit
+# MODEL_PATH=wnma3mz/DeepSeek-R1-Distill-Qwen-7B-4bit
+# MODEL_PATH=~/Documents/DeepSeek-R1-Distill-Qwen-7B-4bit
 
 tllm.server --model_path $MODEL_PATH --is_local --hostname localhost --client_size 1
\ No newline at end of file
diff --git a/requirements/mlx.txt b/requirements/mlx.txt
index d747310..87a7e57 100644
--- a/requirements/mlx.txt
+++ b/requirements/mlx.txt
@@ -1,2 +1,3 @@
 mlx
-mlx_lm==0.19.2
\ No newline at end of file
+mlx_lm==0.19.2
+-e git+https://github.com/wnma3mz/mlx_clip.git#egg=mlx_clip
\ No newline at end of file
diff --git a/tllm/models/mlx/qwen_vl/qwen_vl.py b/tllm/models/mlx/qwen_vl.py
similarity index 51%
rename from tllm/models/mlx/qwen_vl/qwen_vl.py
rename to tllm/models/mlx/qwen_vl.py
index 82a44f9..4cff93f 100644
--- a/tllm/models/mlx/qwen_vl/qwen_vl.py
+++ b/tllm/models/mlx/qwen_vl.py
@@ -3,98 +3,12 @@
 import mlx.core as mx
 import mlx.nn as nn
 import numpy as np
-from transformers import AutoConfig, AutoProcessor
+from transformers import AutoProcessor
 
 from tllm import DTYPE
 from tllm.models.mlx.helper import quantization_func
-from tllm.models.mlx.qwen_vl.layers import (
-    PatchEmbed,
-    PatchMerger,
-    VisionMlp,
-    VisionRotaryEmbedding,
-    VisionSdpaAttention,
-)
-
-
-class Qwen2VLVisionBlock(nn.Module):
-    def __init__(self, config: AutoConfig) -> None:
-        super().__init__()
-        self.norm1 = nn.LayerNorm(config.embed_dim, eps=1e-6)
-        self.norm2 = nn.LayerNorm(config.embed_dim, eps=1e-6)
-        mlp_hidden_dim = int(config.embed_dim * config.mlp_ratio)
-
-        self.attn = VisionSdpaAttention(config.embed_dim, num_heads=config.num_heads)
-        self.mlp = VisionMlp(dim=config.embed_dim, hidden_dim=mlp_hidden_dim, hidden_act=config.hidden_act)
-
-    def __call__(self, hidden_states, cu_seqlens, rotary_pos_emb) -> mx.array:
-        hidden_states = hidden_states + self.attn(
-            self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
-        )
-        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
-        return hidden_states
-
-
-class Qwen2VisionModel(nn.Module):
-    def __init__(self, config) -> None:
-        super().__init__()
-        self.spatial_merge_size = config.spatial_merge_size
-
-        self.patch_embed = PatchEmbed(
-            patch_size=config.patch_size,
-            temporal_patch_size=config.temporal_patch_size,
-            in_channels=config.in_channels,
-            embed_dim=config.embed_dim,
-        )
-
-        head_dim = config.embed_dim // config.num_heads
-        self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
-        self.blocks = [Qwen2VLVisionBlock(config) for _ in range(config.depth)]
-        self.merger = PatchMerger(
-            dim=config.hidden_size, context_dim=config.embed_dim, spatial_merge_size=config.spatial_merge_size
-        )
-
-    def rot_pos_emb(self, grid_thw):
-        pos_ids = []
-
-        for thw in grid_thw:
-            t, h, w = thw.tolist()
-            hpos_ids = mx.repeat(mx.expand_dims(mx.arange(h), axis=1), w, axis=1)
-            hpos_ids = hpos_ids.reshape(
-                h // self.spatial_merge_size,
-                self.spatial_merge_size,
-                w // self.spatial_merge_size,
-                self.spatial_merge_size,
-            )
-            hpos_ids = hpos_ids.transpose(0, 2, 1, 3)
-            hpos_ids = hpos_ids.flatten()
-
-            wpos_ids = mx.repeat(mx.expand_dims(mx.arange(w), axis=0), h, axis=0)
-            wpos_ids = wpos_ids.reshape(
-                h // self.spatial_merge_size,
-                self.spatial_merge_size,
-                w // self.spatial_merge_size,
-                self.spatial_merge_size,
-            )
-            wpos_ids = wpos_ids.transpose(0, 2, 1, 3)
-            wpos_ids = wpos_ids.flatten()
-            pos_ids.append(mx.repeat(mx.stack([hpos_ids, wpos_ids], axis=-1), t, axis=1))
-        pos_ids = mx.concatenate(pos_ids, axis=0)
-        max_grid_size = grid_thw[:, 1:].max()
-        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
-        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
-        return rotary_pos_emb
-
-    def __call__(self, hidden_states: mx.array, grid_thw: mx.array) -> mx.array:
-        hidden_states = self.patch_embed(hidden_states)
-        rotary_pos_emb = self.rot_pos_emb(grid_thw)
-
-        repeated = mx.repeat(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0])
-        cu_seqlens = mx.cumsum(repeated)
-        cu_seqlens = mx.pad(cu_seqlens, pad_width=(1, 0)).tolist()
-        for blk in self.blocks:
-            hidden_states = blk(hidden_states.astype(DTYPE), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)
-        return self.merger(hidden_states)
+from mlx_clip.models.qwen2vision.qwen2vision_model import Qwen2VisionModel
 
 
 class MLXQwen2VLForConditionalGeneration(nn.Module):
     def __init__(self, config):
diff --git a/tllm/models/mlx/qwen_vl/clip.py b/tllm/models/mlx/qwen_vl/clip.py
deleted file mode 100644
index f69656b..0000000
--- a/tllm/models/mlx/qwen_vl/clip.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# coding: utf-8
-# Modified by https://github.com/ml-explore/mlx-examples/blob/main/clip/model.py
-
-from dataclasses import dataclass
-
-import mlx.core as mx
-import mlx.nn as nn
-
-from tllm.models.mlx.llama import Decoder
-from tllm.models.mlx.qwen_vl.layers import VisionEmbeddings
-
-# only vision model
-
-
-@dataclass
-class CLIPVisionConfig:
-    num_hidden_layers: int
-    hidden_size: int
-    intermediate_size: int
-    num_attention_heads: int
-    num_channels: int
-    image_size: int
-    patch_size: int
-    layer_norm_eps: float
-
-
-class ClipVisionModel(nn.Module):
-    """Implements the vision encoder transformer from CLIP."""
-
-    def __init__(
-        self,
-        config: CLIPVisionConfig,
-        first_num_layers: int = -1,
-    ):
-        super().__init__()
-        self.embeddings = VisionEmbeddings(config)
-        self.pre_layrnorm = nn.LayerNorm(config.hidden_size)
-        self.encoder = Decoder(config, 0, config.num_hidden_layers, False)
-        self.post_layernorm = nn.LayerNorm(config.hidden_size)
-
-    def __call__(
-        self,
-        x: mx.array,
-    ) -> mx.array:
-        x = self.embeddings(x)
-        x = self.pre_layrnorm(x)
-
-        for l in self.encoder.layers:
-            x = l(x, mask=None)
-
-        # Extract <CLS> token embedding
-        pooler_output = self.post_layernorm(x[:, 0, :])
-        return pooler_output
diff --git a/tllm/models/mlx/qwen_vl/layers.py b/tllm/models/mlx/qwen_vl/layers.py
deleted file mode 100644
index 36400a6..0000000
--- a/tllm/models/mlx/qwen_vl/layers.py
+++ /dev/null
@@ -1,173 +0,0 @@
-import math
-from typing import List
-
-import mlx.core as mx
-import mlx.nn as nn
-
-from tllm import DTYPE
-
-
-class VisionRotaryEmbedding(nn.Module):
-    def __init__(self, dim: int, theta: float = 10000.0) -> None:
-        super().__init__()
-        self._freqs = 1.0 / (theta ** (mx.arange(0, dim, 2) / dim))
-
-    def __call__(self, seqlen: int) -> mx.array:
-        seq = mx.arange(seqlen, dtype=self._freqs.dtype)
-        freqs = mx.outer(seq, self._freqs)
-        return freqs
-
-
-class VisionEmbeddings(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.embed_dim = config.hidden_size
-        self.image_size = config.image_size
-        self.patch_size = config.patch_size
-
-        self.class_embedding = mx.zeros((config.hidden_size,))
-
-        self.patch_embedding = nn.Conv2d(
-            in_channels=config.num_channels,
-            out_channels=self.embed_dim,
-            kernel_size=self.patch_size,
-            stride=self.patch_size,
-            bias=False,
-        )
-
-        self.num_patches = (self.image_size // self.patch_size) ** 2
-        self.num_positions = self.num_patches + 1
-        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-
-    def __call__(self, x: mx.array) -> mx.array:
-        batch_size = x.shape[0]
-        # Patchify using conv:
-        # [batch_size, sqrt(num_patches), sqrt(num_patches), embed_dim]
-        patch_embeddings = self.patch_embedding(x)
-        # [batch_size, num_patches, embed_dim]
-        patch_embeddings = mx.flatten(patch_embeddings, start_axis=1, end_axis=2)
-        embed_dim = patch_embeddings.shape[-1]
-        # Prepend <CLS> embeddings
-        # [batch_size, 1, embed_dim]
-        cls_embeddings = mx.broadcast_to(self.class_embedding, (batch_size, 1, embed_dim))
-        # [batch_size, num_patches + 1, embed_dim]
-        embeddings = mx.concatenate((cls_embeddings, patch_embeddings), axis=1)
-        # Add positional encoding
-        embeddings += self.position_embedding.weight
-        return embeddings
-
-
-class PatchEmbed(nn.Module):
-    def __init__(
-        self,
-        patch_size: int = 14,
-        temporal_patch_size: int = 2,
-        in_channels: int = 3,
-        embed_dim: int = 1152,
-    ) -> None:
-        super().__init__()
-        self.patch_size = patch_size
-        self.temporal_patch_size = temporal_patch_size
-        self.in_channels = in_channels
-        self.embed_dim = embed_dim
-
-        kernel_size = [temporal_patch_size, patch_size, patch_size]
-        self.proj = nn.Conv3d(
-            in_channels=in_channels, out_channels=embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False
-        )
-
-    def __call__(self, hidden_states: mx.array) -> mx.array:
-        hidden_states = hidden_states.reshape(
-            -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
-        )
-        # [out_ch, in_ch, n, h, w] -> [out_ch, n, h, w, in_ch]
-        hidden_states = mx.transpose(hidden_states, (0, 2, 3, 4, 1))
-        hidden_states = self.proj(hidden_states).reshape(-1, self.embed_dim)
-        return hidden_states
-
-
-class PatchMerger(nn.Module):
-    def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
-        super().__init__()
-        self.hidden_size = context_dim * (spatial_merge_size**2)
-        self.ln_q = nn.LayerNorm(context_dim, eps=1e-6)
-        self.mlp = [
-            nn.Linear(self.hidden_size, self.hidden_size),
-            nn.GELU(),
-            nn.Linear(self.hidden_size, dim),
-        ]
-
-    def __call__(self, x: mx.array) -> mx.array:
-        x = self.ln_q(x).reshape(-1, self.hidden_size)
-        for layer in self.mlp:
-            x = layer(x)
-        return x
-
-
-def QuickGELUActivation(input: mx.array) -> mx.array:
-    return input * mx.sigmoid(1.702 * input)
-
-
-class VisionMlp(nn.Module):
-    def __init__(self, dim: int, hidden_dim: int, hidden_act: str) -> None:
-        super().__init__()
-        self.fc1 = nn.Linear(dim, hidden_dim)
-        # self.act = nn.SiLU()
-        self.act = QuickGELUActivation  # for qwenvl
-        self.fc2 = nn.Linear(hidden_dim, dim)
-
-    def __call__(self, x) -> mx.array:
-        return self.fc2(self.act(self.fc1(x)))
-
-
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
-    return mx.concatenate((-x2, x1), axis=-1)
-
-
-def apply_rotary_pos_emb_vision(tensor: mx.array, freqs: mx.array) -> mx.array:
-    orig_dtype = tensor.dtype
-    # tensor = tensor.float()
-    cos = freqs.cos()
-    sin = freqs.sin()
-    cos = mx.expand_dims(mx.tile(mx.expand_dims(cos, axis=1), (1, 1, 2)), axis=0)
-    sin = mx.expand_dims(mx.tile(mx.expand_dims(sin, axis=1), (1, 1, 2)), axis=0)
-    output = (tensor * cos) + (rotate_half(tensor) * sin)
-    output = output.astype(orig_dtype)
-    return output
-
-
-class VisionSdpaAttention(nn.Module):
-    def __init__(self, dim: int, num_heads: int = 16) -> None:
-        super().__init__()
-        self.num_heads = num_heads
-        self.qkv = nn.Linear(dim, dim * 3, bias=True)
-        self.proj = nn.Linear(dim, dim)
-
-    def __call__(self, hidden_states: mx.array, cu_seqlens: List[int], rotary_pos_emb: mx.array = None) -> mx.array:
-        seq_length = hidden_states.shape[0]
-        q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).transpose(1, 0, 2, 3)
-        q = apply_rotary_pos_emb_vision(mx.expand_dims(q, axis=0), rotary_pos_emb)[0]
-        k = apply_rotary_pos_emb_vision(mx.expand_dims(k, axis=0), rotary_pos_emb)[0]
-
-        attention_mask = mx.zeros(shape=(1, seq_length, seq_length), dtype=mx.bool_)
-        for i in range(1, len(cu_seqlens)):
-            l, r = cu_seqlens[i - 1], cu_seqlens[i]
-            attention_mask[..., l:r, l:r] = True
-        attention_mask = mx.where(attention_mask, 0, -math.inf).astype(DTYPE)
-        q = q.transpose(1, 0, 2)
-        k = k.transpose(1, 0, 2)
-        v = v.transpose(1, 0, 2)
-        attn_output = mx.fast.scaled_dot_product_attention(
-            mx.expand_dims(q, axis=0),
-            mx.expand_dims(k, axis=0),
-            mx.expand_dims(v, axis=0),
-            scale=1 / math.sqrt(q.shape[-1]),
-            mask=attention_mask,
-        )[0]
-        attn_output = attn_output.transpose(1, 0, 2).reshape(seq_length, -1)
-        attn_output = self.proj(attn_output)
-        return attn_output
diff --git a/tllm/models/register.py b/tllm/models/register.py
index 12a7b65..ddc7f70 100644
--- a/tllm/models/register.py
+++ b/tllm/models/register.py
@@ -18,7 +18,7 @@ if BackendEnum.MLX == BACKEND:
     from tllm.models.mlx.llama import MLXLlamaForCausalLM, MLXLlamaModel
     from tllm.models.mlx.qwen import MLXQwen2ForCausalLM, MLXQwen2Model
-    from tllm.models.mlx.qwen_vl.qwen_vl import MLXQwen2VLForConditionalGeneration
+    from tllm.models.mlx.qwen_vl import MLXQwen2VLForConditionalGeneration
 
     MODEL_REGISTER.update({"LlamaForCausalLM": (MLXLlamaForCausalLM, MLXLlamaModel)})
     MODEL_REGISTER.update({"Qwen2ForCausalLM": (MLXQwen2ForCausalLM, MLXQwen2Model)})