
Commit

add setup
wnma3mz committed Dec 21, 2024
1 parent de6cf52 commit d706016
Showing 20 changed files with 218 additions and 92 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -8,4 +8,6 @@ __pycache__
weights/
.DS_Store
*.png
*.pt
*.pt
build/
*.egg-info/
115 changes: 63 additions & 52 deletions README.md
@@ -4,45 +4,35 @@

### QuickStart

1. download model from: https://huggingface.co/mlx-community/Llama-3.2-1B-Instruct-bf16
1. install dependencies

2. install dependencies
- for mlx (macos arm): `pip install -e ".[mlx]"`
- for nvidia: `pip install -e ".[torch]"`

- for mlx: `pip install -r requirements-mlx.txt`
- for nvidia: `pip install -r requirements-cuda.txt`
- for intel: `pip install -r requirements.txt`
2. run server

3. run server
2.1 (no communication)

3.1 (no communication)
```bash
tllm.server --model_path mlx-community/Llama-3.2-1B-Instruct-4bit --is_local
```

- edit `examples/run_single_server.sh`
2.2 (with communication)

```bash
bash examples/run_single_server.sh
```
```bash
# first in one terminal
tllm.server --model_path mlx-community/Llama-3.2-1B-Instruct-4bit --hostname $YOUR_IP

3.2 (with communication)
# in another terminal
tllm.client --hostname $YOUR_IP
```
3. testing

- edit `examples/run_client.sh`

- edit `examples/run_server.sh`

```bash
# first in one terminal
bash examples/run_server.sh
# in another terminal
bash examples/run_client.sh
```

4. testing

```python
python benchmarks/run_async_requests.py
```bash
python3 benchmarks/run_async_requests.py
```
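
For reference, here is a minimal sketch of what such an async test could look like. It assumes the server is reachable on port 8022 (the port used in `examples/run_client.sh`) and exposes an OpenAI-compatible `/v1/chat/completions` route; both the port and the route are assumptions, and the actual `benchmarks/run_async_requests.py` may differ.

```python
# Hedged sketch: fire a few chat-completion requests concurrently against the
# running tllm.server. Endpoint path, port, and payload shape are assumptions.
import asyncio

import aiohttp

URL = "http://localhost:8022/v1/chat/completions"  # hypothetical endpoint


async def one_request(session: aiohttp.ClientSession, prompt: str) -> str:
    payload = {
        "model": "mlx-community/Llama-3.2-1B-Instruct-4bit",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 32,
    }
    async with session.post(URL, json=payload) as resp:
        data = await resp.json()
        return data["choices"][0]["message"]["content"]


async def main() -> None:
    prompts = ["Hello, how are you?", "Tell me a joke.", "What is MLX?"]
    async with aiohttp.ClientSession() as session:
        # Concurrent requests exercise the multi-request handling listed under Features.
        replies = await asyncio.gather(*(one_request(session, p) for p in prompts))
    for prompt, reply in zip(prompts, replies):
        print(prompt, "->", reply)


if __name__ == "__main__":
    asyncio.run(main())
```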

### Config
### More Details

In `examples/config.json`

@@ -69,34 +59,55 @@ In `examples/config.json`

### Features

- [x] Support Multi-Requests
- [x] Engine
- [x] mlx
- [x] torch
- [ ] tinygrad
- [ ] Multi-Request
- [ ] Jit
- [ ] Pipeline
- [x] Communication
- [x] grpc
- [x] Auto Find Node
- [x] Simple Get Ip
- [x] Test Ping
- [x] Attention
- [x] xformers
- [x] flash-attn
- [ ] PageAttention
- [X] Support Multi-Requests
- [X] Engine
- [X] mlx
- [X] torch
- [ ] tinygrad
- [ ] Multi-Request
- [ ] Jit
- [ ] Pipeline
- [X] Communication
- [X] grpc
- [X] Auto Find Node
- [X] Simple Get Ip
- [X] Test Ping
- [X] Attention
- [X] xformers
- [X] flash-attn
- [ ] PageAttention

### Performance

For 1b
On a Mac Mini M4:

- mac mini m2
![alt text](asserts/image.png)
| Device               | `mlx-community/Llama-3.2-1B-Instruct-4bit` | `mlx-community/Llama-3.2-1B-Instruct` | `mlx-community/Meta-Llama-3.1-8B-Instruct-4bit` |
| -------------------- | ------------------------------------------ | ------------------------------------- | ----------------------------------------------- |
| Mac Mini M4          | 98.10 tok/s                                | 35.45 tok/s                           | 20.68 tok/s                                     |
| Mac Mini M4 + M3 Pro |                                            |                                       |                                                 |

For `mlx-community/Llama-3.2-1B-Instruct-4bit`,

![1734779816425](image/README/1734779816425.png)

For `mlx-community/Llama-3.2-1B-Instruct`,

![1734779931105](image/README/1734779931105.png)

For `mlx-community/Meta-Llama-3.1-8B-Instruct-4bit`,

![1734779890405](image/README/1734779890405.png)

Old version:

For `mlx-community/Llama-3.2-1B-Instruct`

- mac mini m2
![alt text](asserts/image.png)
- m3 pro
![alt text](asserts/image-1.png)
![alt text](asserts/image-1.png)

for 8b
- m3 pro (layer=8) + mac mini m2 (layer=24)
![alt text](asserts/image-2.png)

- m3 pro (layer=8) + mac mini m2 (layer=24)
![alt text](asserts/image-2.png)
7 changes: 2 additions & 5 deletions examples/run_client.sh
@@ -2,8 +2,5 @@
# Address of the master (the node from which model allocation is requested)
MASTER_URL=http://mac-mini:8022

export OMP_NUM_THREADS=8;
export PYTHONPATH="./":$PYTHONPATH;

python3 -m tllm.entrypoints.handler.handler --master_addr $MASTER_URL --is_debug
# python3 -m tllm.entrypoints.handler.handler --master_addr $MASTER_URL --is_debug --config examples/config_one.json --client_idx 0
tllm.client --master_addr $MASTER_URL --is_debug
# tllm.client --master_addr $MASTER_URL --is_debug --config examples/config_one.json --client_idx 0
6 changes: 3 additions & 3 deletions examples/run_engine.py
@@ -28,9 +28,9 @@ def parse_args():

@dataclass
class Args:
# model_path: str = "/Users/lujianghu/Documents/Llama-3.2-3B-Instruct"
model_path: str = "/Users/lujianghu/Documents/Llama-3.2-1B-Instruct"
# model_path: str = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
model_path: str = "mlx-community/Llama-3.2-1B-Instruct-4bit"
# model_path: str = "mlx-community/Llama-3.2-1B-Instruct-4bit"
# model_path: str = "/Users/lujianghu/Documents/flux/schnell_4bit"
# model_path: str = "Qwen/Qwen2.5-0.5B-Instruct"
# model_path: str = "Qwen/Qwen2-VL-2B-Instruct"
@@ -77,7 +77,7 @@ async def llm_generate(args, messages):
messages = [{"role": "user", "content": "Hello, how are you?"}]
openai_serving_chat = OpenAIServing(engine, args)

request = ChatCompletionRequest(model="test", messages=messages)
request = ChatCompletionRequest(model="test", messages=messages, max_tokens=100)
response = await openai_serving_chat.create_chat_completion(request, None)
print(response)

6 changes: 2 additions & 4 deletions examples/run_server.sh
@@ -4,7 +4,5 @@
MODEL_PATH=Qwen/Qwen2-VL-2B-Instruct
MASTER_HOSTNAME=mac-mini

export PYTHONPATH="./":$PYTHONPATH;

python3 -m tllm.entrypoints.api_server --hostname $MASTER_HOSTNAME --model_path $MODEL_PATH --is_debug
# python3 -m tllm.entrypoints.api_server --hostname $MASTER_HOSTNAME --model_path $MODEL_PATH --is_debug --config examples/config_one.json
tllm.server --model_path $MODEL_PATH --hostname $MASTER_HOSTNAME --is_debug
# tllm.server --hostname $MASTER_HOSTNAME --model_path $MODEL_PATH --is_debug --config examples/config_one.json
6 changes: 1 addition & 5 deletions examples/run_single_server.sh
@@ -3,8 +3,4 @@ MODEL_PATH=/Users/lujianghu/Documents/Llama-3.2-1B-Instruct
# MODEL_PATH=Qwen/Qwen2-VL-2B-Instruct
# MODEL_PATH=mlx-community/Meta-Llama-3.1-8B-Instruct-4bit

export PYTHONPATH="./":$PYTHONPATH;

python3 -m tllm.entrypoints.api_server --model_path $MODEL_PATH --is_local --is_debug


tllm.server --model_path $MODEL_PATH --is_local --is_debug
5 changes: 1 addition & 4 deletions flux_examples/run_client.sh
@@ -2,7 +2,4 @@
# Address of the master (the node from which model allocation is requested)
MASTER_URL=http://mac-mini:8022

export OMP_NUM_THREADS=8;
export PYTHONPATH="./":$PYTHONPATH;

python3 -m tllm.entrypoints.handler.handler --master_addr $MASTER_URL --is_debug
tllm.client --master_addr $MASTER_URL --is_debug
4 changes: 1 addition & 3 deletions flux_examples/run_server.sh
@@ -3,6 +3,4 @@
MODEL_PATH=/Users/lujianghu/Documents/flux/schnell_4bit
MASTER_HOSTNAME=mac-mini

export PYTHONPATH="./":$PYTHONPATH;

python3 -m tllm.entrypoints.api_server --hostname $MASTER_HOSTNAME --model_path $MODEL_PATH --client_size 1 --is_debug
tllm.server --model_path $MODEL_PATH --hostname $MASTER_HOSTNAME --client_size 1 --is_debug
3 changes: 1 addition & 2 deletions flux_examples/run_single_server.sh
@@ -1,7 +1,6 @@
#!/bin/bash
MODEL_PATH=/Users/lujianghu/Documents/flux/schnell_4bit

export PYTHONPATH="./":$PYTHONPATH;
python3 -m tllm.entrypoints.api_server --model_path $MODEL_PATH --client_size 1 --is_local --is_debug --is_image
tllm.server --model_path $MODEL_PATH --client_size 1 --is_local --is_debug --is_image


Binary file added image/README/1734779816425.png
Binary file added image/README/1734779890405.png
Binary file added image/README/1734779931105.png
2 changes: 0 additions & 2 deletions requirements-mlx.txt
@@ -13,8 +13,6 @@ gradio
psutil
grpcio==1.68.1
lz4==4.3.3
mlx
mlx_lm==0.19.2
protobuf==5.28.3
pydantic==2.9.2
transformers==4.46.0
18 changes: 18 additions & 0 deletions requirements/base.txt
@@ -0,0 +1,18 @@
aiohttp
fastapi
numpy
requests
tabulate
tqdm
typing_extensions
uvicorn
websockets
pillow
huggingface_hub
psutil
gradio==5.4.0
grpcio==1.68.1
lz4==4.3.3
protobuf==5.28.3
pydantic==2.9.2
transformers==4.46.0
2 changes: 2 additions & 0 deletions requirements/mlx.txt
@@ -0,0 +1,2 @@
mlx
mlx_lm==0.19.2
1 change: 1 addition & 0 deletions requirements/torch.txt
@@ -0,0 +1 @@
vllm
68 changes: 68 additions & 0 deletions setup.py
@@ -0,0 +1,68 @@
from setuptools import find_packages, setup

# Base dependencies
install_requires = [
    "aiohttp",
    "fastapi",
    "numpy",
    "requests",
    "tabulate",
    "tqdm",
    "typing_extensions",
    "uvicorn",
    "websockets",
    "pillow",
    "huggingface_hub",
    "gradio",
    "psutil",
    "grpcio==1.68.1",
    "lz4==4.3.3",
    "protobuf==5.28.3",
    "pydantic==2.9.2",
    "transformers==4.46.0",
]

# Platform-specific dependencies
mlx_requires = ["mlx", "mlx_lm==0.19.2"]

tinygrad_requires = [
    "tinygrad",
]

torch_requires = [
    "vllm",
]

# Optional extras
extras_require = {
    "mlx": mlx_requires,
    # 'tinygrad': tinygrad_requires,
    "torch": torch_requires,
    "all": mlx_requires + torch_requires,  # install everything (may not work on every platform)
    "dev": [
        "black",
        "isort",
    ],
}

setup(
    name="tllm",
    version="0.1.0",
    packages=find_packages(),
    install_requires=install_requires,
    extras_require=extras_require,
    python_requires=">=3.9",  # minimum supported Python version
    classifiers=[
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
    ],
    entry_points={
        "console_scripts": [
            "tllm.server=tllm.entrypoints.api_server:main",
            "tllm.client=tllm.entrypoints.handler.handler:main",
        ],
    },
)
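
After installation (for example `pip install -e ".[mlx]"`), the two `console_scripts` above should be registered as entry points. Below is a small, hedged check of that wiring; it assumes Python 3.10+ for the `group=` keyword of `importlib.metadata.entry_points` and is not part of the repository.

```python
# Verify that the tllm.server / tllm.client console scripts declared in setup.py
# resolve to their target callables (requires Python >= 3.10).
from importlib.metadata import entry_points

for name in ("tllm.server", "tllm.client"):
    matches = [ep for ep in entry_points(group="console_scripts") if ep.name == name]
    print(name, "->", matches[0].value if matches else "not installed")
```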
7 changes: 6 additions & 1 deletion tllm/entrypoints/api_server.py
@@ -220,6 +220,11 @@ async def run_server(args) -> None:
await shutdown_task


if __name__ == "__main__":
def main():
global args
args = parse_master_args()
asyncio.run(run_server(args))


if __name__ == "__main__":
main()
6 changes: 5 additions & 1 deletion tllm/entrypoints/handler/handler.py
@@ -184,6 +184,10 @@ async def run(args):
await rpc_servicer.stop()


if __name__ == "__main__":
def main():
args = parse_handler_args()
asyncio.run(run(args))


if __name__ == "__main__":
main()
