Commit
bump up version and gate magic-wand version (#267)
1. Bump the version on main to 0.4.0.
2. Gate the nm-magic-wand version to ~=<nm-vllm version> for the release; nightly still installs the latest nm-magic-wand-nightly (see the specifier sketch below).
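
For reference, ~= is PEP 440's compatible-release operator: nm-magic-wand~=0.2.2 accepts any 0.2.x release at or above 0.2.2 but rejects 0.3.0. A minimal sketch (not part of this commit) using the packaging library to illustrate:

    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    # "~=0.2.2" is equivalent to ">=0.2.2, ==0.2.*"
    spec = SpecifierSet("~=0.2.2")

    for candidate in ["0.2.2", "0.2.9", "0.3.0"]:
        print(candidate, Version(candidate) in spec)
    # 0.2.2 True
    # 0.2.9 True
    # 0.3.0 False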

---------

Co-authored-by: dhuangnm <[email protected]>
dhuangnm authored May 29, 2024
1 parent f687019 commit 00bf0c8
Showing 2 changed files with 4 additions and 2 deletions.
4 changes: 3 additions & 1 deletion setup.py
@@ -415,7 +415,9 @@ def _read_requirements(filename: str) -> List[str]:
 _sparsity_deps = ["nm-magic-wand-nightly"]
 nm_release_type = os.getenv(NM_RELEASE_TYPE)
 if nm_release_type == 'RELEASE':
-    _sparsity_deps = ["nm-magic-wand"]
+    # gate magic-wand version in nm-vllm for release; for nightly, we always install the latest
+    magic_wand_version_dep = "0.2.2"
+    _sparsity_deps = [f"nm-magic-wand~={magic_wand_version_dep}"]
 
 package_data = {
     "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
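Taken together, the hunk switches the sparsity dependency on the NM_RELEASE_TYPE environment variable. A self-contained sketch of the pattern (names mirror the diff; this is an illustration, not the actual setup.py):

    import os

    MAGIC_WAND_VERSION = "0.2.2"

    def sparsity_deps() -> list:
        """Pin nm-magic-wand to the 0.2.x series for releases; track nightly otherwise."""
        if os.getenv("NM_RELEASE_TYPE") == "RELEASE":
            return [f"nm-magic-wand~={MAGIC_WAND_VERSION}"]
        return ["nm-magic-wand-nightly"]

    print(sparsity_deps())  # env var unset -> ['nm-magic-wand-nightly']
    os.environ["NM_RELEASE_TYPE"] = "RELEASE"
    print(sparsity_deps())  # -> ['nm-magic-wand~=0.2.2']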
2 changes: 1 addition & 1 deletion vllm/__init__.py
@@ -10,7 +10,7 @@
 from vllm.sampling_params import SamplingParams
 
 # UPSTREAM SYNC: use the current downstream.
-__version__ = "0.3.0"
+__version__ = "0.4.0"
 
 __all__ = [
     "LLM",
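A quick post-install sanity check for the bump (hypothetical, not part of the commit):

    import vllm

    # The commit sets __version__ to the downstream release number.
    assert vllm.__version__ == "0.4.0", vllm.__version__
    print("nm-vllm version:", vllm.__version__)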

4 comments on commit 00bf0c8

@github-actions
bigger_is_better

Benchmark suite: Current 00bf0c8 vs. Previous a6b9443

Configuration (both rows): VLLM Engine throughput - synthetic; model: NousResearch/Llama-2-7b-chat-hf; max_model_len: 4096; benchmark_throughput args: use-all-available-gpus, input-len 256, output-len 128, num-prompts 1000; GPU: NVIDIA A10G x 1; vllm_version: 0.4.0; python_version: 3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]; torch_version: 2.3.0+cu121

request_throughput: 3.8761328435196507 prompts/s
token_throughput: 1488.4350119115459 tokens/s

This comment was automatically generated by a workflow using github-action-benchmark.
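
The recorded configuration suggests the run maps onto vLLM's benchmark_throughput script; a hypothetical invocation assembled from the JSON above (the script path and exact flag spellings are assumptions, not verified against this repository):

    import subprocess

    # Flags mirror the "benchmark_throughput" block in the recorded config.
    subprocess.run(
        [
            "python", "benchmarks/benchmark_throughput.py",
            "--model", "NousResearch/Llama-2-7b-chat-hf",
            "--max-model-len", "4096",
            "--input-len", "256",
            "--output-len", "128",
            "--num-prompts", "1000",
        ],
        check=True,
    )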

@github-actions

bigger_is_better

Benchmark suite: Current 00bf0c8 vs. Previous a6b9443

Configuration (both rows): VLLM Engine throughput - synthetic; model: NousResearch/Llama-2-7b-chat-hf; max_model_len: 4096; benchmark_throughput args: use-all-available-gpus, input-len 256, output-len 128, num-prompts 1000; GPU: NVIDIA A10G x 1; vllm_version: 0.4.0; python_version: 3.9.17 (main, May 10 2024, 13:34:20) [GCC 9.4.0]; torch_version: 2.3.0+cu121

request_throughput: 3.7874083032081627 prompts/s
token_throughput: 1454.3647884319344 tokens/s

This comment was automatically generated by a workflow using github-action-benchmark.

@github-actions

bigger_is_better

Benchmark suite: Current 00bf0c8 vs. Previous a6b9443

Configuration (both rows): VLLM Engine throughput - synthetic; model: NousResearch/Llama-2-7b-chat-hf; max_model_len: 4096; benchmark_throughput args: use-all-available-gpus, input-len 256, output-len 128, num-prompts 1000; GPU: NVIDIA A10G x 1; vllm_version: 0.4.0; python_version: 3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]; torch_version: 2.3.0+cu121

request_throughput: 3.8324690592210886 prompts/s
token_throughput: 1471.668118740898 tokens/s

This comment was automatically generated by a workflow using github-action-benchmark.

@github-actions

bigger_is_better

Benchmark suite: Current 00bf0c8 vs. Previous a6b9443

Configuration (both rows): VLLM Engine throughput - synthetic; model: NousResearch/Llama-2-7b-chat-hf; max_model_len: 4096; benchmark_throughput args: use-all-available-gpus, input-len 256, output-len 128, num-prompts 1000; GPU: NVIDIA A10G x 1; vllm_version: 0.4.0; python_version: 3.8.17 (default, May 10 2024, 13:27:09) [GCC 9.4.0]; torch_version: 2.3.0+cu121

request_throughput: 3.797454689582896 prompts/s
token_throughput: 1458.222600799832 tokens/s

This comment was automatically generated by a workflow using github-action-benchmark.
