Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/parallax/server/cache_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def _calculate_linear_cache_bytes(self, dtype_size: int) -> int:
def _calculate_num_blocks(self, cache_memory_fraction: float, dtype: mx.Dtype) -> int:
device_info = mx.metal.device_info()
total_mem = device_info["max_recommended_working_set_size"]
current_mem = mx.metal.get_active_memory()
current_mem = mx.get_active_memory()
free_mem = total_mem - current_mem
available_for_cache = free_mem * cache_memory_fraction

Expand Down
2 changes: 1 addition & 1 deletion src/parallax/server/executor/base_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def __init__(
max_batch_size: Optional[int] = 8,
max_sequence_length: Optional[int] = None,
# Controlling prefill / decode ratio
max_num_tokens_per_batch: int = 1024,
max_num_tokens_per_batch: int = 16384,
prefill_priority: int = 0,
micro_batch_ratio: int = 2,
scheduler_wait_ms: int = 500,
Expand Down
4 changes: 2 additions & 2 deletions src/parallax/server/executor/mlx_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,15 @@ def __init__(
max_sequence_length: Optional[int] = None,
max_tokens_in_kv_pool: Optional[int] = None,
# Controlling prefill / decode ratio
max_num_tokens_per_batch: int = 1024,
max_num_tokens_per_batch: int = 16384,
prefill_priority: int = 0,
micro_batch_ratio: int = 2,
scheduler_wait_ms: int = 500,
request_timeout_s: Optional[int] = 600,
# Metrics Configs
layer_latency_update_every: int = 4096,
# KV Cache Configs
kv_block_size: int = 64,
kv_block_size: int = 32,
kv_cache_memory_fraction: float = 0.8,
enable_prefix_cache: Optional[bool] = False,
# Communication Configs
Expand Down
2 changes: 1 addition & 1 deletion src/parallax/server/executor/sglang_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def __init__(
max_sequence_length: Optional[int] = None,
max_tokens_in_kv_pool: Optional[int] = None,
# Controlling prefill / decode ratio
max_num_tokens_per_batch: int = 1024,
max_num_tokens_per_batch: int = 16384,
prefill_priority: int = 0,
micro_batch_ratio: int = 2,
scheduler_wait_ms: int = 500,
Expand Down
2 changes: 1 addition & 1 deletion src/parallax/server/executor/vllm_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(
max_sequence_length: Optional[int] = None,
max_tokens_in_kv_pool: Optional[int] = None,
# Controlling prefill / decode ratio
max_num_tokens_per_batch: int = 1024,
max_num_tokens_per_batch: int = 16384,
prefill_priority: int = 0,
micro_batch_ratio: int = 2,
scheduler_wait_ms: int = 500,
Expand Down
5 changes: 1 addition & 4 deletions src/parallax/server/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class Scheduler:
def __init__(
self,
max_batch_size: int = 16,
max_num_tokens_per_batch: int = 4096,
max_num_tokens_per_batch: int = 16384,
scheduler_wait_ms: int = 200,
micro_batch_ratio: int = 2,
is_first_peer: bool = False,
Expand Down Expand Up @@ -158,9 +158,6 @@ def evict_request(self, request_id: str):
except Exception:
pass
else:
logger.warning(
f"Attempted to evict non-existent request {request_id}. It might have been already evicted."
)
return

def cancel_request(self, request_id: str):
Expand Down
6 changes: 3 additions & 3 deletions src/parallax/server/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def parse_args() -> argparse.Namespace:
)

parser.add_argument(
"--kv-block-size", type=int, default=64, help="Block size for KV cache management"
"--kv-block-size", type=int, default=32, help="Block size for KV cache management"
)

parser.add_argument(
Expand All @@ -120,7 +120,7 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--max-num-tokens-per-batch",
type=int,
default=1024,
default=16384,
help="Maximum number of tokens in a batch",
)

Expand All @@ -133,7 +133,7 @@ def parse_args() -> argparse.Namespace:
)

parser.add_argument(
"--micro-batch-ratio", type=int, default=2, help="Micro batch ratio for scheduling"
"--micro-batch-ratio", type=int, default=1, help="Micro batch ratio for scheduling"
)

parser.add_argument(
Expand Down
2 changes: 1 addition & 1 deletion src/parallax/sglang/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def initialize_sgl_model_runner(
attention_backend: str,
kv_block_size: int,
moe_runner_backend: str,
max_num_tokens_per_batch: int = 1024,
max_num_tokens_per_batch: int = 16384,
enable_lora: Optional[bool] = False,
max_lora_rank: Optional[int] = None,
lora_target_modules: Optional[List[str]] = None,
Expand Down
2 changes: 1 addition & 1 deletion src/parallax/vllm/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ def initialize_vllm_model_runner(
kv_cache_memory_fraction: float,
attention_backend: str,
kv_block_size: int,
max_num_tokens_per_batch: int = 1024,
max_num_tokens_per_batch: int = 16384,
dtype: str = "float16",
**kwargs,
) -> Tuple[ParallaxVLLMModelRunner, Dict, Any]:
Expand Down