Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/parallax/server/cache_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def _calculate_linear_cache_bytes(self, dtype_size: int) -> int:
def _calculate_num_blocks(self, cache_memory_fraction: float, dtype: mx.Dtype) -> int:
device_info = mx.metal.device_info()
total_mem = device_info["max_recommended_working_set_size"]
current_mem = mx.metal.get_active_memory()
current_mem = mx.get_active_memory()
free_mem = total_mem - current_mem
available_for_cache = free_mem * cache_memory_fraction

Expand Down
2 changes: 1 addition & 1 deletion src/parallax/server/executor/base_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def __init__(
max_batch_size: Optional[int] = 8,
max_sequence_length: Optional[int] = None,
# Controlling prefill / decode ratio
max_num_tokens_per_batch: int = 1024,
max_num_tokens_per_batch: int = 16384,
prefill_priority: int = 0,
micro_batch_ratio: int = 2,
scheduler_wait_ms: int = 500,
Expand Down
4 changes: 2 additions & 2 deletions src/parallax/server/executor/mlx_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,15 @@ def __init__(
max_sequence_length: Optional[int] = None,
max_tokens_in_kv_pool: Optional[int] = None,
# Controlling prefill / decode ratio
max_num_tokens_per_batch: int = 1024,
max_num_tokens_per_batch: int = 16384,
prefill_priority: int = 0,
micro_batch_ratio: int = 2,
scheduler_wait_ms: int = 500,
request_timeout_s: Optional[int] = 600,
# Metrics Configs
layer_latency_update_every: int = 4096,
# KV Cache Configs
kv_block_size: int = 64,
kv_block_size: int = 32,
kv_cache_memory_fraction: float = 0.8,
enable_prefix_cache: Optional[bool] = False,
# Communication Configs
Expand Down
2 changes: 1 addition & 1 deletion src/parallax/server/executor/sglang_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def __init__(
max_sequence_length: Optional[int] = None,
max_tokens_in_kv_pool: Optional[int] = None,
# Controlling prefill / decode ratio
max_num_tokens_per_batch: int = 1024,
max_num_tokens_per_batch: int = 16384,
prefill_priority: int = 0,
micro_batch_ratio: int = 2,
scheduler_wait_ms: int = 500,
Expand Down
2 changes: 1 addition & 1 deletion src/parallax/server/executor/vllm_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(
max_sequence_length: Optional[int] = None,
max_tokens_in_kv_pool: Optional[int] = None,
# Controlling prefill / decode ratio
max_num_tokens_per_batch: int = 1024,
max_num_tokens_per_batch: int = 16384,
prefill_priority: int = 0,
micro_batch_ratio: int = 2,
scheduler_wait_ms: int = 500,
Expand Down
5 changes: 1 addition & 4 deletions src/parallax/server/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class Scheduler:
def __init__(
self,
max_batch_size: int = 16,
max_num_tokens_per_batch: int = 4096,
max_num_tokens_per_batch: int = 16384,
scheduler_wait_ms: int = 200,
micro_batch_ratio: int = 2,
is_first_peer: bool = False,
Expand Down Expand Up @@ -158,9 +158,6 @@ def evict_request(self, request_id: str):
except Exception:
pass
else:
logger.warning(
f"Attempted to evict non-existent request {request_id}. It might have been already evicted."
)
return

def cancel_request(self, request_id: str):
Expand Down
6 changes: 3 additions & 3 deletions src/parallax/server/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def parse_args() -> argparse.Namespace:
)

parser.add_argument(
"--kv-block-size", type=int, default=64, help="Block size for KV cache management"
"--kv-block-size", type=int, default=32, help="Block size for KV cache management"
)

parser.add_argument(
Expand All @@ -120,7 +120,7 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--max-num-tokens-per-batch",
type=int,
default=1024,
default=16384,
help="Maximum number of tokens in a batch",
)

Expand All @@ -133,7 +133,7 @@ def parse_args() -> argparse.Namespace:
)

parser.add_argument(
"--micro-batch-ratio", type=int, default=2, help="Micro batch ratio for scheduling"
"--micro-batch-ratio", type=int, default=1, help="Micro batch ratio for scheduling"
)

parser.add_argument(
Expand Down
2 changes: 1 addition & 1 deletion src/parallax/sglang/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def initialize_sgl_model_runner(
attention_backend: str,
kv_block_size: int,
moe_runner_backend: str,
max_num_tokens_per_batch: int = 1024,
max_num_tokens_per_batch: int = 16384,
enable_lora: Optional[bool] = False,
max_lora_rank: Optional[int] = None,
lora_target_modules: Optional[List[str]] = None,
Expand Down
2 changes: 1 addition & 1 deletion src/parallax/vllm/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ def initialize_vllm_model_runner(
kv_cache_memory_fraction: float,
attention_backend: str,
kv_block_size: int,
max_num_tokens_per_batch: int = 1024,
max_num_tokens_per_batch: int = 16384,
dtype: str = "float16",
**kwargs,
) -> Tuple[ParallaxVLLMModelRunner, Dict, Any]:
Expand Down