diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..6841030cf --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..be422ca65 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 64, "num_warps": 4, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..6841030cf --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..be422ca65 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 64, "num_warps": 4, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..cdab6e86d --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 5}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 5}, "128": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 5}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..447a31028 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 8, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}, "128": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}, "256": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}, "128": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}, "256": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..cdab6e86d --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 5}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 5}, "128": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 5}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..447a31028 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=16,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 8, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}, "128": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}, "256": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}, "128": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}, "256": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..32f59a0f6 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 5}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 4}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 4}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 10}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..a2aae92fe --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 64, "num_warps": 8, "num_stages": 3}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 7}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..32f59a0f6 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 5}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 4}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 4}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 10}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..a2aae92fe --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 64, "num_warps": 8, "num_stages": 3}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 7}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..3c76df373 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 5}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 9}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..a107a0806 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 64, "num_warps": 8, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 11}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 3}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..3c76df373 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 5}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 3}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 9}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..a107a0806 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=2,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 64, "num_warps": 8, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 11}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 3}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..99e5b9abf --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 5}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 4}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..3254a4899 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 32, "num_warps": 4, "num_stages": 5}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..99e5b9abf --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 5}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 4}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "32": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 2}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..3254a4899 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 32, "num_warps": 4, "num_stages": 5}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..1cfd553d5 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 3}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..e2aff9029 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 64, "num_warps": 8, "num_stages": 11}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..1cfd553d5 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 3}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 2}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..e2aff9029 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=4,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 64, "num_warps": 8, "num_stages": 11}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..2bf2f830d --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 4}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 4}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..7a1bb8457 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 64, "num_warps": 8, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..2bf2f830d --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 4}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 4}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..7a1bb8457 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 64, "num_warps": 8, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..4bd46b275 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 3}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..2ccdd794f --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 64, "num_warps": 8, "num_stages": 2}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..4bd46b275 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 3}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..2ccdd794f --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=5,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 64, "num_warps": 8, "num_stages": 2}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..8125c287b --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 9}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 4}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 4}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 4}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..902b7c11e --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=4,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 64, "num_warps": 8, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..8125c287b --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 9}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 4}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 4}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 4}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..902b7c11e --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=4,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 64, "num_warps": 8, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 3}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 5}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..b248fd7d1 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..749e3d607 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=8,out_dtype=torch.bfloat16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 64, "num_warps": 4, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json new file mode 100644 index 000000000..b248fd7d1 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_4090_D.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 16, "num_warps": 4, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json new file mode 100644 index 000000000..749e3d607 --- /dev/null +++ b/lightllm/common/all_kernel_configs/_fwd_kernel_flash_decode_diverse_stage1:v1/{block_seq=256,gqa_group_size=8,max_batch_group_size=8,out_dtype=torch.float16,q_head_dim=128}_NVIDIA_GeForce_RTX_5090.json @@ -0,0 +1 @@ +{"4096": {"8": {"BLOCK_N": 64, "num_warps": 4, "num_stages": 4}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}, "8192": {"8": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "32": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "128": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}, "256": {"BLOCK_N": 16, "num_warps": 2, "num_stages": 1}}} \ No newline at end of file diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 613722787..1a2adcaa0 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -645,6 +645,7 @@ async def _wait_to_token_package( ) self.metric_client.histogram_observe("lightllm_request_generated_tokens", out_token_counter) self.metric_client.counter_inc("lightllm_request_success") + self.metric_client.histogram_observe("lightllm_request_mtp_avg_token_per_step", mtp_avg_token_per_step) return req_status.out_token_info_list.clear() diff --git a/lightllm/server/httpserver_for_pd_master/manager.py b/lightllm/server/httpserver_for_pd_master/manager.py index 4929a3a52..bc97df6c9 100644 --- a/lightllm/server/httpserver_for_pd_master/manager.py +++ b/lightllm/server/httpserver_for_pd_master/manager.py @@ -320,6 +320,7 @@ async def _wait_to_token_package( group_request_id = sampling_params.group_request_id unfinished_count = sampling_params.best_of is_first_token = True + sub_req_id_to_mtp_accepted_token_num: Dict[int, int] = {} client_mode: NodeRole = NodeRole(d_node.mode) @@ -333,6 +334,7 @@ async def _wait_to_token_package( prompt_tokens = metadata["prompt_tokens"] out_token_counter += 1 + sub_req_id_to_mtp_accepted_token_num[sub_req_id] = metadata.get("mtp_accepted_token_num", 0) if is_first_token: first_token_cost_ms = (time.time() - start_time) * 1000 is_first_token = False @@ -351,6 +353,9 @@ async def _wait_to_token_package( x_session_id = request.headers.get("X-Session-Id", "") prompt_cache_len = metadata.pop("prompt_cache_len", 0) prompt_cache_ratio = prompt_cache_len / prompt_tokens + mtp_avg_token_per_step = out_token_counter / max( + (out_token_counter - sum(sub_req_id_to_mtp_accepted_token_num.values())), 1 + ) format_start_time = datetime.datetime.fromtimestamp(start_time).strftime("%Y-%m-%d %H:%M:%S") logger.info( f"X-Request-Id:{x_request_id} " @@ -361,6 +366,7 @@ async def _wait_to_token_package( f"prompt_token_num:{prompt_tokens} " f"prompt_cache_len:{prompt_cache_len} " f"prompt_cache_ratio:{prompt_cache_ratio} " + f"mtp_avg_token_per_step:{mtp_avg_token_per_step} " ) self.metric_client.histogram_observe("lightllm_request_inference_duration", total_cost_time_ms / 1000.0) self.metric_client.histogram_observe( @@ -369,6 +375,7 @@ async def _wait_to_token_package( self.metric_client.histogram_observe("lightllm_request_first_token_duration", first_token_cost_ms / 1000.0) self.metric_client.histogram_observe("lightllm_request_generated_tokens", out_token_counter) self.metric_client.counter_inc("lightllm_request_success") + self.metric_client.histogram_observe("lightllm_request_mtp_avg_token_per_step", mtp_avg_token_per_step) return async def abort( diff --git a/lightllm/server/metrics/metrics.py b/lightllm/server/metrics/metrics.py index 8a2d7bc3e..4aa6c9d64 100644 --- a/lightllm/server/metrics/metrics.py +++ b/lightllm/server/metrics/metrics.py @@ -26,6 +26,7 @@ "lightllm_cache_length": "Length of tokens which hit prompt cache", "lightllm_cache_ratio": "cache length / input_length", "lightllm_batch_current_max_tokens": "dynamic max token used for current batch", + "lightllm_request_mtp_avg_token_per_step": "Average number of tokens per step", } @@ -93,6 +94,7 @@ def init_metrics(self, args): ratio_buckets = [(i + 1) / 10.0 for i in range(-1, 10)] self.create_histogram("lightllm_cache_ratio", ratio_buckets) + self.create_histogram("lightllm_request_mtp_avg_token_per_step", self.duration_buckets) def create_histogram(self, name, buckets, labelnames=None): if labelnames is None: