diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 1962bfd69e21..9fb6f973b22d 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,16 +37,16 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["vllm"] # By default we build both training and inference containers. Set true/false values to determine which to build. -build_training = true -build_inference = true +build_training = false +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = false [notify] ### Notify on test failures diff --git a/eks_infrastructure/rbac.yaml b/eks_infrastructure/rbac.yaml index f0ca6ebd7b1d..24050c05e94d 100644 --- a/eks_infrastructure/rbac.yaml +++ b/eks_infrastructure/rbac.yaml @@ -181,4 +181,80 @@ subjects: roleRef: kind: ClusterRole name: eks-cluster-role + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: vllm-role + namespace: vllm +rules: +- apiGroups: + - "" + resources: + - pods + - pods/log + - services + - secrets + - persistentvolumeclaims + verbs: + - get + - list + - create + - delete +- apiGroups: + - "leaderworkerset.x-k8s.io" + resources: + - leaderworkersets + verbs: + - get + - create + - delete +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: vllm-cluster-role +rules: +- apiGroups: + - "networking.k8s.io" + resources: + - ingresses + verbs: + - get + - create + - delete +- apiGroups: + - "storage.k8s.io" + resources: + - persistentvolumes + verbs: + - get + - create +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: vllm-role-binding + namespace: vllm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: vllm-role +subjects: +- apiGroup: rbac.authorization.k8s.io + kind: User + name: test-role +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: vllm-cluster-role-binding +subjects: +- kind: User + name: test-role + apiGroup: rbac.authorization.k8s.io +roleRef: + kind: ClusterRole + name: vllm-cluster-role apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/test/testrunner.py b/test/testrunner.py index bee1946f52f8..65670112f318 100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -311,7 +311,7 @@ def main(): if ( build_context == "MAINLINE" and all("base" in image_uri or "vllm" in image_uri for image_uri in all_image_list) - and test_type not in {"functionality_sanity", "security_sanity"} + and test_type not in {"functionality_sanity", "security_sanity", "eks", "ec2"} ): LOGGER.info( f"NOTE: {specific_test_type} tests not supported on base or vllm images. Skipping..." 
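The new vllm Role/ClusterRole pair above is bound to the test-role user, so a quick way to confirm the grants took effect is to probe them with kubectl auth can-i. Below is a minimal sketch, assuming kubeconfig already points at the cluster, the vllm namespace exists, and the caller has impersonation rights; it reuses the invoke.run helper the rest of this PR uses, and verify_vllm_rbac is a hypothetical name, not part of the PR:

from invoke import run

def verify_vllm_rbac(user="test-role", namespace="vllm"):
    # Spot-check a few of the verbs granted by vllm-role and vllm-cluster-role.
    checks = [
        f"kubectl auth can-i create pods -n {namespace} --as {user}",
        f"kubectl auth can-i create leaderworkersets.leaderworkerset.x-k8s.io -n {namespace} --as {user}",
        f"kubectl auth can-i create ingresses.networking.k8s.io --as {user}",
    ]
    results = [run(cmd, warn=True) for cmd in checks]
    return all(r.stdout.strip().startswith("yes") for r in results)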
diff --git a/test/vllm_tests/__init__.py b/test/vllm_tests/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/vllm_tests/infra/__init__.py b/test/vllm_tests/infra/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/vllm_tests/infra/eks_infra.py b/test/vllm_tests/infra/eks_infra.py new file mode 100644 index 000000000000..31ec14b54baa --- /dev/null +++ b/test/vllm_tests/infra/eks_infra.py @@ -0,0 +1,302 @@ +#!/usr/bin/env python3 + +import os +import sys +import time +import logging +import boto3 +from invoke import run +from .utils.fsx_utils import FsxSetup +from test.test_utils import eks as eks_utils + +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + + +class EksInfrastructure: + def __init__(self): + self.cluster_name = "vllm-cluster" + self.region = os.getenv("AWS_REGION", "us-west-2") + + def setup_infrastructure(self): + try: + logger.info("Starting EKS infrastructure setup...") + self.validate_required_tools() + self.create_eks_cluster() + self.validate_cluster_setup() + self.setup_fsx_lustre() + self.setup_load_balancer_controller() + logger.info("EKS infrastructure setup completed successfully") + return True + except Exception as e: + logger.error(f"Infrastructure setup failed: {e}") + self.cleanup_infrastructure() + return False + + def setup_eks_tools(self): + logger.info("Setting up EKS tools...") + eks_utils.eks_setup() + self.install_helm() + logger.info("EKS tools setup completed") + + def install_helm(self): + logger.info("Installing Helm...") + result = run("which helm", warn=True) + if result.return_code == 0: + logger.info("Helm already installed") + return + + run( + "curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3" + ) + run("chmod 700 get_helm.sh") + run("./get_helm.sh") + run("rm -f get_helm.sh") + + result = run("which helm", warn=True) + if result.return_code != 0: + raise Exception("Helm installation failed - helm not found in PATH") + + logger.info("Helm installed successfully") + + def validate_required_tools(self): + logger.info("Validating required tools...") + required_tools = ["aws", "eksctl", "kubectl", "helm", "curl", "jq"] + missing_tools = [] + + for tool in required_tools: + result = run(f"which {tool}", warn=True) + if result.return_code != 0: + missing_tools.append(tool) + logger.warning(f"{tool} not found") + else: + logger.info(f"{tool} found: {result.stdout.strip()}") + + if missing_tools: + logger.info(f"Installing missing tools: {', '.join(missing_tools)}") + self.setup_eks_tools() + logger.info("Tools installed successfully") + else: + logger.info("All required tools are available") + + def create_eks_cluster(self): + logger.info("Creating EKS cluster...") + + run(f"eksctl create cluster -f test/vllm_tests/test_artifacts/eks-cluster.yaml") + + run(f"eksctl create nodegroup -f test/vllm_tests/test_artifacts/large-model-nodegroup.yaml") + + eks_utils.eks_write_kubeconfig(self.cluster_name, self.region) + self.setup_iam_identity() + + result = run("kubectl get nodes") + assert "Ready" in result.stdout, "EKS nodes not ready" + logger.info("EKS cluster created successfully") + + def validate_cluster_setup(self): + logger.info("Validating cluster setup...") + + if not eks_utils.is_eks_cluster_active(self.cluster_name): + raise Exception(f"EKS cluster {self.cluster_name} is not active") + + # check NVIDIA device plugin pods + logger.info("Checking NVIDIA 
device plugin pods...") + result = run("kubectl get pods -n kube-system | grep nvidia") + + if "nvidia-device-plugin" not in result.stdout: + raise Exception("NVIDIA device plugin pods not found") + + # count running NVIDIA pods + nvidia_pods = [ + line + for line in result.stdout.split("\n") + if "nvidia-device-plugin" in line and "Running" in line + ] + logger.info(f"Found {len(nvidia_pods)} running NVIDIA device plugin pods") + + if not nvidia_pods: + raise Exception("No running NVIDIA device plugin pods found") + + # verify GPUs are available + result = run("kubectl get nodes -o json | jq '.items[].status.capacity.\"nvidia.com/gpu\"'") + gpu_counts = [ + line.strip().strip('"') + for line in result.stdout.split("\n") + if line.strip() and line.strip() != "null" + ] + + if not gpu_counts: + raise Exception("No GPUs found in cluster nodes") + + total_gpus = sum(int(count) for count in gpu_counts if count.isdigit()) + logger.info(f"Total GPUs available in cluster: {total_gpus}") + + if total_gpus == 0: + raise Exception("No GPUs available in cluster") + + logger.info("Cluster setup validation completed") + + def setup_fsx_lustre(self): + try: + logger.info("Setting up FSx Lustre filesystem...") + fsx = FsxSetup(self.region) + vpc_id = run( + f"aws eks describe-cluster --name {self.cluster_name} " + f"--query 'cluster.resourcesVpcConfig.vpcId' --output text" + ).stdout.strip() + logger.info(f"Using VPC: {vpc_id}") + + subnet_id = run( + f"aws eks describe-cluster --name {self.cluster_name} " + f"--query 'cluster.resourcesVpcConfig.subnetIds[0]' --output text" + ).stdout.strip() + logger.info(f"Using subnet: {subnet_id}") + + cluster_sg_id = run( + f"aws eks describe-cluster --name {self.cluster_name} " + f"--query 'cluster.resourcesVpcConfig.clusterSecurityGroupId' --output text" + ).stdout.strip() + logger.info(f"Using cluster security group: {cluster_sg_id}") + + sg_id = fsx.create_security_group( + vpc_id=vpc_id, name="fsx-lustre-sg", description="Security group for FSx Lustre" + ) + + fsx.add_security_group_ingress_rules( + security_group_id=sg_id, + ingress_rules=[ + {"protocol": "tcp", "port": "988-1023", "source-group": cluster_sg_id}, + {"protocol": "tcp", "port": "988-1023", "source-group": sg_id}, + ], + ) + + fs_info = fsx.create_fsx_filesystem( + subnet_id=subnet_id, + security_group_ids=[sg_id], + storage_capacity=1200, + deployment_type="SCRATCH_2", + tags={"Name": "vllm-model-storage"}, + ) + + fsx.setup_csi_driver() + + fsx.setup_kubernetes_resources( + storage_class_file="test/vllm_tests/test_artifacts/fsx-storage-class.yaml", + pv_file="test/vllm_tests/test_artifacts/fsx-lustre-pv.yaml", + pvc_file="test/vllm_tests/test_artifacts/fsx-lustre-pvc.yaml", + replacements={ + "": subnet_id, + "": sg_id, + "": fs_info["filesystem_id"], + ".fsx.us-west-2.amazonaws.com": fs_info["dns_name"], + "": fs_info["mount_name"], + }, + ) + + logger.info("FSx Lustre setup completed successfully") + + except Exception as e: + logger.error(f"FSx Lustre setup failed: {e}") + raise + + def setup_load_balancer_controller(self): + logger.info("Setting up AWS Load Balancer Controller...") + run("helm repo add eks https://aws.github.io/eks-charts") + run("helm repo update") + run( + "kubectl apply -f https://raw.githubusercontent.com/aws/eks-charts/master/stable/aws-load-balancer-controller/crds/crds.yaml" + ) + run( + f"helm install aws-load-balancer-controller eks/aws-load-balancer-controller -n kube-system --set clusterName={self.cluster_name} --set serviceAccount.create=false --set 
enableServiceMutatorWebhook=false" + ) + # install LeaderWorkerSet controller + run( + "helm install lws oci://registry.k8s.io/lws/charts/lws --version=0.6.1 --namespace lws-system --create-namespace --wait --timeout 300s" + ) + # wait for controllers to be ready + run( + "kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=aws-load-balancer-controller -n kube-system --timeout=300s" + ) + # setup sg for ALB + user_ip = run("curl -s https://checkip.amazonaws.com").stdout.strip() + vpc_id = run( + f"aws eks describe-cluster --name {self.cluster_name} --query 'cluster.resourcesVpcConfig.vpcId' --output text" + ).stdout.strip() + # create ALB sg + alb_sg = run( + f'aws ec2 create-security-group --group-name vllm-alb-sg --description "Security group for vLLM ALB" --vpc-id {vpc_id} --query "GroupId" --output text' + ).stdout.strip() + # allow inbound traffic on port 80 from user IP + run( + f"aws ec2 authorize-security-group-ingress --group-id {alb_sg} --protocol tcp --port 80 --cidr {user_ip}/32" + ) + # get node sg + node_instance_id = run( + 'aws ec2 describe-instances --filters "Name=tag:eks:nodegroup-name,Values=vllm-p4d-nodes-efa" --query "Reservations[0].Instances[0].InstanceId" --output text' + ).stdout.strip() + node_sg = run( + f"aws ec2 describe-instances --instance-ids {node_instance_id} --query 'Reservations[0].Instances[0].SecurityGroups[0].GroupId' --output text" + ).stdout.strip() + # allow traffic from ALB to nodes on port 8000 + run( + f"aws ec2 authorize-security-group-ingress --group-id {node_sg} --protocol tcp --port 8000 --source-group {alb_sg}" + ) + # update the sg in the ingress file + run( + f"sed -i 's||{alb_sg}|g' test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws-ingress.yaml" + ) + + # verify sg were created and configured correctly + logger.info("Verifying security group configurations...") + + # verify ALB sg + alb_sg_result = run( + f'aws ec2 describe-security-groups --group-ids {alb_sg} --query "SecurityGroups[0].IpPermissions"' + ) + if "80" not in alb_sg_result.stdout: + raise Exception("ALB security group not configured correctly - missing port 80 rule") + logger.info("ALB security group configured correctly") + + # verify node sg rules + node_sg_result = run( + f'aws ec2 describe-security-groups --group-ids {node_sg} --query "SecurityGroups[0].IpPermissions"' + ) + if "8000" not in node_sg_result.stdout: + raise Exception("Node security group not configured correctly - missing port 8000 rule") + + logger.info("Node security group configured correctly") + + logger.info("Load Balancer Controller setup and verification completed") + + def cleanup_resources(self): + logger.info("Running cleanup script...") + try: + script_path = "test/vllm_tests/infra/test_vllm_eks_cleanup.sh" + run(f"chmod +x {script_path}") + run(f"echo 'y' | {script_path}", check=False, timeout=3600) + logger.info("Cleanup completed successfully") + except Exception as e: + logger.error(f"Cleanup failed: {e}") + + def setup_iam_identity(self): + logger.info("Setting up IAM identity mapping...") + + try: + sts_client = boto3.client("sts") + identity = sts_client.get_caller_identity() + codebuild_role_arn = identity["Arn"] + + os.environ["EKS_TEST_ROLE"] = codebuild_role_arn + os.environ["AWS_REGION"] = self.region + + run(f"bash eks_infrastructure/add_iam_identity.sh {self.cluster_name}") + logger.info("IAM identity mapping completed successfully") + except Exception as e: + logger.error(f"Failed to setup IAM identity mapping: {e}") + raise + + def cleanup_infrastructure(self): 
+ try: + self.cleanup_resources() + except Exception as e: + logger.error(f"Infrastructure cleanup failed: {e}") diff --git a/test/vllm_tests/infra/test_vllm_eks_cleanup.sh b/test/vllm_tests/infra/test_vllm_eks_cleanup.sh new file mode 100755 index 000000000000..df06a7d2429b --- /dev/null +++ b/test/vllm_tests/infra/test_vllm_eks_cleanup.sh @@ -0,0 +1,325 @@ +#!/bin/bash +# Cleanup script for vLLM DeepSeek 32B deployment on EKS +# This script deletes all resources created for the vLLM deployment +# with appropriate wait times to ensure proper deletion + +set -e # Exit on error +set -o pipefail # Exit if any command in a pipe fails + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# AWS Profile to use +REGION="us-west-2" +CLUSTER_NAME="vllm-cluster" +NODEGROUP_NAME="vllm-p4d-nodes-efa" + +# Function to print section headers +print_section() { + echo -e "\n${BLUE}=== $1 ===${NC}" +} + +# Function to print success messages +print_success() { + echo -e "${GREEN}✓ $1${NC}" +} + +# Function to print warning messages +print_warning() { + echo -e "${YELLOW}⚠ $1${NC}" +} + +# Function to print error messages +print_error() { + echo -e "${RED}✗ $1${NC}" +} + +# Function to wait for a resource to be deleted +wait_for_deletion() { + local check_command="$1" + local resource_name="$2" + local timeout_seconds="$3" + local start_time=$(date +%s) + local end_time=$((start_time + timeout_seconds)) + + echo -e "${YELLOW}Waiting for $resource_name to be deleted (timeout: ${timeout_seconds}s)...${NC}" + + while true; do + if ! eval "$check_command" &>/dev/null; then + print_success "$resource_name deleted successfully" + return 0 + fi + + current_time=$(date +%s) + if [ $current_time -gt $end_time ]; then + print_warning "$resource_name deletion timed out after ${timeout_seconds}s" + return 1 + fi + + echo -n "." + sleep 10 + done +} + +# Function to check if a command exists +command_exists() { + command -v "$1" &> /dev/null +} + +# Check for required tools +for cmd in kubectl aws eksctl helm; do + if ! command_exists $cmd; then + print_error "Required command '$cmd' not found. Please install it and try again." + exit 1 + fi +done + +# Confirm with the user +echo -e "${RED}WARNING: This script will delete all resources related to the vLLM deployment.${NC}" +echo -e "${RED}This action is irreversible and will result in data loss.${NC}" +read -p "Are you sure you want to proceed? (y/N): " -n 1 -r +echo +if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo "Cleanup cancelled." + exit 0 +fi + +# Store security group IDs for later use +print_section "Retrieving security group IDs" +echo "Getting ALB security group ID..." +ALB_SG=$(kubectl get ingress vllm-deepseek-32b-lws-ingress -o jsonpath='{.metadata.annotations.alb\.ingress\.kubernetes\.io/security-groups}' 2>/dev/null || echo "") +if [ -z "$ALB_SG" ]; then + print_warning "Could not retrieve ALB security group ID from ingress. Will try to find it later." +fi + + + +echo "Getting FSx security group ID..." 
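+# The FSx security group is discovered indirectly: read the filesystem ID from the fsx-lustre-pv
+# volumeHandle, resolve the filesystem's first network interface, and take the first security
+# group attached to that interface.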
+FSX_ID=$(kubectl get pv fsx-lustre-pv -o jsonpath='{.spec.csi.volumeHandle}' 2>/dev/null | cut -d'/' -f1 || echo "") +if [ -n "$FSX_ID" ]; then + echo "Found FSx filesystem ID: $FSX_ID" + SG_ID=$(aws fsx describe-file-systems --file-system-id $FSX_ID --query "FileSystems[0].NetworkInterfaceIds[0]" --output text 2>/dev/null | xargs -I{} aws ec2 describe-network-interfaces --network-interface-ids {} --query "NetworkInterfaces[0].Groups[0].GroupId" --output text 2>/dev/null || echo "") + if [ -n "$SG_ID" ]; then + echo "Found FSx security group ID: $SG_ID" + else + print_warning "Could not retrieve FSx security group ID." + fi +else + print_warning "Could not retrieve FSx filesystem ID." +fi + +echo "Getting Node security group ID..." +NODE_SG=$(aws ec2 describe-security-groups --filters "Name=tag:aws:cloudformation:logical-id,Values=NodeSecurityGroup" "Name=tag:eks:cluster-name,Values=$CLUSTER_NAME" --query "SecurityGroups[0].GroupId" --output text 2>/dev/null || echo "") +if [ -n "$NODE_SG" ]; then + echo "Found Node security group ID: $NODE_SG" +else + print_warning "Could not retrieve Node security group ID." +fi + +echo "Getting VPC ID from the EKS cluster..." +VPC_ID=$(aws eks describe-cluster --name $CLUSTER_NAME --query "cluster.resourcesVpcConfig.vpcId" --output text 2>/dev/null || echo "") +if [ -n "$VPC_ID" ]; then + echo "Found VPC ID: $VPC_ID" + + +else + print_warning "Could not retrieve VPC ID from the EKS cluster." +fi + +# 1. Delete Kubernetes Resources +print_section "Deleting Kubernetes Resources" + +echo "Deleting vLLM ingress..." +kubectl delete -f test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws-ingress.yaml --ignore-not-found +print_success "Ingress deletion initiated" + +echo "Waiting 30 seconds for ingress controller to process deletion..." +sleep 30 + +echo "Deleting vLLM LeaderWorkerSet..." +kubectl delete -f test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws.yaml --ignore-not-found +print_success "LeaderWorkerSet deletion initiated" + +echo "Waiting 60 seconds for pods to terminate..." +sleep 60 + +echo "Deleting FSx Lustre PVC..." +kubectl delete -f test/vllm_tests/test_artifacts/fsx-lustre-pvc.yaml --ignore-not-found +print_success "PVC deletion initiated" + +echo "Waiting 10 seconds for PVC deletion to process..." +sleep 10 + +echo "Deleting FSx Lustre PV..." +kubectl delete -f test/vllm_tests/test_artifacts/fsx-lustre-pv.yaml --ignore-not-found +print_success "PV deletion initiated" + +echo "Waiting 10 seconds for PV deletion to process..." +sleep 10 + +echo "Deleting storage class..." +kubectl delete -f test/vllm_tests/test_artifacts/fsx-storage-class.yaml --ignore-not-found +print_success "Storage class deletion initiated" + +echo "Deleting AWS Load Balancer Controller..." +helm uninstall aws-load-balancer-controller -n kube-system --ignore-not-found +print_success "AWS Load Balancer Controller deletion initiated" + +echo "Waiting 60 seconds for controller termination..." +sleep 60 + +echo "Verifying all resources are deleted..." +kubectl get pods,svc,ingress,pv,pvc +print_success "Kubernetes resource deletion completed" + +# 2. 
Delete the IAM Service Account CloudFormation Stack +print_section "Deleting IAM Service Account CloudFormation Stack" + +STACK_NAME="eksctl-${CLUSTER_NAME}-addon-iamserviceaccount-kube-system-aws-load-balancer-controller" +echo "Deleting CloudFormation stack: $STACK_NAME" +aws cloudformation delete-stack --stack-name $STACK_NAME 2>/dev/null || true + +wait_for_deletion "aws cloudformation describe-stacks --stack-name $STACK_NAME" "IAM Service Account CloudFormation Stack" 300 +print_success "IAM Service Account CloudFormation Stack deletion completed" + +# 3. Delete the IAM Policy +print_section "Deleting IAM Policy" + +echo "Getting the ARN of the IAM policy..." +POLICY_ARN=$(aws iam list-policies --query "Policies[?PolicyName=='AWSLoadBalancerControllerIAMPolicy'].Arn" --output text) + +if [ -n "$POLICY_ARN" ] && [ "$POLICY_ARN" != "None" ]; then + echo "Deleting IAM policy: $POLICY_ARN" + aws iam delete-policy --policy-arn $POLICY_ARN + print_success "IAM policy deleted" +else + print_warning "IAM policy not found or already deleted" +fi + +# 4. Delete the FSx Lustre Filesystem +print_section "Deleting FSx Lustre Filesystem" + +if [ -n "$FSX_ID" ]; then + echo "Deleting FSx Lustre filesystem: $FSX_ID" + aws fsx delete-file-system --file-system-id $FSX_ID 2>/dev/null || true + + wait_for_deletion "aws fsx describe-file-systems --file-system-id $FSX_ID" "FSx Lustre filesystem" 600 + print_success "FSx Lustre filesystem deletion completed" +else + print_warning "FSx Lustre filesystem ID not found or already deleted" +fi + +# 5. Check for Any Remaining Load Balancers +print_section "Checking for Remaining Load Balancers" + +echo "Checking for ALBs and NLBs..." +aws elbv2 describe-load-balancers --query "LoadBalancers[?contains(DNSName, '${CLUSTER_NAME}')].LoadBalancerArn" --output text | while read -r lb_arn; do + if [ -n "$lb_arn" ]; then + echo "Deleting load balancer: $lb_arn" + aws elbv2 delete-load-balancer --load-balancer-arn $lb_arn + fi +done + +echo "Checking for Classic ELBs..." +aws elb describe-load-balancers --query "LoadBalancerDescriptions[?contains(DNSName, '${CLUSTER_NAME}')].LoadBalancerName" --output text | while read -r lb_name; do + if [ -n "$lb_name" ]; then + echo "Deleting classic load balancer: $lb_name" + aws elb delete-load-balancer --load-balancer-name $lb_name + fi +done + +print_success "Load balancer cleanup completed" + +# 6. Delete the Node Group +print_section "Deleting Node Group" + +# Check if node group exists before attempting to delete it +echo "Checking if node group exists: $NODEGROUP_NAME" +if eksctl get nodegroup --cluster=$CLUSTER_NAME --name=$NODEGROUP_NAME --region=$REGION &>/dev/null; then + echo "Node group exists. Deleting node group: $NODEGROUP_NAME" + eksctl delete nodegroup --cluster=$CLUSTER_NAME --name=$NODEGROUP_NAME --region=$REGION --drain=false + + wait_for_deletion "eksctl get nodegroup --cluster=$CLUSTER_NAME --name=$NODEGROUP_NAME --region=$REGION" "Node group" 1100 + print_success "Node group deletion completed" +else + print_warning "Node group $NODEGROUP_NAME not found or already deleted" +fi + +# 7. Delete the Security Groups +print_section "Deleting Security Groups" + +# Delete security groups in the recommended order: FSx SG -> Node SG -> ALB SG + +if [ -n "$SG_ID" ]; then + echo "Deleting FSx security group: $SG_ID" + aws ec2 delete-security-group --group-id $SG_ID 2>/dev/null || print_warning "Failed to delete FSx security group" + if [ $? 
-eq 0 ]; then + print_success "FSx security group deleted" + fi +else + print_warning "FSx security group ID not found or already deleted" +fi + +echo "Waiting 30 seconds after FSx security group deletion" +sleep 30 + +if [ -n "$NODE_SG" ]; then + echo "Deleting Node security group: $NODE_SG" + aws ec2 delete-security-group --group-id $NODE_SG 2>/dev/null || print_warning "Failed to delete Node security group" + if [ $? -eq 0 ]; then + print_success "Node security group deleted" + fi +else + print_warning "Node security group ID not found or already deleted" +fi + +echo "Waiting 30 seconds after Node security group deletion" +sleep 30 + + +if [ -n "$ALB_SG" ]; then + echo "Deleting ALB security group: $ALB_SG" + aws ec2 delete-security-group --group-id $ALB_SG 2>/dev/null || print_warning "Failed to delete ALB security group" + if [ $? -eq 0 ]; then + print_success "ALB security group deleted" + fi +else + print_warning "ALB security group ID not found or already deleted" +fi + +echo "Waiting 30 seconds after ALB security group deletion" +sleep 30 + +# 8. Delete the EKS Cluster +print_section "Deleting EKS Cluster" + +echo "Deleting EKS cluster: $CLUSTER_NAME" +eksctl delete cluster --name=$CLUSTER_NAME --region=$REGION + +wait_for_deletion "aws eks describe-cluster --name $CLUSTER_NAME" "EKS cluster" 1100 +print_success "EKS cluster deletion completed" + +# 9. Final Verification +print_section "Final Verification" + +echo "Checking for any remaining CloudFormation stacks..." +REMAINING_STACKS=$(aws cloudformation list-stacks --stack-status-filter CREATE_COMPLETE UPDATE_COMPLETE DELETE_FAILED --query "StackSummaries[?contains(StackName, '${CLUSTER_NAME}')].StackName" --output text) + +if [ -n "$REMAINING_STACKS" ]; then + print_warning "Some CloudFormation stacks still exist:" + echo "$REMAINING_STACKS" + echo + echo "You may need to manually delete these stacks or troubleshoot deletion failures." + echo "See the README.md section on 'Troubleshooting CloudFormation Stack Deletion Failures'." +else + print_success "No remaining CloudFormation stacks found" +fi + +print_section "Cleanup Complete" +echo "All resources related to the vLLM deployment have been deleted or cleanup has been initiated." +echo "Some AWS resources may still be in the process of being deleted." +echo "Please check the AWS Management Console to verify all resources have been properly cleaned up." \ No newline at end of file diff --git a/test/vllm_tests/infra/utils/__init__.py b/test/vllm_tests/infra/utils/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/vllm_tests/infra/utils/fsx_utils.py b/test/vllm_tests/infra/utils/fsx_utils.py new file mode 100644 index 000000000000..a92551d83a4b --- /dev/null +++ b/test/vllm_tests/infra/utils/fsx_utils.py @@ -0,0 +1,250 @@ +import logging +import time +import boto3 +from invoke import run +from typing import Dict, List, Any + +logger = logging.getLogger(__name__) + +class FsxSetup: + """ + A utility class for setting up and managing FSx for Lustre filesystems + and related AWS and Kubernetes resources. 
+ + : param region: AWS region where resources will be created (default: "us-west-2") + """ + def __init__(self, region: str = "us-west-2"): + self.region = region + self.fsx_client = boto3.client('fsx', region_name=region) + self.ec2_client = boto3.client('ec2', region_name=region) + + def create_fsx_filesystem( + self, + subnet_id: str, + security_group_ids: List[str], + storage_capacity: int, + deployment_type: str, + tags: Dict[str, str], + ): + """ + Create FSx Lustre filesystem with given configuration + : param subnet_id: subnet ID where FSx will be created + : param security_group_ids: list of security group IDs + : param storage_capacity: storage capacity in GiB + : param deployment_type: FSx deployment type + : param tags: dictionary of tags to apply to the FSx filesystem + : return: dictionary containing filesystem details + """ + try: + response = self.fsx_client.create_file_system( + FileSystemType='LUSTRE', + StorageCapacity=storage_capacity, + SubnetIds=[subnet_id], + SecurityGroupIds=security_group_ids, + LustreConfiguration={'DeploymentType': deployment_type}, + Tags=[{'Key': k, 'Value': v} for k, v in tags.items()] + ) + + filesystem_id = response['FileSystem']['FileSystemId'] + logger.info(f"Created FSx filesystem: {filesystem_id}") + + return self.wait_for_filesystem(filesystem_id) + + except Exception as e: + logger.error(f"Failed to create FSx filesystem: {e}") + raise + + def wait_for_filesystem(self, filesystem_id: str): + """ + Wait for FSx filesystem to become available and return its details + : param filesystem_id: FSx filesystem ID + : return: dictionary containing filesystem details (filesystem_id, dns_name, mount_name) + : raises: Exception if filesystem enters FAILED, DELETING, or DELETED state + """ + logger.info(f"Waiting for FSx filesystem {filesystem_id} to be available...") + + try: + waiter = self.fsx_client.get_waiter('file_system_available') + waiter.wait( + FileSystemIds=[filesystem_id], + WaiterConfig={'Delay': 30, 'MaxAttempts': 60} + ) + + # Get filesystem details + response = self.fsx_client.describe_file_systems( + FileSystemIds=[filesystem_id] + ) + filesystem = response['FileSystems'][0] + + return { + 'filesystem_id': filesystem_id, + 'dns_name': filesystem['DNSName'], + 'mount_name': filesystem['LustreConfiguration']['MountName'] + } + + except Exception as e: + logger.error(f"Error waiting for filesystem {filesystem_id}: {e}") + raise + + def create_security_group( + self, + vpc_id: str, + name: str, + description: str + ): + """ + Create a security group in the specified VPC + : param vpc_id: VPC ID where the security group will be created + : param name: name of the security group + : param description: description of the security group + : return: created security group ID + : raises: Exception if security group creation fails + """ + try: + response = self.ec2_client.create_security_group( + GroupName=name, + Description=description, + VpcId=vpc_id + ) + sg_id = response['GroupId'] + logger.info(f"Created security group: {sg_id}") + return sg_id + + except Exception as e: + logger.error(f"Failed to create security group: {e}") + raise + + def add_security_group_ingress_rules( + self, + security_group_id: str, + ingress_rules: List[Dict[str, Any]] + ): + """ + Add ingress rules to an existing security group + : param security_group_id: ID of the security group to modify + : param ingress_rules: list of dictionaries containing ingress rule configurations + Example: [{"protocol": "tcp", "port": "988-1023", "source-group": "sg-xxx"}] + : 
return: None + : raises: Exception if adding ingress rules fails + """ + try: + ip_permissions = [] + for rule in ingress_rules: + from_port, to_port = map(int, rule['port'].split('-')) + permission = { + 'IpProtocol': rule['protocol'], + 'FromPort': from_port, + 'ToPort': to_port, + 'UserIdGroupPairs': [{ + 'GroupId': rule['source-group'] + }] + } + ip_permissions.append(permission) + + self.ec2_client.authorize_security_group_ingress( + GroupId=security_group_id, + IpPermissions=ip_permissions + ) + + logger.info(f"Added ingress rules to security group: {security_group_id}") + + except Exception as e: + logger.error(f"Failed to add ingress rules to security group: {e}") + raise + + def setup_csi_driver(self): + """ + Install and configure the AWS FSx CSI Driver in the Kubernetes cluster + : return: None + : raises: Exception if driver installation or verification fails + """ + try: + logger.info("Installing AWS FSx CSI Driver...") + run("helm repo add aws-fsx-csi-driver https://kubernetes-sigs.github.io/aws-fsx-csi-driver/") + run("helm repo update") + run("helm install aws-fsx-csi-driver aws-fsx-csi-driver/aws-fsx-csi-driver --namespace kube-system") + run("kubectl wait --for=condition=ready pod -l app=fsx-csi-controller -n kube-system --timeout=300s") + + self._verify_csi_driver() + logger.info("FSx CSI Driver installed successfully") + except Exception as e: + logger.error(f"Failed to setup FSx CSI driver: {e}") + raise + + def _verify_csi_driver(self): + """ + Verify that FSx CSI driver pods are running correctly in the cluster + : return: None + : raises: Exception if driver pods are not found or not running + """ + result = run("kubectl get pods -n kube-system | grep fsx") + + if "fsx-csi-controller" not in result.stdout or "fsx-csi-node" not in result.stdout: + raise Exception("FSx CSI driver pods not found") + + fsx_pods = [ + line for line in result.stdout.split("\n") + if ("fsx-csi-controller" in line or "fsx-csi-node" in line) and "Running" in line + ] + + if not fsx_pods: + raise Exception("No running FSx CSI driver pods found") + + logger.info(f"Found {len(fsx_pods)} running FSx CSI driver pods") + + def setup_kubernetes_resources( + self, + storage_class_file: str, + pv_file: str, + pvc_file: str, + replacements: Dict[str, str] + ): + """ + Setup Kubernetes FSx resources using provided yaml files and replacements + : param storage_class_file: path to the storage class yaml file + : param pv_file: path to the persistent volume yaml file + : param pvc_file: path to the persistent volume claim yaml file + : param replacements: dictionary of placeholder replacements + Example: {"": "subnet-xxx", "": "sg-xxx"} + : return: None + : raises: Exception if resource creation fails + """ + try: + for file_path in [storage_class_file, pv_file, pvc_file]: + for key, value in replacements.items(): + run(f"sed -i 's|{key}|{value}|g' {file_path}") + + for file_path in [storage_class_file, pv_file, pvc_file]: + run(f"kubectl apply -f {file_path}") + + self.validate_kubernetes_resources() + + except Exception as e: + logger.error(f"Failed to setup Kubernetes FSx resources: {e}") + raise + + def validate_kubernetes_resources(self): + """ + Validate that FSx Kubernetes resources are properly created and bound + : return: True if all resources are validated successfully + : raises: Exception if any resource validation fails + """ + try: + sc_result = run("kubectl get sc fsx-sc") + if "fsx-sc" not in sc_result.stdout or "fsx.csi.aws.com" not in sc_result.stdout: + raise Exception("FSx storage 
class not created correctly") + + pv_result = run("kubectl get pv fsx-lustre-pv") + if "fsx-lustre-pv" not in pv_result.stdout or "Bound" not in pv_result.stdout: + raise Exception("FSx persistent volume not created correctly") + + pvc_result = run("kubectl get pvc fsx-lustre-pvc") + if "fsx-lustre-pvc" not in pvc_result.stdout or "Bound" not in pvc_result.stdout: + raise Exception("FSx persistent volume claim not created correctly") + + logger.info("FSx Kubernetes resources validated successfully") + return True + + except Exception as e: + logger.error(f"FSx resource validation failed: {e}") + raise \ No newline at end of file diff --git a/test/vllm_tests/test_artifacts/__init__.py b/test/vllm_tests/test_artifacts/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/vllm_tests/test_artifacts/eks-cluster.yaml b/test/vllm_tests/test_artifacts/eks-cluster.yaml new file mode 100644 index 000000000000..6e59c29bc804 --- /dev/null +++ b/test/vllm_tests/test_artifacts/eks-cluster.yaml @@ -0,0 +1,23 @@ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: vllm-cluster + region: us-west-2 + version: "1.31" # Latest stable EKS version + +# Enable CloudWatch logging +cloudWatch: + clusterLogging: + enableTypes: ["api", "audit", "authenticator", "controllerManager", "scheduler"] + +# Add-ons for the cluster +addons: + - name: vpc-cni + version: latest + - name: coredns + version: latest + - name: kube-proxy + version: latest + - name: aws-ebs-csi-driver + version: latest diff --git a/test/vllm_tests/test_artifacts/eks_test.py b/test/vllm_tests/test_artifacts/eks_test.py new file mode 100644 index 000000000000..2051eea40554 --- /dev/null +++ b/test/vllm_tests/test_artifacts/eks_test.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 + +import logging +import time +from invoke import run + +logger = logging.getLogger(__name__) + +class VllmEksTest: + def __init__(self): + pass + + + def run_tests(self): + try: + logger.info("Starting vLLM EKS integration tests...") + self.deploy_vllm_service() + self.test_vllm_api() + logger.info("All vLLM EKS tests completed successfully") + return True + except Exception as e: + logger.error(f"Test execution failed: {e}") + return False + + + def deploy_vllm_service(self): + logger.info("Deploying vLLM service...") + + self._wait_for_load_balancer_controller() + + logger.info("Applying vLLM LeaderWorkerSet configuration...") + run("kubectl apply -f test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws.yaml") + + logger.info("Applying vLLM ingress configuration...") + run("kubectl apply -f test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws-ingress.yaml") + + self._wait_for_vllm_pods() + + logger.info("vLLM service deployed successfully") + + + def _wait_for_load_balancer_controller(self): + logger.info("Waiting for AWS Load Balancer Controller to be ready...") + max_retries = 20 # 10 minutes total + retry_count = 0 + + while retry_count < max_retries: + result = run("kubectl get pods -n kube-system | grep aws-load-balancer-controller", warn=True) + if "aws-load-balancer-controller" in result.stdout: + all_alb_pods = [ + line for line in result.stdout.split("\n") + if "aws-load-balancer-controller" in line and line.strip() + ] + running_alb_pods = [ + line for line in all_alb_pods if "Running" in line + ] + if all_alb_pods and len(running_alb_pods) == len(all_alb_pods): + logger.info(f"All {len(running_alb_pods)} AWS Load Balancer Controller pods are running") + return + else: + logger.info(f"ALB controller pods: 
{len(running_alb_pods)}/{len(all_alb_pods)} running") + + retry_count += 1 + logger.info(f"ALB controller not ready yet, waiting... (attempt {retry_count}/{max_retries})") + time.sleep(30) + + raise Exception("AWS Load Balancer Controller pods failed to start after 10 minutes") + + + def _wait_for_vllm_pods(self): + logger.info("Waiting for vLLM pods to reach Running status...") + logger.info("This may take 15-30 minutes for container image pull and model loading") + + max_retries = 60 # 30 minutes total + retry_count = 0 + + while retry_count < max_retries: + result = run("kubectl get pods -l app=vllm-deepseek-32b-lws", warn=True) + if "vllm-deepseek-32b-lws" in result.stdout: + all_vllm_pods = [ + line for line in result.stdout.split("\n") + if "vllm-deepseek-32b-lws" in line and line.strip() and "NAME" not in line + ] + running_vllm_pods = [ + line for line in all_vllm_pods if "Running" in line + ] + if all_vllm_pods and len(running_vllm_pods) == len(all_vllm_pods): + logger.info(f"All {len(running_vllm_pods)} vLLM pods are running") + return + else: + statuses = [] + for line in all_vllm_pods: + parts = line.split() + if len(parts) >= 3: + pod_name = parts[0] + status = parts[2] + statuses.append(f"{pod_name}: {status}") + logger.info(f"vLLM pods status: {', '.join(statuses)}") + + retry_count += 1 + logger.info(f"vLLM pods not ready yet, waiting... (attempt {retry_count}/{max_retries})") + time.sleep(30) + + raise Exception("vLLM pods failed to reach Running status after 30 minutes") + + + def test_vllm_api(self): + logger.info("Testing vLLM API...") + endpoint = run( + "kubectl get ingress vllm-deepseek-32b-lws-ingress -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'" + ).stdout.strip() + logger.info(f"vLLM API endpoint: {endpoint}") + + if not endpoint: + raise Exception("Failed to get vLLM API endpoint from ingress") + + self._test_completions_api(endpoint) + self._test_chat_completions_api(endpoint) + logger.info("All vLLM API tests passed successfully") + + + def _test_completions_api(self, endpoint): + logger.info("Testing completions API...") + result = run( + f"""curl -X POST http://{endpoint}/v1/completions \ + -H "Content-Type: application/json" \ + -d '{{"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "prompt": "Hello, how are you?", "max_tokens": 50, "temperature": 0.7}}' + """ + ) + assert '"object":"text_completion"' in result.stdout, "vLLM completions API test failed" + logger.info("Completions API test passed") + + + def _test_chat_completions_api(self, endpoint): + logger.info("Testing chat completions API...") + result = run( + f"""curl -X POST http://{endpoint}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{{"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "messages": [{{"role": "user", "content": "What are the benefits of using FSx Lustre with EKS?"}}], "max_tokens": 100, "temperature": 0.7}}' + """ + ) + assert '"object":"chat.completion"' in result.stdout, "vLLM chat completions API test failed" + logger.info("Chat completions API test passed") \ No newline at end of file diff --git a/test/vllm_tests/test_artifacts/fsx-lustre-pv.yaml b/test/vllm_tests/test_artifacts/fsx-lustre-pv.yaml new file mode 100644 index 000000000000..d94ba75ad632 --- /dev/null +++ b/test/vllm_tests/test_artifacts/fsx-lustre-pv.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: fsx-lustre-pv +spec: + capacity: + storage: 1200Gi # Adjust based on your FSx Lustre filesystem size + volumeMode: Filesystem + accessModes: + - 
ReadWriteMany + persistentVolumeReclaimPolicy: Retain + storageClassName: fsx-sc + csi: + driver: fsx.csi.aws.com + volumeHandle: # FSx Lustre filesystem ID + volumeAttributes: + dnsname: .fsx.us-west-2.amazonaws.com # FSx Lustre DNS name + mountname: # The mount name of your FSx Lustre filesyst diff --git a/test/vllm_tests/test_artifacts/fsx-lustre-pvc.yaml b/test/vllm_tests/test_artifacts/fsx-lustre-pvc.yaml new file mode 100644 index 000000000000..f03c420f864c --- /dev/null +++ b/test/vllm_tests/test_artifacts/fsx-lustre-pvc.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: fsx-lustre-pvc +spec: + accessModes: + - ReadWriteMany + storageClassName: fsx-sc + resources: + requests: + storage: 1200Gi # Should match the PV capacity diff --git a/test/vllm_tests/test_artifacts/fsx-storage-class.yaml b/test/vllm_tests/test_artifacts/fsx-storage-class.yaml new file mode 100644 index 000000000000..31b81d8495ff --- /dev/null +++ b/test/vllm_tests/test_artifacts/fsx-storage-class.yaml @@ -0,0 +1,18 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: fsx-sc +provisioner: fsx.csi.aws.com +parameters: + subnetId: + securityGroupIds: + deploymentType: SCRATCH_2 + automaticBackupRetentionDays: "0" + dailyAutomaticBackupStartTime: "00:00" + copyTagsToBackups: "false" + perUnitStorageThroughput: "50" + dataCompressionType: "NONE" +reclaimPolicy: Retain +volumeBindingMode: Immediate +mountOptions: + - flock diff --git a/test/vllm_tests/test_artifacts/large-model-nodegroup.yaml b/test/vllm_tests/test_artifacts/large-model-nodegroup.yaml new file mode 100644 index 000000000000..093777fe6b50 --- /dev/null +++ b/test/vllm_tests/test_artifacts/large-model-nodegroup.yaml @@ -0,0 +1,53 @@ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: vllm-cluster + region: us-west-2 + +managedNodeGroups: + - name: vllm-p4d-nodes-efa + instanceType: p4d.24xlarge + minSize: 0 + maxSize: 2 + desiredCapacity: 2 + availabilityZones: ["us-west-2a"] # EFA-enabled nodegroups must have only one subnet or one availability zone + volumeSize: 100 + privateNetworking: true + # Use the EKS-optimized GPU AMI + ami: ami-01f1fc27c5979ac62 # Amazon EKS GPU node 1.31 (k8s: 1.31.7, containerd: 1.7.*) + amiFamily: AmazonLinux2 + labels: + role: large-model-worker + nvidia.com/gpu: "true" + k8s.amazonaws.com/accelerator: nvidia-gpu + aws.amazon.com/efa: "true" # Add EFA label + tags: + nodegroup-role: large-model-worker + iam: + withAddonPolicies: + autoScaler: true + albIngress: true + cloudWatch: true + ebs: true + imageBuilder: true + # Enable EFA interfaces + efaEnabled: true + # Override bootstrap command for custom AMI + overrideBootstrapCommand: | + #!/bin/bash + set -ex + + # Install EFA driver and related packages + curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz + tar -xf aws-efa-installer-latest.tar.gz + cd aws-efa-installer + ./efa_installer.sh -y + + # Configure NCCL to use EFA + echo "export FI_PROVIDER=efa" >> /etc/environment + echo "export FI_EFA_USE_DEVICE_RDMA=1" >> /etc/environment + echo "export NCCL_DEBUG=INFO" >> /etc/environment + + # Standard EKS bootstrap + /etc/eks/bootstrap.sh vllm-cluster --container-runtime containerd diff --git a/test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws-ingress.yaml b/test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws-ingress.yaml new file mode 100644 index 000000000000..5b5706eccd88 --- /dev/null +++ b/test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws-ingress.yaml @@ -0,0 
+1,28 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: vllm-deepseek-32b-lws-ingress + annotations: + # Use AWS Load Balancer Controller with ALB + alb.ingress.kubernetes.io/scheme: internet-facing + alb.ingress.kubernetes.io/target-type: ip + alb.ingress.kubernetes.io/security-groups: + alb.ingress.kubernetes.io/healthcheck-path: /health + alb.ingress.kubernetes.io/healthcheck-port: '8000' + alb.ingress.kubernetes.io/healthcheck-protocol: HTTP + alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}]' + alb.ingress.kubernetes.io/load-balancer-attributes: load_balancing.cross_zone.enabled=true + # Specify ALB class + kubernetes.io/ingress.class: alb +spec: + ingressClassName: alb + rules: + - http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: vllm-deepseek-32b-lws-leader + port: + number: 8000 \ No newline at end of file diff --git a/test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws.yaml b/test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws.yaml new file mode 100644 index 000000000000..d79afe1153af --- /dev/null +++ b/test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws.yaml @@ -0,0 +1,252 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: vllm-deepseek-32b-lws +spec: + replicas: 1 + leaderWorkerTemplate: + size: 2 # Total number of nodes (1 leader + 1 worker) + restartPolicy: RecreateGroupOnPodRestart + leaderTemplate: + metadata: + labels: + role: leader + spec: + containers: + - name: vllm-leader + image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.8.5-gpu-py312-ec2 + securityContext: + privileged: true + capabilities: + add: ["IPC_LOCK"] + env: + # Ray configuration + - name: RAY_DISABLE_RUNTIME_ENV + value: "1" + - name: RAY_SCHEDULER_EVENTS + value: "0" + - name: RAY_WORKER_REGISTER_TIMEOUT_SECONDS + value: "300" + # NCCL configuration for distributed training + - name: NCCL_DEBUG + value: "INFO" + - name: NCCL_IB_DISABLE + value: "1" + - name: NCCL_P2P_DISABLE + value: "1" + - name: NCCL_NET_GDR_LEVEL + value: "0" + - name: NCCL_SHM_DISABLE + value: "1" + # EFA-specific environment variables + - name: FI_PROVIDER + value: "efa" + - name: FI_EFA_USE_DEVICE_RDMA + value: "1" + - name: FI_EFA_FORK_SAFE + value: "1" + # Hugging Face configuration + - name: TRANSFORMERS_CACHE + value: "/mnt/fsx/models" + - name: HF_HOME + value: "/mnt/fsx/models" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: huggingface-token + key: token + optional: true + # Add host IP for Ray + - name: VLLM_HOST_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + command: ["/bin/bash"] + args: + - "-c" + - | + set -x + + # Start ray leader + ray start --head --port=6379 --num-cpus=48 --num-gpus=8 + sleep 10 + ray status + fi_info -p efa + + # Start vllm server + python -m vllm.entrypoints.openai.api_server \ + --model deepseek-ai/DeepSeek-R1-Distill-Qwen-32B \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size 8 \ + --pipeline-parallel-size 2 \ + --download-dir /mnt/fsx/models \ + --max-model-len 4096 \ + --gpu-memory-utilization 0.85 + resources: + limits: + nvidia.com/gpu: "8" + cpu: "48" + memory: "256Gi" + vpc.amazonaws.com/efa: 4 + requests: + nvidia.com/gpu: "8" + cpu: "48" + memory: "256Gi" + vpc.amazonaws.com/efa: 4 + ports: + - containerPort: 8000 + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 300 + periodSeconds: 30 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 10 + volumeMounts: + - name: fsx-lustre-volume + mountPath: /mnt/fsx 
+ # Mount the EFA devices + #- name: efa-devices + # mountPath: /dev/infiniband + # Mount a larger shared memory volume + - name: dshm + mountPath: /dev/shm + volumes: + - name: fsx-lustre-volume + persistentVolumeClaim: + claimName: fsx-lustre-pvc + # Add volume for EFA devices + #- name: efa-devices + # hostPath: + # path: /dev/infiniband + # Add a larger shared memory volume + - name: dshm + emptyDir: + medium: Memory + sizeLimit: "30Gi" # Increase shared memory size + nodeSelector: + role: large-model-worker + # Add tolerations for EFA + tolerations: + - key: "aws.amazon.com/efa" + operator: "Exists" + effect: "NoSchedule" + workerTemplate: + spec: + containers: + - name: vllm-worker + image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.8.5-gpu-py312-ec2 + securityContext: + privileged: true + capabilities: + add: ["IPC_LOCK"] + env: + # Ray configuration + - name: RAY_DISABLE_RUNTIME_ENV + value: "1" + - name: RAY_SCHEDULER_EVENTS + value: "0" + - name: RAY_WORKER_REGISTER_TIMEOUT_SECONDS + value: "300" + # NCCL configuration for distributed training + - name: NCCL_DEBUG + value: "INFO" + - name: NCCL_IB_DISABLE + value: "1" + - name: NCCL_P2P_DISABLE + value: "1" + - name: NCCL_NET_GDR_LEVEL + value: "0" + - name: NCCL_SHM_DISABLE + value: "1" + # EFA-specific environment variables + - name: FI_PROVIDER + value: "efa" + - name: FI_EFA_USE_DEVICE_RDMA + value: "1" + - name: FI_EFA_FORK_SAFE + value: "1" + # Hugging Face configuration + - name: TRANSFORMERS_CACHE + value: "/mnt/fsx/models" + - name: HF_HOME + value: "/mnt/fsx/models" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: huggingface-token + key: token + optional: true + # Add host IP for Ray + - name: VLLM_HOST_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + command: ["/bin/bash"] + args: + - "-c" + - | + set -x + + # Start ray worker + ray start --address=$(LWS_LEADER_ADDRESS):6379 --num-cpus=48 --num-gpus=8 --block + resources: + limits: + nvidia.com/gpu: "8" + cpu: "48" + memory: "256Gi" + vpc.amazonaws.com/efa: 4 + requests: + nvidia.com/gpu: "8" + cpu: "48" + memory: "256Gi" + vpc.amazonaws.com/efa: 4 + volumeMounts: + - name: fsx-lustre-volume + mountPath: /mnt/fsx + # Mount the EFA devices + #- name: efa-devices + # mountPath: /dev/infiniband + # Mount a larger shared memory volume + - name: dshm + mountPath: /dev/shm + volumes: + - name: fsx-lustre-volume + persistentVolumeClaim: + claimName: fsx-lustre-pvc + # Add volume for EFA devices + #- name: efa-devices + # hostPath: + # path: /dev/infiniband + # Add a larger shared memory volume + - name: dshm + emptyDir: + medium: Memory + sizeLimit: "30Gi" # Increase shared memory size + nodeSelector: + role: large-model-worker + # Add tolerations for EFA + tolerations: + - key: "aws.amazon.com/efa" + operator: "Exists" + effect: "NoSchedule" +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-deepseek-32b-lws-leader +spec: + ports: + - name: port-8000 + port: 8000 + targetPort: 8000 + - name: port-8265 + port: 8265 + targetPort: 8265 + type: ClusterIP + selector: + leaderworkerset.sigs.k8s.io/name: vllm-deepseek-32b-lws + role: leader \ No newline at end of file diff --git a/test/vllm_tests/vllm_test_trigger.py b/test/vllm_tests/vllm_test_trigger.py new file mode 100644 index 000000000000..8a8b33a7ad80 --- /dev/null +++ b/test/vllm_tests/vllm_test_trigger.py @@ -0,0 +1,82 @@ +import os, sys +import logging +from typing import List + +from test.test_utils import get_dlc_images +from test.vllm_tests.infra.eks_infra import 
EksInfrastructure +from test.vllm_tests.test_artifacts.eks_test import VllmEksTest + +LOGGER = logging.getLogger(__name__) +LOGGER.setLevel(logging.DEBUG) +LOGGER.addHandler(logging.StreamHandler(sys.stdout)) + + +def run_vllm_eks_test(): + infrastructure = None + try: + LOGGER.info("Setting up EKS infrastructure...") + infrastructure = EksInfrastructure() + if not infrastructure.setup_infrastructure(): + raise Exception("Infrastructure setup failed") + LOGGER.info("Infrastructure setup completed successfully") + + LOGGER.info("Starting vLLM tests...") + test = VllmEksTest() + if not test.run_tests(): + raise Exception("vLLM tests failed") + LOGGER.info("vLLM tests completed successfully") + return 0 + + except Exception as e: + LOGGER.error(f"Test execution failed: {e}") + return 1 + + finally: + if infrastructure: + LOGGER.info("Cleaning up infrastructure...") + infrastructure.cleanup_infrastructure() + LOGGER.info("Cleanup completed") + + +def run_platform_tests(platform: str, images: List[str], commit_id: str, ipv6_enabled: bool): + """ + Run tests for a specific platform + """ + LOGGER.info(f"Running {platform} tests") + if platform == "eks": + result = run_vllm_eks_test() + if result != 0: + raise Exception("vLLM EKS tests failed") + LOGGER.info("vLLM EKS tests completed successfully") + + +def main(): + LOGGER.info("Triggering test from vllm") + test_type = os.getenv("TEST_TYPE") + + LOGGER.info(f"TEST_TYPE: {test_type}") + + executor_mode = os.getenv("EXECUTOR_MODE", "False").lower() == "true" + dlc_images = os.getenv("DLC_IMAGE") if executor_mode else get_dlc_images() + + ipv6_enabled = os.getenv("ENABLE_IPV6_TESTING", "false").lower() == "true" + os.environ["ENABLE_IPV6_TESTING"] = "true" if ipv6_enabled else "false" + + commit_id = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", default="unrecognised_commit_id") + LOGGER.info(f"Commit ID: {commit_id}") + + LOGGER.info(f"Images tested: {dlc_images}") + all_image_list = dlc_images.split(" ") + standard_images_list = [image_uri for image_uri in all_image_list if "example" not in image_uri] + LOGGER.info(f"\nImages URIs:\n{standard_images_list}") + + run_platform_tests( + platform=test_type, + images=standard_images_list, + commit_id=commit_id, + ipv6_enabled=ipv6_enabled, + ) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/testspec.yml b/testspec.yml index 31abd98c35dd..fffb6658d4a4 100644 --- a/testspec.yml +++ b/testspec.yml @@ -23,7 +23,8 @@ phases: - pip install scheduler/. - echo Running pytest $TEST_TYPE tests on $DLC_IMAGES... - export PYTHONPATH=$PYTHONPATH:$(pwd)/src - - python test/testrunner.py + # - python test/testrunner.py + - python test/vllm_tests/vllm_test_trigger.py post_build: commands: - python src/send_status.py --status $CODEBUILD_BUILD_SUCCEEDING diff --git a/vllm/buildspec.yml b/vllm/buildspec.yml index 1de64837c3c0..1fbba0e18540 100644 --- a/vllm/buildspec.yml +++ b/vllm/buildspec.yml @@ -49,3 +49,4 @@ images: test_platforms: - sanity - security + - eks \ No newline at end of file
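With buildspec.yml now listing eks under test_platforms and testspec.yml invoking vllm_test_trigger.py, the trigger is driven entirely by environment variables. A minimal local sketch of that wiring follows, under these assumptions: the repository root is on PYTHONPATH as testspec.yml arranges, AWS credentials able to create the EKS cluster are available, and the image URI below is a placeholder rather than a real value:

import os
from test.vllm_tests import vllm_test_trigger

# Mirror the environment the CodeBuild testspec/buildspec changes rely on.
os.environ["TEST_TYPE"] = "eks"          # chosen via the new "eks" entry in vllm/buildspec.yml
os.environ["EXECUTOR_MODE"] = "True"     # read DLC_IMAGE directly instead of calling get_dlc_images()
os.environ["DLC_IMAGE"] = "<vllm-image-uri>"  # placeholder: the vLLM DLC image to exercise

# Provisions the EKS cluster, deploys the LeaderWorkerSet, runs the API tests, then cleans up.
vllm_test_trigger.main()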