diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 1962bfd69e21..9fb6f973b22d 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,16 +37,16 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["vllm"] # By default we build both training and inference containers. Set true/false values to determine which to build. -build_training = true -build_inference = true +build_training = false +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = false [notify] ### Notify on test failures diff --git a/eks_infrastructure/rbac.yaml b/eks_infrastructure/rbac.yaml index f0ca6ebd7b1d..24050c05e94d 100644 --- a/eks_infrastructure/rbac.yaml +++ b/eks_infrastructure/rbac.yaml @@ -181,4 +181,80 @@ subjects: roleRef: kind: ClusterRole name: eks-cluster-role + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: vllm-role + namespace: vllm +rules: +- apiGroups: + - "" + resources: + - pods + - pods/log + - services + - secrets + - persistentvolumeclaims + verbs: + - get + - list + - create + - delete +- apiGroups: + - "leaderworkerset.x-k8s.io" + resources: + - leaderworkersets + verbs: + - get + - create + - delete +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: vllm-cluster-role +rules: +- apiGroups: + - "networking.k8s.io" + resources: + - ingresses + verbs: + - get + - create + - delete +- apiGroups: + - "storage.k8s.io" + resources: + - persistentvolumes + verbs: + - get + - create +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: vllm-role-binding + namespace: vllm +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: vllm-role +subjects: +- apiGroup: rbac.authorization.k8s.io + kind: User + name: test-role +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: vllm-cluster-role-binding +subjects: +- kind: User + name: test-role + apiGroup: rbac.authorization.k8s.io +roleRef: + kind: ClusterRole + name: vllm-cluster-role apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/test/testrunner.py b/test/testrunner.py index bee1946f52f8..65670112f318 100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -311,7 +311,7 @@ def main(): if ( build_context == "MAINLINE" and all("base" in image_uri or "vllm" in image_uri for image_uri in all_image_list) - and test_type not in {"functionality_sanity", "security_sanity"} + and test_type not in {"functionality_sanity", "security_sanity", "eks", "ec2"} ): LOGGER.info( f"NOTE: {specific_test_type} tests not supported on base or vllm images. Skipping..." 
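The new vllm Role/ClusterRole pair above is bound to the test-role user, so a quick way to confirm the grants took effect is to probe them with kubectl auth can-i. Below is a minimal sketch, assuming kubeconfig already points at the cluster, the vllm namespace exists, and the caller has impersonation rights; it reuses the invoke.run helper the rest of this PR uses, and verify_vllm_rbac is a hypothetical name, not part of the PR:

from invoke import run

def verify_vllm_rbac(user="test-role", namespace="vllm"):
    # Spot-check a few of the verbs granted by vllm-role and vllm-cluster-role.
    checks = [
        f"kubectl auth can-i create pods -n {namespace} --as {user}",
        f"kubectl auth can-i create leaderworkersets.leaderworkerset.x-k8s.io -n {namespace} --as {user}",
        f"kubectl auth can-i create ingresses.networking.k8s.io --as {user}",
    ]
    results = [run(cmd, warn=True) for cmd in checks]
    return all(r.stdout.strip().startswith("yes") for r in results)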
diff --git a/test/vllm_tests/__init__.py b/test/vllm_tests/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/vllm_tests/infra/__init__.py b/test/vllm_tests/infra/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/vllm_tests/infra/eks_infra.py b/test/vllm_tests/infra/eks_infra.py new file mode 100644 index 000000000000..31ec14b54baa --- /dev/null +++ b/test/vllm_tests/infra/eks_infra.py @@ -0,0 +1,302 @@ +#!/usr/bin/env python3 + +import os +import sys +import time +import logging +import boto3 +from invoke import run +from .utils.fsx_utils import FsxSetup +from test.test_utils import eks as eks_utils + +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + + +class EksInfrastructure: + def __init__(self): + self.cluster_name = "vllm-cluster" + self.region = os.getenv("AWS_REGION", "us-west-2") + + def setup_infrastructure(self): + try: + logger.info("Starting EKS infrastructure setup...") + self.validate_required_tools() + self.create_eks_cluster() + self.validate_cluster_setup() + self.setup_fsx_lustre() + self.setup_load_balancer_controller() + logger.info("EKS infrastructure setup completed successfully") + return True + except Exception as e: + logger.error(f"Infrastructure setup failed: {e}") + self.cleanup_infrastructure() + return False + + def setup_eks_tools(self): + logger.info("Setting up EKS tools...") + eks_utils.eks_setup() + self.install_helm() + logger.info("EKS tools setup completed") + + def install_helm(self): + logger.info("Installing Helm...") + result = run("which helm", warn=True) + if result.return_code == 0: + logger.info("Helm already installed") + return + + run( + "curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3" + ) + run("chmod 700 get_helm.sh") + run("./get_helm.sh") + run("rm -f get_helm.sh") + + result = run("which helm", warn=True) + if result.return_code != 0: + raise Exception("Helm installation failed - helm not found in PATH") + + logger.info("Helm installed successfully") + + def validate_required_tools(self): + logger.info("Validating required tools...") + required_tools = ["aws", "eksctl", "kubectl", "helm", "curl", "jq"] + missing_tools = [] + + for tool in required_tools: + result = run(f"which {tool}", warn=True) + if result.return_code != 0: + missing_tools.append(tool) + logger.warning(f"{tool} not found") + else: + logger.info(f"{tool} found: {result.stdout.strip()}") + + if missing_tools: + logger.info(f"Installing missing tools: {', '.join(missing_tools)}") + self.setup_eks_tools() + logger.info("Tools installed successfully") + else: + logger.info("All required tools are available") + + def create_eks_cluster(self): + logger.info("Creating EKS cluster...") + + run(f"eksctl create cluster -f test/vllm_tests/test_artifacts/eks-cluster.yaml") + + run(f"eksctl create nodegroup -f test/vllm_tests/test_artifacts/large-model-nodegroup.yaml") + + eks_utils.eks_write_kubeconfig(self.cluster_name, self.region) + self.setup_iam_identity() + + result = run("kubectl get nodes") + assert "Ready" in result.stdout, "EKS nodes not ready" + logger.info("EKS cluster created successfully") + + def validate_cluster_setup(self): + logger.info("Validating cluster setup...") + + if not eks_utils.is_eks_cluster_active(self.cluster_name): + raise Exception(f"EKS cluster {self.cluster_name} is not active") + + # check NVIDIA device plugin pods + logger.info("Checking NVIDIA 
device plugin pods...") + result = run("kubectl get pods -n kube-system | grep nvidia") + + if "nvidia-device-plugin" not in result.stdout: + raise Exception("NVIDIA device plugin pods not found") + + # count running NVIDIA pods + nvidia_pods = [ + line + for line in result.stdout.split("\n") + if "nvidia-device-plugin" in line and "Running" in line + ] + logger.info(f"Found {len(nvidia_pods)} running NVIDIA device plugin pods") + + if not nvidia_pods: + raise Exception("No running NVIDIA device plugin pods found") + + # verify GPUs are available + result = run("kubectl get nodes -o json | jq '.items[].status.capacity.\"nvidia.com/gpu\"'") + gpu_counts = [ + line.strip().strip('"') + for line in result.stdout.split("\n") + if line.strip() and line.strip() != "null" + ] + + if not gpu_counts: + raise Exception("No GPUs found in cluster nodes") + + total_gpus = sum(int(count) for count in gpu_counts if count.isdigit()) + logger.info(f"Total GPUs available in cluster: {total_gpus}") + + if total_gpus == 0: + raise Exception("No GPUs available in cluster") + + logger.info("Cluster setup validation completed") + + def setup_fsx_lustre(self): + try: + logger.info("Setting up FSx Lustre filesystem...") + fsx = FsxSetup(self.region) + vpc_id = run( + f"aws eks describe-cluster --name {self.cluster_name} " + f"--query 'cluster.resourcesVpcConfig.vpcId' --output text" + ).stdout.strip() + logger.info(f"Using VPC: {vpc_id}") + + subnet_id = run( + f"aws eks describe-cluster --name {self.cluster_name} " + f"--query 'cluster.resourcesVpcConfig.subnetIds[0]' --output text" + ).stdout.strip() + logger.info(f"Using subnet: {subnet_id}") + + cluster_sg_id = run( + f"aws eks describe-cluster --name {self.cluster_name} " + f"--query 'cluster.resourcesVpcConfig.clusterSecurityGroupId' --output text" + ).stdout.strip() + logger.info(f"Using cluster security group: {cluster_sg_id}") + + sg_id = fsx.create_security_group( + vpc_id=vpc_id, name="fsx-lustre-sg", description="Security group for FSx Lustre" + ) + + fsx.add_security_group_ingress_rules( + security_group_id=sg_id, + ingress_rules=[ + {"protocol": "tcp", "port": "988-1023", "source-group": cluster_sg_id}, + {"protocol": "tcp", "port": "988-1023", "source-group": sg_id}, + ], + ) + + fs_info = fsx.create_fsx_filesystem( + subnet_id=subnet_id, + security_group_ids=[sg_id], + storage_capacity=1200, + deployment_type="SCRATCH_2", + tags={"Name": "vllm-model-storage"}, + ) + + fsx.setup_csi_driver() + + fsx.setup_kubernetes_resources( + storage_class_file="test/vllm_tests/test_artifacts/fsx-storage-class.yaml", + pv_file="test/vllm_tests/test_artifacts/fsx-lustre-pv.yaml", + pvc_file="test/vllm_tests/test_artifacts/fsx-lustre-pvc.yaml", + replacements={ + "": subnet_id, + "": sg_id, + "": fs_info["filesystem_id"], + ".fsx.us-west-2.amazonaws.com": fs_info["dns_name"], + "": fs_info["mount_name"], + }, + ) + + logger.info("FSx Lustre setup completed successfully") + + except Exception as e: + logger.error(f"FSx Lustre setup failed: {e}") + raise + + def setup_load_balancer_controller(self): + logger.info("Setting up AWS Load Balancer Controller...") + run("helm repo add eks https://aws.github.io/eks-charts") + run("helm repo update") + run( + "kubectl apply -f https://raw.githubusercontent.com/aws/eks-charts/master/stable/aws-load-balancer-controller/crds/crds.yaml" + ) + run( + f"helm install aws-load-balancer-controller eks/aws-load-balancer-controller -n kube-system --set clusterName={self.cluster_name} --set serviceAccount.create=false --set 
enableServiceMutatorWebhook=false" + ) + # install LeaderWorkerSet controller + run( + "helm install lws oci://registry.k8s.io/lws/charts/lws --version=0.6.1 --namespace lws-system --create-namespace --wait --timeout 300s" + ) + # wait for controllers to be ready + run( + "kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=aws-load-balancer-controller -n kube-system --timeout=300s" + ) + # setup sg for ALB + user_ip = run("curl -s https://checkip.amazonaws.com").stdout.strip() + vpc_id = run( + f"aws eks describe-cluster --name {self.cluster_name} --query 'cluster.resourcesVpcConfig.vpcId' --output text" + ).stdout.strip() + # create ALB sg + alb_sg = run( + f'aws ec2 create-security-group --group-name vllm-alb-sg --description "Security group for vLLM ALB" --vpc-id {vpc_id} --query "GroupId" --output text' + ).stdout.strip() + # allow inbound traffic on port 80 from user IP + run( + f"aws ec2 authorize-security-group-ingress --group-id {alb_sg} --protocol tcp --port 80 --cidr {user_ip}/32" + ) + # get node sg + node_instance_id = run( + 'aws ec2 describe-instances --filters "Name=tag:eks:nodegroup-name,Values=vllm-p4d-nodes-efa" --query "Reservations[0].Instances[0].InstanceId" --output text' + ).stdout.strip() + node_sg = run( + f"aws ec2 describe-instances --instance-ids {node_instance_id} --query 'Reservations[0].Instances[0].SecurityGroups[0].GroupId' --output text" + ).stdout.strip() + # allow traffic from ALB to nodes on port 8000 + run( + f"aws ec2 authorize-security-group-ingress --group-id {node_sg} --protocol tcp --port 8000 --source-group {alb_sg}" + ) + # update the sg in the ingress file + run( + f"sed -i 's||{alb_sg}|g' test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws-ingress.yaml" + ) + + # verify sg were created and configured correctly + logger.info("Verifying security group configurations...") + + # verify ALB sg + alb_sg_result = run( + f'aws ec2 describe-security-groups --group-ids {alb_sg} --query "SecurityGroups[0].IpPermissions"' + ) + if "80" not in alb_sg_result.stdout: + raise Exception("ALB security group not configured correctly - missing port 80 rule") + logger.info("ALB security group configured correctly") + + # verify node sg rules + node_sg_result = run( + f'aws ec2 describe-security-groups --group-ids {node_sg} --query "SecurityGroups[0].IpPermissions"' + ) + if "8000" not in node_sg_result.stdout: + raise Exception("Node security group not configured correctly - missing port 8000 rule") + + logger.info("Node security group configured correctly") + + logger.info("Load Balancer Controller setup and verification completed") + + def cleanup_resources(self): + logger.info("Running cleanup script...") + try: + script_path = "test/vllm_tests/infra/test_vllm_eks_cleanup.sh" + run(f"chmod +x {script_path}") + run(f"echo 'y' | {script_path}", check=False, timeout=3600) + logger.info("Cleanup completed successfully") + except Exception as e: + logger.error(f"Cleanup failed: {e}") + + def setup_iam_identity(self): + logger.info("Setting up IAM identity mapping...") + + try: + sts_client = boto3.client("sts") + identity = sts_client.get_caller_identity() + codebuild_role_arn = identity["Arn"] + + os.environ["EKS_TEST_ROLE"] = codebuild_role_arn + os.environ["AWS_REGION"] = self.region + + run(f"bash eks_infrastructure/add_iam_identity.sh {self.cluster_name}") + logger.info("IAM identity mapping completed successfully") + except Exception as e: + logger.error(f"Failed to setup IAM identity mapping: {e}") + raise + + def cleanup_infrastructure(self): 
+ try: + self.cleanup_resources() + except Exception as e: + logger.error(f"Infrastructure cleanup failed: {e}") diff --git a/test/vllm_tests/infra/test_vllm_eks_cleanup.sh b/test/vllm_tests/infra/test_vllm_eks_cleanup.sh new file mode 100755 index 000000000000..df06a7d2429b --- /dev/null +++ b/test/vllm_tests/infra/test_vllm_eks_cleanup.sh @@ -0,0 +1,325 @@ +#!/bin/bash +# Cleanup script for vLLM DeepSeek 32B deployment on EKS +# This script deletes all resources created for the vLLM deployment +# with appropriate wait times to ensure proper deletion + +set -e # Exit on error +set -o pipefail # Exit if any command in a pipe fails + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# AWS Profile to use +REGION="us-west-2" +CLUSTER_NAME="vllm-cluster" +NODEGROUP_NAME="vllm-p4d-nodes-efa" + +# Function to print section headers +print_section() { + echo -e "\n${BLUE}=== $1 ===${NC}" +} + +# Function to print success messages +print_success() { + echo -e "${GREEN}✓ $1${NC}" +} + +# Function to print warning messages +print_warning() { + echo -e "${YELLOW}⚠ $1${NC}" +} + +# Function to print error messages +print_error() { + echo -e "${RED}✗ $1${NC}" +} + +# Function to wait for a resource to be deleted +wait_for_deletion() { + local check_command="$1" + local resource_name="$2" + local timeout_seconds="$3" + local start_time=$(date +%s) + local end_time=$((start_time + timeout_seconds)) + + echo -e "${YELLOW}Waiting for $resource_name to be deleted (timeout: ${timeout_seconds}s)...${NC}" + + while true; do + if ! eval "$check_command" &>/dev/null; then + print_success "$resource_name deleted successfully" + return 0 + fi + + current_time=$(date +%s) + if [ $current_time -gt $end_time ]; then + print_warning "$resource_name deletion timed out after ${timeout_seconds}s" + return 1 + fi + + echo -n "." + sleep 10 + done +} + +# Function to check if a command exists +command_exists() { + command -v "$1" &> /dev/null +} + +# Check for required tools +for cmd in kubectl aws eksctl helm; do + if ! command_exists $cmd; then + print_error "Required command '$cmd' not found. Please install it and try again." + exit 1 + fi +done + +# Confirm with the user +echo -e "${RED}WARNING: This script will delete all resources related to the vLLM deployment.${NC}" +echo -e "${RED}This action is irreversible and will result in data loss.${NC}" +read -p "Are you sure you want to proceed? (y/N): " -n 1 -r +echo +if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo "Cleanup cancelled." + exit 0 +fi + +# Store security group IDs for later use +print_section "Retrieving security group IDs" +echo "Getting ALB security group ID..." +ALB_SG=$(kubectl get ingress vllm-deepseek-32b-lws-ingress -o jsonpath='{.metadata.annotations.alb\.ingress\.kubernetes\.io/security-groups}' 2>/dev/null || echo "") +if [ -z "$ALB_SG" ]; then + print_warning "Could not retrieve ALB security group ID from ingress. Will try to find it later." +fi + + + +echo "Getting FSx security group ID..." 
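+# The FSx security group is discovered indirectly: read the filesystem ID from the fsx-lustre-pv
+# volumeHandle, resolve the filesystem's first network interface, and take the first security
+# group attached to that interface.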
+FSX_ID=$(kubectl get pv fsx-lustre-pv -o jsonpath='{.spec.csi.volumeHandle}' 2>/dev/null | cut -d'/' -f1 || echo "") +if [ -n "$FSX_ID" ]; then + echo "Found FSx filesystem ID: $FSX_ID" + SG_ID=$(aws fsx describe-file-systems --file-system-id $FSX_ID --query "FileSystems[0].NetworkInterfaceIds[0]" --output text 2>/dev/null | xargs -I{} aws ec2 describe-network-interfaces --network-interface-ids {} --query "NetworkInterfaces[0].Groups[0].GroupId" --output text 2>/dev/null || echo "") + if [ -n "$SG_ID" ]; then + echo "Found FSx security group ID: $SG_ID" + else + print_warning "Could not retrieve FSx security group ID." + fi +else + print_warning "Could not retrieve FSx filesystem ID." +fi + +echo "Getting Node security group ID..." +NODE_SG=$(aws ec2 describe-security-groups --filters "Name=tag:aws:cloudformation:logical-id,Values=NodeSecurityGroup" "Name=tag:eks:cluster-name,Values=$CLUSTER_NAME" --query "SecurityGroups[0].GroupId" --output text 2>/dev/null || echo "") +if [ -n "$NODE_SG" ]; then + echo "Found Node security group ID: $NODE_SG" +else + print_warning "Could not retrieve Node security group ID." +fi + +echo "Getting VPC ID from the EKS cluster..." +VPC_ID=$(aws eks describe-cluster --name $CLUSTER_NAME --query "cluster.resourcesVpcConfig.vpcId" --output text 2>/dev/null || echo "") +if [ -n "$VPC_ID" ]; then + echo "Found VPC ID: $VPC_ID" + + +else + print_warning "Could not retrieve VPC ID from the EKS cluster." +fi + +# 1. Delete Kubernetes Resources +print_section "Deleting Kubernetes Resources" + +echo "Deleting vLLM ingress..." +kubectl delete -f test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws-ingress.yaml --ignore-not-found +print_success "Ingress deletion initiated" + +echo "Waiting 30 seconds for ingress controller to process deletion..." +sleep 30 + +echo "Deleting vLLM LeaderWorkerSet..." +kubectl delete -f test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws.yaml --ignore-not-found +print_success "LeaderWorkerSet deletion initiated" + +echo "Waiting 60 seconds for pods to terminate..." +sleep 60 + +echo "Deleting FSx Lustre PVC..." +kubectl delete -f test/vllm_tests/test_artifacts/fsx-lustre-pvc.yaml --ignore-not-found +print_success "PVC deletion initiated" + +echo "Waiting 10 seconds for PVC deletion to process..." +sleep 10 + +echo "Deleting FSx Lustre PV..." +kubectl delete -f test/vllm_tests/test_artifacts/fsx-lustre-pv.yaml --ignore-not-found +print_success "PV deletion initiated" + +echo "Waiting 10 seconds for PV deletion to process..." +sleep 10 + +echo "Deleting storage class..." +kubectl delete -f test/vllm_tests/test_artifacts/fsx-storage-class.yaml --ignore-not-found +print_success "Storage class deletion initiated" + +echo "Deleting AWS Load Balancer Controller..." +helm uninstall aws-load-balancer-controller -n kube-system --ignore-not-found +print_success "AWS Load Balancer Controller deletion initiated" + +echo "Waiting 60 seconds for controller termination..." +sleep 60 + +echo "Verifying all resources are deleted..." +kubectl get pods,svc,ingress,pv,pvc +print_success "Kubernetes resource deletion completed" + +# 2. 
Delete the IAM Service Account CloudFormation Stack +print_section "Deleting IAM Service Account CloudFormation Stack" + +STACK_NAME="eksctl-${CLUSTER_NAME}-addon-iamserviceaccount-kube-system-aws-load-balancer-controller" +echo "Deleting CloudFormation stack: $STACK_NAME" +aws cloudformation delete-stack --stack-name $STACK_NAME 2>/dev/null || true + +wait_for_deletion "aws cloudformation describe-stacks --stack-name $STACK_NAME" "IAM Service Account CloudFormation Stack" 300 +print_success "IAM Service Account CloudFormation Stack deletion completed" + +# 3. Delete the IAM Policy +print_section "Deleting IAM Policy" + +echo "Getting the ARN of the IAM policy..." +POLICY_ARN=$(aws iam list-policies --query "Policies[?PolicyName=='AWSLoadBalancerControllerIAMPolicy'].Arn" --output text) + +if [ -n "$POLICY_ARN" ] && [ "$POLICY_ARN" != "None" ]; then + echo "Deleting IAM policy: $POLICY_ARN" + aws iam delete-policy --policy-arn $POLICY_ARN + print_success "IAM policy deleted" +else + print_warning "IAM policy not found or already deleted" +fi + +# 4. Delete the FSx Lustre Filesystem +print_section "Deleting FSx Lustre Filesystem" + +if [ -n "$FSX_ID" ]; then + echo "Deleting FSx Lustre filesystem: $FSX_ID" + aws fsx delete-file-system --file-system-id $FSX_ID 2>/dev/null || true + + wait_for_deletion "aws fsx describe-file-systems --file-system-id $FSX_ID" "FSx Lustre filesystem" 600 + print_success "FSx Lustre filesystem deletion completed" +else + print_warning "FSx Lustre filesystem ID not found or already deleted" +fi + +# 5. Check for Any Remaining Load Balancers +print_section "Checking for Remaining Load Balancers" + +echo "Checking for ALBs and NLBs..." +aws elbv2 describe-load-balancers --query "LoadBalancers[?contains(DNSName, '${CLUSTER_NAME}')].LoadBalancerArn" --output text | while read -r lb_arn; do + if [ -n "$lb_arn" ]; then + echo "Deleting load balancer: $lb_arn" + aws elbv2 delete-load-balancer --load-balancer-arn $lb_arn + fi +done + +echo "Checking for Classic ELBs..." +aws elb describe-load-balancers --query "LoadBalancerDescriptions[?contains(DNSName, '${CLUSTER_NAME}')].LoadBalancerName" --output text | while read -r lb_name; do + if [ -n "$lb_name" ]; then + echo "Deleting classic load balancer: $lb_name" + aws elb delete-load-balancer --load-balancer-name $lb_name + fi +done + +print_success "Load balancer cleanup completed" + +# 6. Delete the Node Group +print_section "Deleting Node Group" + +# Check if node group exists before attempting to delete it +echo "Checking if node group exists: $NODEGROUP_NAME" +if eksctl get nodegroup --cluster=$CLUSTER_NAME --name=$NODEGROUP_NAME --region=$REGION &>/dev/null; then + echo "Node group exists. Deleting node group: $NODEGROUP_NAME" + eksctl delete nodegroup --cluster=$CLUSTER_NAME --name=$NODEGROUP_NAME --region=$REGION --drain=false + + wait_for_deletion "eksctl get nodegroup --cluster=$CLUSTER_NAME --name=$NODEGROUP_NAME --region=$REGION" "Node group" 1100 + print_success "Node group deletion completed" +else + print_warning "Node group $NODEGROUP_NAME not found or already deleted" +fi + +# 7. Delete the Security Groups +print_section "Deleting Security Groups" + +# Delete security groups in the recommended order: FSx SG -> Node SG -> ALB SG + +if [ -n "$SG_ID" ]; then + echo "Deleting FSx security group: $SG_ID" + aws ec2 delete-security-group --group-id $SG_ID 2>/dev/null || print_warning "Failed to delete FSx security group" + if [ $? 
-eq 0 ]; then + print_success "FSx security group deleted" + fi +else + print_warning "FSx security group ID not found or already deleted" +fi + +echo "Waiting 30 seconds after FSx security group deletion" +sleep 30 + +if [ -n "$NODE_SG" ]; then + echo "Deleting Node security group: $NODE_SG" + aws ec2 delete-security-group --group-id $NODE_SG 2>/dev/null || print_warning "Failed to delete Node security group" + if [ $? -eq 0 ]; then + print_success "Node security group deleted" + fi +else + print_warning "Node security group ID not found or already deleted" +fi + +echo "Waiting 30 seconds after Node security group deletion" +sleep 30 + + +if [ -n "$ALB_SG" ]; then + echo "Deleting ALB security group: $ALB_SG" + aws ec2 delete-security-group --group-id $ALB_SG 2>/dev/null || print_warning "Failed to delete ALB security group" + if [ $? -eq 0 ]; then + print_success "ALB security group deleted" + fi +else + print_warning "ALB security group ID not found or already deleted" +fi + +echo "Waiting 30 seconds after ALB security group deletion" +sleep 30 + +# 8. Delete the EKS Cluster +print_section "Deleting EKS Cluster" + +echo "Deleting EKS cluster: $CLUSTER_NAME" +eksctl delete cluster --name=$CLUSTER_NAME --region=$REGION + +wait_for_deletion "aws eks describe-cluster --name $CLUSTER_NAME" "EKS cluster" 1100 +print_success "EKS cluster deletion completed" + +# 9. Final Verification +print_section "Final Verification" + +echo "Checking for any remaining CloudFormation stacks..." +REMAINING_STACKS=$(aws cloudformation list-stacks --stack-status-filter CREATE_COMPLETE UPDATE_COMPLETE DELETE_FAILED --query "StackSummaries[?contains(StackName, '${CLUSTER_NAME}')].StackName" --output text) + +if [ -n "$REMAINING_STACKS" ]; then + print_warning "Some CloudFormation stacks still exist:" + echo "$REMAINING_STACKS" + echo + echo "You may need to manually delete these stacks or troubleshoot deletion failures." + echo "See the README.md section on 'Troubleshooting CloudFormation Stack Deletion Failures'." +else + print_success "No remaining CloudFormation stacks found" +fi + +print_section "Cleanup Complete" +echo "All resources related to the vLLM deployment have been deleted or cleanup has been initiated." +echo "Some AWS resources may still be in the process of being deleted." +echo "Please check the AWS Management Console to verify all resources have been properly cleaned up." \ No newline at end of file diff --git a/test/vllm_tests/infra/utils/__init__.py b/test/vllm_tests/infra/utils/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/vllm_tests/infra/utils/fsx_utils.py b/test/vllm_tests/infra/utils/fsx_utils.py new file mode 100644 index 000000000000..a92551d83a4b --- /dev/null +++ b/test/vllm_tests/infra/utils/fsx_utils.py @@ -0,0 +1,250 @@ +import logging +import time +import boto3 +from invoke import run +from typing import Dict, List, Any + +logger = logging.getLogger(__name__) + +class FsxSetup: + """ + A utility class for setting up and managing FSx for Lustre filesystems + and related AWS and Kubernetes resources. 
+ + : param region: AWS region where resources will be created (default: "us-west-2") + """ + def __init__(self, region: str = "us-west-2"): + self.region = region + self.fsx_client = boto3.client('fsx', region_name=region) + self.ec2_client = boto3.client('ec2', region_name=region) + + def create_fsx_filesystem( + self, + subnet_id: str, + security_group_ids: List[str], + storage_capacity: int, + deployment_type: str, + tags: Dict[str, str], + ): + """ + Create FSx Lustre filesystem with given configuration + : param subnet_id: subnet ID where FSx will be created + : param security_group_ids: list of security group IDs + : param storage_capacity: storage capacity in GiB + : param deployment_type: FSx deployment type + : param tags: dictionary of tags to apply to the FSx filesystem + : return: dictionary containing filesystem details + """ + try: + response = self.fsx_client.create_file_system( + FileSystemType='LUSTRE', + StorageCapacity=storage_capacity, + SubnetIds=[subnet_id], + SecurityGroupIds=security_group_ids, + LustreConfiguration={'DeploymentType': deployment_type}, + Tags=[{'Key': k, 'Value': v} for k, v in tags.items()] + ) + + filesystem_id = response['FileSystem']['FileSystemId'] + logger.info(f"Created FSx filesystem: {filesystem_id}") + + return self.wait_for_filesystem(filesystem_id) + + except Exception as e: + logger.error(f"Failed to create FSx filesystem: {e}") + raise + + def wait_for_filesystem(self, filesystem_id: str): + """ + Wait for FSx filesystem to become available and return its details + : param filesystem_id: FSx filesystem ID + : return: dictionary containing filesystem details (filesystem_id, dns_name, mount_name) + : raises: Exception if filesystem enters FAILED, DELETING, or DELETED state + """ + logger.info(f"Waiting for FSx filesystem {filesystem_id} to be available...") + + try: + waiter = self.fsx_client.get_waiter('file_system_available') + waiter.wait( + FileSystemIds=[filesystem_id], + WaiterConfig={'Delay': 30, 'MaxAttempts': 60} + ) + + # Get filesystem details + response = self.fsx_client.describe_file_systems( + FileSystemIds=[filesystem_id] + ) + filesystem = response['FileSystems'][0] + + return { + 'filesystem_id': filesystem_id, + 'dns_name': filesystem['DNSName'], + 'mount_name': filesystem['LustreConfiguration']['MountName'] + } + + except Exception as e: + logger.error(f"Error waiting for filesystem {filesystem_id}: {e}") + raise + + def create_security_group( + self, + vpc_id: str, + name: str, + description: str + ): + """ + Create a security group in the specified VPC + : param vpc_id: VPC ID where the security group will be created + : param name: name of the security group + : param description: description of the security group + : return: created security group ID + : raises: Exception if security group creation fails + """ + try: + response = self.ec2_client.create_security_group( + GroupName=name, + Description=description, + VpcId=vpc_id + ) + sg_id = response['GroupId'] + logger.info(f"Created security group: {sg_id}") + return sg_id + + except Exception as e: + logger.error(f"Failed to create security group: {e}") + raise + + def add_security_group_ingress_rules( + self, + security_group_id: str, + ingress_rules: List[Dict[str, Any]] + ): + """ + Add ingress rules to an existing security group + : param security_group_id: ID of the security group to modify + : param ingress_rules: list of dictionaries containing ingress rule configurations + Example: [{"protocol": "tcp", "port": "988-1023", "source-group": "sg-xxx"}] + : 
return: None + : raises: Exception if adding ingress rules fails + """ + try: + ip_permissions = [] + for rule in ingress_rules: + from_port, to_port = map(int, rule['port'].split('-')) + permission = { + 'IpProtocol': rule['protocol'], + 'FromPort': from_port, + 'ToPort': to_port, + 'UserIdGroupPairs': [{ + 'GroupId': rule['source-group'] + }] + } + ip_permissions.append(permission) + + self.ec2_client.authorize_security_group_ingress( + GroupId=security_group_id, + IpPermissions=ip_permissions + ) + + logger.info(f"Added ingress rules to security group: {security_group_id}") + + except Exception as e: + logger.error(f"Failed to add ingress rules to security group: {e}") + raise + + def setup_csi_driver(self): + """ + Install and configure the AWS FSx CSI Driver in the Kubernetes cluster + : return: None + : raises: Exception if driver installation or verification fails + """ + try: + logger.info("Installing AWS FSx CSI Driver...") + run("helm repo add aws-fsx-csi-driver https://kubernetes-sigs.github.io/aws-fsx-csi-driver/") + run("helm repo update") + run("helm install aws-fsx-csi-driver aws-fsx-csi-driver/aws-fsx-csi-driver --namespace kube-system") + run("kubectl wait --for=condition=ready pod -l app=fsx-csi-controller -n kube-system --timeout=300s") + + self._verify_csi_driver() + logger.info("FSx CSI Driver installed successfully") + except Exception as e: + logger.error(f"Failed to setup FSx CSI driver: {e}") + raise + + def _verify_csi_driver(self): + """ + Verify that FSx CSI driver pods are running correctly in the cluster + : return: None + : raises: Exception if driver pods are not found or not running + """ + result = run("kubectl get pods -n kube-system | grep fsx") + + if "fsx-csi-controller" not in result.stdout or "fsx-csi-node" not in result.stdout: + raise Exception("FSx CSI driver pods not found") + + fsx_pods = [ + line for line in result.stdout.split("\n") + if ("fsx-csi-controller" in line or "fsx-csi-node" in line) and "Running" in line + ] + + if not fsx_pods: + raise Exception("No running FSx CSI driver pods found") + + logger.info(f"Found {len(fsx_pods)} running FSx CSI driver pods") + + def setup_kubernetes_resources( + self, + storage_class_file: str, + pv_file: str, + pvc_file: str, + replacements: Dict[str, str] + ): + """ + Setup Kubernetes FSx resources using provided yaml files and replacements + : param storage_class_file: path to the storage class yaml file + : param pv_file: path to the persistent volume yaml file + : param pvc_file: path to the persistent volume claim yaml file + : param replacements: dictionary of placeholder replacements + Example: {"": "subnet-xxx", "": "sg-xxx"} + : return: None + : raises: Exception if resource creation fails + """ + try: + for file_path in [storage_class_file, pv_file, pvc_file]: + for key, value in replacements.items(): + run(f"sed -i 's|{key}|{value}|g' {file_path}") + + for file_path in [storage_class_file, pv_file, pvc_file]: + run(f"kubectl apply -f {file_path}") + + self.validate_kubernetes_resources() + + except Exception as e: + logger.error(f"Failed to setup Kubernetes FSx resources: {e}") + raise + + def validate_kubernetes_resources(self): + """ + Validate that FSx Kubernetes resources are properly created and bound + : return: True if all resources are validated successfully + : raises: Exception if any resource validation fails + """ + try: + sc_result = run("kubectl get sc fsx-sc") + if "fsx-sc" not in sc_result.stdout or "fsx.csi.aws.com" not in sc_result.stdout: + raise Exception("FSx storage 
class not created correctly") + + pv_result = run("kubectl get pv fsx-lustre-pv") + if "fsx-lustre-pv" not in pv_result.stdout or "Bound" not in pv_result.stdout: + raise Exception("FSx persistent volume not created correctly") + + pvc_result = run("kubectl get pvc fsx-lustre-pvc") + if "fsx-lustre-pvc" not in pvc_result.stdout or "Bound" not in pvc_result.stdout: + raise Exception("FSx persistent volume claim not created correctly") + + logger.info("FSx Kubernetes resources validated successfully") + return True + + except Exception as e: + logger.error(f"FSx resource validation failed: {e}") + raise \ No newline at end of file diff --git a/test/vllm_tests/test_artifacts/__init__.py b/test/vllm_tests/test_artifacts/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/vllm_tests/test_artifacts/eks-cluster.yaml b/test/vllm_tests/test_artifacts/eks-cluster.yaml new file mode 100644 index 000000000000..6e59c29bc804 --- /dev/null +++ b/test/vllm_tests/test_artifacts/eks-cluster.yaml @@ -0,0 +1,23 @@ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: vllm-cluster + region: us-west-2 + version: "1.31" # Latest stable EKS version + +# Enable CloudWatch logging +cloudWatch: + clusterLogging: + enableTypes: ["api", "audit", "authenticator", "controllerManager", "scheduler"] + +# Add-ons for the cluster +addons: + - name: vpc-cni + version: latest + - name: coredns + version: latest + - name: kube-proxy + version: latest + - name: aws-ebs-csi-driver + version: latest diff --git a/test/vllm_tests/test_artifacts/eks_test.py b/test/vllm_tests/test_artifacts/eks_test.py new file mode 100644 index 000000000000..2051eea40554 --- /dev/null +++ b/test/vllm_tests/test_artifacts/eks_test.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 + +import logging +import time +from invoke import run + +logger = logging.getLogger(__name__) + +class VllmEksTest: + def __init__(self): + pass + + + def run_tests(self): + try: + logger.info("Starting vLLM EKS integration tests...") + self.deploy_vllm_service() + self.test_vllm_api() + logger.info("All vLLM EKS tests completed successfully") + return True + except Exception as e: + logger.error(f"Test execution failed: {e}") + return False + + + def deploy_vllm_service(self): + logger.info("Deploying vLLM service...") + + self._wait_for_load_balancer_controller() + + logger.info("Applying vLLM LeaderWorkerSet configuration...") + run("kubectl apply -f test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws.yaml") + + logger.info("Applying vLLM ingress configuration...") + run("kubectl apply -f test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws-ingress.yaml") + + self._wait_for_vllm_pods() + + logger.info("vLLM service deployed successfully") + + + def _wait_for_load_balancer_controller(self): + logger.info("Waiting for AWS Load Balancer Controller to be ready...") + max_retries = 20 # 10 minutes total + retry_count = 0 + + while retry_count < max_retries: + result = run("kubectl get pods -n kube-system | grep aws-load-balancer-controller", warn=True) + if "aws-load-balancer-controller" in result.stdout: + all_alb_pods = [ + line for line in result.stdout.split("\n") + if "aws-load-balancer-controller" in line and line.strip() + ] + running_alb_pods = [ + line for line in all_alb_pods if "Running" in line + ] + if all_alb_pods and len(running_alb_pods) == len(all_alb_pods): + logger.info(f"All {len(running_alb_pods)} AWS Load Balancer Controller pods are running") + return + else: + logger.info(f"ALB controller pods: 
{len(running_alb_pods)}/{len(all_alb_pods)} running") + + retry_count += 1 + logger.info(f"ALB controller not ready yet, waiting... (attempt {retry_count}/{max_retries})") + time.sleep(30) + + raise Exception("AWS Load Balancer Controller pods failed to start after 10 minutes") + + + def _wait_for_vllm_pods(self): + logger.info("Waiting for vLLM pods to reach Running status...") + logger.info("This may take 15-30 minutes for container image pull and model loading") + + max_retries = 60 # 30 minutes total + retry_count = 0 + + while retry_count < max_retries: + result = run("kubectl get pods -l app=vllm-deepseek-32b-lws", warn=True) + if "vllm-deepseek-32b-lws" in result.stdout: + all_vllm_pods = [ + line for line in result.stdout.split("\n") + if "vllm-deepseek-32b-lws" in line and line.strip() and "NAME" not in line + ] + running_vllm_pods = [ + line for line in all_vllm_pods if "Running" in line + ] + if all_vllm_pods and len(running_vllm_pods) == len(all_vllm_pods): + logger.info(f"All {len(running_vllm_pods)} vLLM pods are running") + return + else: + statuses = [] + for line in all_vllm_pods: + parts = line.split() + if len(parts) >= 3: + pod_name = parts[0] + status = parts[2] + statuses.append(f"{pod_name}: {status}") + logger.info(f"vLLM pods status: {', '.join(statuses)}") + + retry_count += 1 + logger.info(f"vLLM pods not ready yet, waiting... (attempt {retry_count}/{max_retries})") + time.sleep(30) + + raise Exception("vLLM pods failed to reach Running status after 30 minutes") + + + def test_vllm_api(self): + logger.info("Testing vLLM API...") + endpoint = run( + "kubectl get ingress vllm-deepseek-32b-lws-ingress -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'" + ).stdout.strip() + logger.info(f"vLLM API endpoint: {endpoint}") + + if not endpoint: + raise Exception("Failed to get vLLM API endpoint from ingress") + + self._test_completions_api(endpoint) + self._test_chat_completions_api(endpoint) + logger.info("All vLLM API tests passed successfully") + + + def _test_completions_api(self, endpoint): + logger.info("Testing completions API...") + result = run( + f"""curl -X POST http://{endpoint}/v1/completions \ + -H "Content-Type: application/json" \ + -d '{{"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "prompt": "Hello, how are you?", "max_tokens": 50, "temperature": 0.7}}' + """ + ) + assert '"object":"text_completion"' in result.stdout, "vLLM completions API test failed" + logger.info("Completions API test passed") + + + def _test_chat_completions_api(self, endpoint): + logger.info("Testing chat completions API...") + result = run( + f"""curl -X POST http://{endpoint}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{{"model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "messages": [{{"role": "user", "content": "What are the benefits of using FSx Lustre with EKS?"}}], "max_tokens": 100, "temperature": 0.7}}' + """ + ) + assert '"object":"chat.completion"' in result.stdout, "vLLM chat completions API test failed" + logger.info("Chat completions API test passed") \ No newline at end of file diff --git a/test/vllm_tests/test_artifacts/fsx-lustre-pv.yaml b/test/vllm_tests/test_artifacts/fsx-lustre-pv.yaml new file mode 100644 index 000000000000..d94ba75ad632 --- /dev/null +++ b/test/vllm_tests/test_artifacts/fsx-lustre-pv.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: fsx-lustre-pv +spec: + capacity: + storage: 1200Gi # Adjust based on your FSx Lustre filesystem size + volumeMode: Filesystem + accessModes: + - 
ReadWriteMany + persistentVolumeReclaimPolicy: Retain + storageClassName: fsx-sc + csi: + driver: fsx.csi.aws.com + volumeHandle: # FSx Lustre filesystem ID + volumeAttributes: + dnsname: .fsx.us-west-2.amazonaws.com # FSx Lustre DNS name + mountname: # The mount name of your FSx Lustre filesyst diff --git a/test/vllm_tests/test_artifacts/fsx-lustre-pvc.yaml b/test/vllm_tests/test_artifacts/fsx-lustre-pvc.yaml new file mode 100644 index 000000000000..f03c420f864c --- /dev/null +++ b/test/vllm_tests/test_artifacts/fsx-lustre-pvc.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: fsx-lustre-pvc +spec: + accessModes: + - ReadWriteMany + storageClassName: fsx-sc + resources: + requests: + storage: 1200Gi # Should match the PV capacity diff --git a/test/vllm_tests/test_artifacts/fsx-storage-class.yaml b/test/vllm_tests/test_artifacts/fsx-storage-class.yaml new file mode 100644 index 000000000000..31b81d8495ff --- /dev/null +++ b/test/vllm_tests/test_artifacts/fsx-storage-class.yaml @@ -0,0 +1,18 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: fsx-sc +provisioner: fsx.csi.aws.com +parameters: + subnetId: + securityGroupIds: + deploymentType: SCRATCH_2 + automaticBackupRetentionDays: "0" + dailyAutomaticBackupStartTime: "00:00" + copyTagsToBackups: "false" + perUnitStorageThroughput: "50" + dataCompressionType: "NONE" +reclaimPolicy: Retain +volumeBindingMode: Immediate +mountOptions: + - flock diff --git a/test/vllm_tests/test_artifacts/large-model-nodegroup.yaml b/test/vllm_tests/test_artifacts/large-model-nodegroup.yaml new file mode 100644 index 000000000000..093777fe6b50 --- /dev/null +++ b/test/vllm_tests/test_artifacts/large-model-nodegroup.yaml @@ -0,0 +1,53 @@ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: vllm-cluster + region: us-west-2 + +managedNodeGroups: + - name: vllm-p4d-nodes-efa + instanceType: p4d.24xlarge + minSize: 0 + maxSize: 2 + desiredCapacity: 2 + availabilityZones: ["us-west-2a"] # EFA-enabled nodegroups must have only one subnet or one availability zone + volumeSize: 100 + privateNetworking: true + # Use the EKS-optimized GPU AMI + ami: ami-01f1fc27c5979ac62 # Amazon EKS GPU node 1.31 (k8s: 1.31.7, containerd: 1.7.*) + amiFamily: AmazonLinux2 + labels: + role: large-model-worker + nvidia.com/gpu: "true" + k8s.amazonaws.com/accelerator: nvidia-gpu + aws.amazon.com/efa: "true" # Add EFA label + tags: + nodegroup-role: large-model-worker + iam: + withAddonPolicies: + autoScaler: true + albIngress: true + cloudWatch: true + ebs: true + imageBuilder: true + # Enable EFA interfaces + efaEnabled: true + # Override bootstrap command for custom AMI + overrideBootstrapCommand: | + #!/bin/bash + set -ex + + # Install EFA driver and related packages + curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz + tar -xf aws-efa-installer-latest.tar.gz + cd aws-efa-installer + ./efa_installer.sh -y + + # Configure NCCL to use EFA + echo "export FI_PROVIDER=efa" >> /etc/environment + echo "export FI_EFA_USE_DEVICE_RDMA=1" >> /etc/environment + echo "export NCCL_DEBUG=INFO" >> /etc/environment + + # Standard EKS bootstrap + /etc/eks/bootstrap.sh vllm-cluster --container-runtime containerd diff --git a/test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws-ingress.yaml b/test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws-ingress.yaml new file mode 100644 index 000000000000..5b5706eccd88 --- /dev/null +++ b/test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws-ingress.yaml @@ -0,0 
+1,28 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: vllm-deepseek-32b-lws-ingress + annotations: + # Use AWS Load Balancer Controller with ALB + alb.ingress.kubernetes.io/scheme: internet-facing + alb.ingress.kubernetes.io/target-type: ip + alb.ingress.kubernetes.io/security-groups: + alb.ingress.kubernetes.io/healthcheck-path: /health + alb.ingress.kubernetes.io/healthcheck-port: '8000' + alb.ingress.kubernetes.io/healthcheck-protocol: HTTP + alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}]' + alb.ingress.kubernetes.io/load-balancer-attributes: load_balancing.cross_zone.enabled=true + # Specify ALB class + kubernetes.io/ingress.class: alb +spec: + ingressClassName: alb + rules: + - http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: vllm-deepseek-32b-lws-leader + port: + number: 8000 \ No newline at end of file diff --git a/test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws.yaml b/test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws.yaml new file mode 100644 index 000000000000..d79afe1153af --- /dev/null +++ b/test/vllm_tests/test_artifacts/vllm-deepseek-32b-lws.yaml @@ -0,0 +1,252 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: vllm-deepseek-32b-lws +spec: + replicas: 1 + leaderWorkerTemplate: + size: 2 # Total number of nodes (1 leader + 1 worker) + restartPolicy: RecreateGroupOnPodRestart + leaderTemplate: + metadata: + labels: + role: leader + spec: + containers: + - name: vllm-leader + image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.8.5-gpu-py312-ec2 + securityContext: + privileged: true + capabilities: + add: ["IPC_LOCK"] + env: + # Ray configuration + - name: RAY_DISABLE_RUNTIME_ENV + value: "1" + - name: RAY_SCHEDULER_EVENTS + value: "0" + - name: RAY_WORKER_REGISTER_TIMEOUT_SECONDS + value: "300" + # NCCL configuration for distributed training + - name: NCCL_DEBUG + value: "INFO" + - name: NCCL_IB_DISABLE + value: "1" + - name: NCCL_P2P_DISABLE + value: "1" + - name: NCCL_NET_GDR_LEVEL + value: "0" + - name: NCCL_SHM_DISABLE + value: "1" + # EFA-specific environment variables + - name: FI_PROVIDER + value: "efa" + - name: FI_EFA_USE_DEVICE_RDMA + value: "1" + - name: FI_EFA_FORK_SAFE + value: "1" + # Hugging Face configuration + - name: TRANSFORMERS_CACHE + value: "/mnt/fsx/models" + - name: HF_HOME + value: "/mnt/fsx/models" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: huggingface-token + key: token + optional: true + # Add host IP for Ray + - name: VLLM_HOST_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + command: ["/bin/bash"] + args: + - "-c" + - | + set -x + + # Start ray leader + ray start --head --port=6379 --num-cpus=48 --num-gpus=8 + sleep 10 + ray status + fi_info -p efa + + # Start vllm server + python -m vllm.entrypoints.openai.api_server \ + --model deepseek-ai/DeepSeek-R1-Distill-Qwen-32B \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size 8 \ + --pipeline-parallel-size 2 \ + --download-dir /mnt/fsx/models \ + --max-model-len 4096 \ + --gpu-memory-utilization 0.85 + resources: + limits: + nvidia.com/gpu: "8" + cpu: "48" + memory: "256Gi" + vpc.amazonaws.com/efa: 4 + requests: + nvidia.com/gpu: "8" + cpu: "48" + memory: "256Gi" + vpc.amazonaws.com/efa: 4 + ports: + - containerPort: 8000 + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 300 + periodSeconds: 30 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 10 + volumeMounts: + - name: fsx-lustre-volume + mountPath: /mnt/fsx 
+ # Mount the EFA devices + #- name: efa-devices + # mountPath: /dev/infiniband + # Mount a larger shared memory volume + - name: dshm + mountPath: /dev/shm + volumes: + - name: fsx-lustre-volume + persistentVolumeClaim: + claimName: fsx-lustre-pvc + # Add volume for EFA devices + #- name: efa-devices + # hostPath: + # path: /dev/infiniband + # Add a larger shared memory volume + - name: dshm + emptyDir: + medium: Memory + sizeLimit: "30Gi" # Increase shared memory size + nodeSelector: + role: large-model-worker + # Add tolerations for EFA + tolerations: + - key: "aws.amazon.com/efa" + operator: "Exists" + effect: "NoSchedule" + workerTemplate: + spec: + containers: + - name: vllm-worker + image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/vllm:0.8.5-gpu-py312-ec2 + securityContext: + privileged: true + capabilities: + add: ["IPC_LOCK"] + env: + # Ray configuration + - name: RAY_DISABLE_RUNTIME_ENV + value: "1" + - name: RAY_SCHEDULER_EVENTS + value: "0" + - name: RAY_WORKER_REGISTER_TIMEOUT_SECONDS + value: "300" + # NCCL configuration for distributed training + - name: NCCL_DEBUG + value: "INFO" + - name: NCCL_IB_DISABLE + value: "1" + - name: NCCL_P2P_DISABLE + value: "1" + - name: NCCL_NET_GDR_LEVEL + value: "0" + - name: NCCL_SHM_DISABLE + value: "1" + # EFA-specific environment variables + - name: FI_PROVIDER + value: "efa" + - name: FI_EFA_USE_DEVICE_RDMA + value: "1" + - name: FI_EFA_FORK_SAFE + value: "1" + # Hugging Face configuration + - name: TRANSFORMERS_CACHE + value: "/mnt/fsx/models" + - name: HF_HOME + value: "/mnt/fsx/models" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: huggingface-token + key: token + optional: true + # Add host IP for Ray + - name: VLLM_HOST_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + command: ["/bin/bash"] + args: + - "-c" + - | + set -x + + # Start ray worker + ray start --address=$(LWS_LEADER_ADDRESS):6379 --num-cpus=48 --num-gpus=8 --block + resources: + limits: + nvidia.com/gpu: "8" + cpu: "48" + memory: "256Gi" + vpc.amazonaws.com/efa: 4 + requests: + nvidia.com/gpu: "8" + cpu: "48" + memory: "256Gi" + vpc.amazonaws.com/efa: 4 + volumeMounts: + - name: fsx-lustre-volume + mountPath: /mnt/fsx + # Mount the EFA devices + #- name: efa-devices + # mountPath: /dev/infiniband + # Mount a larger shared memory volume + - name: dshm + mountPath: /dev/shm + volumes: + - name: fsx-lustre-volume + persistentVolumeClaim: + claimName: fsx-lustre-pvc + # Add volume for EFA devices + #- name: efa-devices + # hostPath: + # path: /dev/infiniband + # Add a larger shared memory volume + - name: dshm + emptyDir: + medium: Memory + sizeLimit: "30Gi" # Increase shared memory size + nodeSelector: + role: large-model-worker + # Add tolerations for EFA + tolerations: + - key: "aws.amazon.com/efa" + operator: "Exists" + effect: "NoSchedule" +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-deepseek-32b-lws-leader +spec: + ports: + - name: port-8000 + port: 8000 + targetPort: 8000 + - name: port-8265 + port: 8265 + targetPort: 8265 + type: ClusterIP + selector: + leaderworkerset.sigs.k8s.io/name: vllm-deepseek-32b-lws + role: leader \ No newline at end of file diff --git a/test/vllm_tests/vllm_test_trigger.py b/test/vllm_tests/vllm_test_trigger.py new file mode 100644 index 000000000000..8a8b33a7ad80 --- /dev/null +++ b/test/vllm_tests/vllm_test_trigger.py @@ -0,0 +1,82 @@ +import os, sys +import logging +from typing import List + +from test.test_utils import get_dlc_images +from test.vllm_tests.infra.eks_infra import 
EksInfrastructure +from test.vllm_tests.test_artifacts.eks_test import VllmEksTest + +LOGGER = logging.getLogger(__name__) +LOGGER.setLevel(logging.DEBUG) +LOGGER.addHandler(logging.StreamHandler(sys.stdout)) + + +def run_vllm_eks_test(): + infrastructure = None + try: + LOGGER.info("Setting up EKS infrastructure...") + infrastructure = EksInfrastructure() + if not infrastructure.setup_infrastructure(): + raise Exception("Infrastructure setup failed") + LOGGER.info("Infrastructure setup completed successfully") + + LOGGER.info("Starting vLLM tests...") + test = VllmEksTest() + if not test.run_tests(): + raise Exception("vLLM tests failed") + LOGGER.info("vLLM tests completed successfully") + return 0 + + except Exception as e: + LOGGER.error(f"Test execution failed: {e}") + return 1 + + finally: + if infrastructure: + LOGGER.info("Cleaning up infrastructure...") + infrastructure.cleanup_infrastructure() + LOGGER.info("Cleanup completed") + + +def run_platform_tests(platform: str, images: List[str], commit_id: str, ipv6_enabled: bool): + """ + Run tests for a specific platform + """ + LOGGER.info(f"Running {platform} tests") + if platform == "eks": + result = run_vllm_eks_test() + if result != 0: + raise Exception("vLLM EKS tests failed") + LOGGER.info("vLLM EKS tests completed successfully") + + +def main(): + LOGGER.info("Triggering test from vllm") + test_type = os.getenv("TEST_TYPE") + + LOGGER.info(f"TEST_TYPE: {test_type}") + + executor_mode = os.getenv("EXECUTOR_MODE", "False").lower() == "true" + dlc_images = os.getenv("DLC_IMAGE") if executor_mode else get_dlc_images() + + ipv6_enabled = os.getenv("ENABLE_IPV6_TESTING", "false").lower() == "true" + os.environ["ENABLE_IPV6_TESTING"] = "true" if ipv6_enabled else "false" + + commit_id = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", default="unrecognised_commit_id") + LOGGER.info(f"Commit ID: {commit_id}") + + LOGGER.info(f"Images tested: {dlc_images}") + all_image_list = dlc_images.split(" ") + standard_images_list = [image_uri for image_uri in all_image_list if "example" not in image_uri] + LOGGER.info(f"\nImages URIs:\n{standard_images_list}") + + run_platform_tests( + platform=test_type, + images=standard_images_list, + commit_id=commit_id, + ipv6_enabled=ipv6_enabled, + ) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/testspec.yml b/testspec.yml index 31abd98c35dd..fffb6658d4a4 100644 --- a/testspec.yml +++ b/testspec.yml @@ -23,7 +23,8 @@ phases: - pip install scheduler/. - echo Running pytest $TEST_TYPE tests on $DLC_IMAGES... - export PYTHONPATH=$PYTHONPATH:$(pwd)/src - - python test/testrunner.py + # - python test/testrunner.py + - python test/vllm_tests/vllm_test_trigger.py post_build: commands: - python src/send_status.py --status $CODEBUILD_BUILD_SUCCEEDING diff --git a/vllm/buildspec.yml b/vllm/buildspec.yml index 1de64837c3c0..1fbba0e18540 100644 --- a/vllm/buildspec.yml +++ b/vllm/buildspec.yml @@ -49,3 +49,4 @@ images: test_platforms: - sanity - security + - eks \ No newline at end of file
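With buildspec.yml now listing eks under test_platforms and testspec.yml invoking vllm_test_trigger.py, the trigger is driven entirely by environment variables. A minimal local sketch of that wiring follows, under these assumptions: the repository root is on PYTHONPATH as testspec.yml arranges, AWS credentials able to create the EKS cluster are available, and the image URI below is a placeholder rather than a real value:

import os
from test.vllm_tests import vllm_test_trigger

# Mirror the environment the CodeBuild testspec/buildspec changes rely on.
os.environ["TEST_TYPE"] = "eks"          # chosen via the new "eks" entry in vllm/buildspec.yml
os.environ["EXECUTOR_MODE"] = "True"     # read DLC_IMAGE directly instead of calling get_dlc_images()
os.environ["DLC_IMAGE"] = "<vllm-image-uri>"  # placeholder: the vLLM DLC image to exercise

# Provisions the EKS cluster, deploys the LeaderWorkerSet, runs the API tests, then cleans up.
vllm_test_trigger.main()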