Merged

Commits (41)
9259f01  A4 on staging support added (sharabiani, Feb 5, 2025)
61683a4  Merge branch 'develop' into a4-preview (sharabiani, Feb 15, 2025)
13f0164  A key for gke endpoint added to configs (sharabiani, Feb 15, 2025)
5b73273  get_cluster_credentials moved to kubectl module (sharabiani, Feb 15, 2025)
b81a7f3  common module added to core (sharabiani, Feb 15, 2025)
9844a73  common module file added to core (sharabiani, Feb 15, 2025)
7fb020a  A4 blueprint updated (sharabiani, Feb 15, 2025)
1c38e61  Merge branch 'develop' into HEAD (gcie, Mar 7, 2025)
5a3d19d  update ctk to 1.47.0 (gcie, Mar 7, 2025)
a07df4c  Merge branch 'develop' into a4-preview (gcie, Mar 7, 2025)
d72fbc7  fix merge issues (gcie, Mar 7, 2025)
c011820  fix unittests: a3ultra template (gcie, Mar 7, 2025)
a15dcce  fix unittests: missing slash (gcie, Mar 7, 2025)
4b06518  a4 prod support (gcie, Mar 10, 2025)
3b71a1d  Merge branch 'develop' into a4-preview (gcie, Mar 10, 2025)
2db662e  fix: linting (gcie, Mar 10, 2025)
f6615e1  update cluster toolkit to 1.47.0 (gcie, Mar 10, 2025)
29330f3  update missing ctk version in blueprint_generator.py (gcie, Mar 10, 2025)
2ace838  fix: missing 'v' (gcie, Mar 10, 2025)
ca90a3e  Merge branch 'gcie-ctk-update' into a4-preview (gcie, Mar 11, 2025)
91787f3  Merge branch 'develop' into a4-preview (gcie, Mar 26, 2025)
8124814  Merge branch 'develop' into a4-preview (gcie, Mar 26, 2025)
7da8e6b  remove -lowmem from b200 system characteristics (gcie, Mar 26, 2025)
eb7a8df  fix linting (gcie, Mar 26, 2025)
643e6a4  fix linting, again (gcie, Mar 26, 2025)
9daf920  fix integration tests (gcie, Mar 26, 2025)
5b10533  add a4 tests (unit+integration) (gcie, Mar 26, 2025)
830f4b3  fix workloads on a4 (gcie, Mar 26, 2025)
4f391ab  fix: invalid annotation bug (gcie, Mar 26, 2025)
a495567  remove unused artifact (gcie, Mar 26, 2025)
2bdc5a1  add rdma annotations for A4 in slurm mode (gcie, Mar 27, 2025)
f6f75c5  remove unused kueue resources quotas (gcie, Mar 28, 2025)
b916c0f  Merge branch 'develop' into a4-preview (gcie, Apr 1, 2025)
79d7dd9  bring back removed resources (gcie, Apr 2, 2025)
0653eed  review fixes (gcie, Apr 2, 2025)
03843bd  fix tests (gcie, Apr 2, 2025)
c9686df  set ctk version to 1.48.0, fix a3ultra ctk issue (gcie, Apr 10, 2025)
74dc87a  fix unittests (gcie, Apr 10, 2025)
03cbb4c  fixes (gcie, Apr 11, 2025)
db9b008  Merge branch 'develop' into a4-preview (gcie, Apr 11, 2025)
c9fcf4b  Merge branch 'develop' into a4-preview (gcie, Apr 11, 2025)
Files changed
4 changes: 2 additions & 2 deletions src/xpk/blueprints/a3mega/kueue-xpk-configuration.yaml.tftpl
@@ -12,7 +12,7 @@ spec:
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: ResourceFlavor
 metadata:
-  name: 1xh100-mega-80gb-8
+  name: "1xh100-mega-80gb-8"
 spec:
   nodeLabels:
     cloud.google.com/gke-accelerator: nvidia-h100-mega-80gb
@@ -27,7 +27,7 @@ spec:
   resourceGroups:
   - coveredResources: ["nvidia.com/gpu", "cpu", "memory"]
     flavors:
-    - name: 1xh100-mega-80gb-8
+    - name: "1xh100-mega-80gb-8"
       resources:
       - name: "nvidia.com/gpu"
         nominalQuota: ${num_chips}
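The only substantive change in this file is quoting the flavor name (the a3ultra template below gets the same treatment). A plausible motivation, not spelled out in the PR: YAML type-infers unquoted scalars, so quoting pins a value to a string regardless of its shape. `1xh100-mega-80gb-8` happens to parse as a string either way, making the quotes defensive rather than corrective. A minimal PyYAML sketch of the hazard being guarded against:

# Sketch: PyYAML type-infers unquoted scalars; quoting forces a string.
# Requires `pip install pyyaml`.
import yaml

samples = [
    'name: 1xh100-mega-80gb-8',  # str with or without quotes
    'flag: no',                  # bool False under YAML 1.1 rules
    'version: 3.10',             # float 3.1 -- the trailing zero is lost
    'version: "3.10"',           # str '3.10'
]
for raw in samples:
  (key, value), = yaml.safe_load(raw).items()
  print(f'{raw!r:24} -> {value!r} ({type(value).__name__})')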
6 changes: 3 additions & 3 deletions src/xpk/blueprints/a3ultra/kueue-xpk-configuration.yaml.tftpl
@@ -12,10 +12,10 @@ spec:
 apiVersion: kueue.x-k8s.io/v1beta1
 kind: ResourceFlavor
 metadata:
-  name: 1xh200-141gb-8
+  name: "1xh200-141gb-8"
 spec:
   nodeLabels:
-    cloud.google.com/gke-accelerator: nvidia-h200-141gb
+    cloud.google.com/gke-accelerator: "nvidia-h200-141gb"
   topologyName: "gke-default"
 ---
 apiVersion: kueue.x-k8s.io/v1beta1
@@ -27,7 +27,7 @@ spec:
   resourceGroups:
   - coveredResources: ["nvidia.com/gpu", "cpu", "memory"]
     flavors:
-    - name: 1xh200-141gb-8
+    - name: "1xh200-141gb-8"
       resources:
       - name: "nvidia.com/gpu"
         nominalQuota: ${num_chips}
15 changes: 15 additions & 0 deletions src/xpk/blueprints/a4/config-map.yaml.tftpl
@@ -0,0 +1,15 @@
kind: ConfigMap
apiVersion: v1
metadata:
  name: ${resource_config_name}
data:
  b200-8: "${num_nodes}"
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: ${cluster_config_name}
data:
  capacity_type: "${capacity_type}"
  reservation_id: "${reservation}"
  provisioner: gcluster
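The `${...}` placeholders in this template are Terraform `templatefile` syntax (hence the `.tftpl` extension); xpk's blueprint machinery substitutes them at deploy time, and the resulting ConfigMaps record the node count and capacity settings for the provisioned cluster. Python's `string.Template` shares the `${name}` form, so a rough local preview of the rendered manifest can be sketched as follows (values invented):

# Sketch: preview the .tftpl expansion locally. Terraform's templatefile()
# performs the real substitution; string.Template merely shares the
# ${name} syntax. The values below are illustrative, not from the PR.
from string import Template

template = Template("""\
kind: ConfigMap
apiVersion: v1
metadata:
  name: ${resource_config_name}
data:
  b200-8: "${num_nodes}"
""")

print(template.substitute(
    resource_config_name='demo-resources-configmap',  # invented
    num_nodes=2,
))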
85 changes: 85 additions & 0 deletions src/xpk/blueprints/a4/kueue-xpk-configuration.yaml.tftpl
@@ -0,0 +1,85 @@
apiVersion: kueue.x-k8s.io/v1alpha1
kind: Topology
metadata:
  name: "gke-default"
spec:
  levels:
  - nodeLabel: "cloud.google.com/gce-topology-block"
  - nodeLabel: "cloud.google.com/gce-topology-subblock"
  - nodeLabel: "cloud.google.com/gce-topology-host"
  - nodeLabel: "kubernetes.io/hostname"
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: "1xb200-8"
spec:
  nodeLabels:
    cloud.google.com/gke-accelerator: nvidia-b200
  topologyName: "gke-default"
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: cluster-queue
spec:
  namespaceSelector: {} # match all.
  resourceGroups:
  - coveredResources: ["nvidia.com/gpu", "cpu", "memory"]
    flavors:
    - name: "1xb200-8"
      resources:
      - name: "nvidia.com/gpu"
        nominalQuota: ${num_chips}
      - name: "cpu"
        nominalQuota: 10000
      - name: "memory"
        nominalQuota: 10000Gi
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  namespace: default
  name: multislice-queue
spec:
  clusterQueue: cluster-queue
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: very-low
value: 100
globalDefault: false
description: "Very Low"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: low
value: 250
globalDefault: false
description: "Low"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: medium
value: 500
globalDefault: false
description: "Medium"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: high
value: 750
globalDefault: false
description: "High"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: very-high
value: 1000
globalDefault: false
description: "Very High"
66 changes: 66 additions & 0 deletions src/xpk/blueprints/a4/nccl-rdma-installer-a4.yaml
@@ -0,0 +1,66 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nccl-rdma-installer
  namespace: kube-system
  labels:
    k8s-app: nccl-rdma-installer
spec:
  selector:
    matchLabels:
      k8s-app: nccl-rdma-installer
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nccl-rdma-installer
        k8s-app: nccl-rdma-installer
    spec:
      priorityClassName: system-node-critical
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: cloud.google.com/gke-accelerator
                operator: In
                values:
                - nvidia-b200
      tolerations:
      - operator: "Exists"
      hostNetwork: true
      hostPID: true
      volumes:
      - name: library-dir-host
        hostPath:
          path: /home/kubernetes/bin/nvidia/lib64
          type: DirectoryOrCreate
      - name: gib
        hostPath:
          path: /home/kubernetes/bin/gib
      initContainers:
      - image: us-docker.pkg.dev/kernel-net-team/clouda4-nccl-dev/nccl-plugin-gib-diagnostic:v1.0.3-b200
        name: nccl-rdma-installer
        resources:
          requests:
            cpu: 150m
        securityContext:
          privileged: true
        volumeMounts:
        - name: library-dir-host
          mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64
        - name: gib
          mountPath: /usr/local/home/kubernetes/bin/gib
        command: ["/bin/sh", "-c"]
        args:
        - |
          set -ex
          /scripts/container_entry.sh install --install-nccl
          cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64
          cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib
          # ibv_devinfo || exit 1
          echo "installation finished"
      containers:
      - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
        name: pause
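The init container copies the gIB/NCCL libraries onto each B200 node's host filesystem, after which the pause container keeps the pod (and therefore the installed state) alive. A quick convergence check, assuming kubectl already points at the target cluster:

# Sketch: block until the installer DaemonSet has finished on every
# matching node. Assumes kubectl is configured for the target cluster.
import subprocess

subprocess.run(
    ['kubectl', 'rollout', 'status',
     'daemonset/nccl-rdma-installer', '-n', 'kube-system',
     '--timeout=10m'],
    check=True,
)
# After this returns, the libraries should exist on each node under
# /home/kubernetes/bin/gib (the hostPath mounted above).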
52 changes: 52 additions & 0 deletions src/xpk/blueprints/a4/storage_crd.yaml
@@ -0,0 +1,52 @@
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  name: storages.xpk.x-k8s.io
spec:
  group: xpk.x-k8s.io
  versions:
  - name: v1
    served: true
    storage: true
    schema:
      openAPIV3Schema:
        type: object
        properties:
          spec:
            type: object
            properties:
              type:
                type: string
              cluster:
                type: string
              auto_mount:
                type: boolean
              mount_point:
                type: string
              readonly:
                type: boolean
              manifest:
                type: string
              pv:
                type: string
              pvc:
                type: string
            required:
            - type
            - cluster
            - auto_mount
            - mount_point
            - readonly
            - manifest
            - pvc
            - pv
            x-kubernetes-validations:
            - message: Value is immutable
              rule: self == oldSelf
  scope: Cluster
  names:
    plural: storages
    singular: storage
    kind: Storage
    shortNames:
    - stg
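For reference, an object satisfying this schema must supply every field in the `required` list, and the `self == oldSelf` validation makes the object immutable after creation (updates require delete-and-recreate). A sketch of a conforming Storage resource, with invented names and paths:

# Sketch: a Storage custom resource conforming to the CRD above.
# All names/paths are invented. Requires `pip install pyyaml`.
import yaml

storage = {
    'apiVersion': 'xpk.x-k8s.io/v1',
    'kind': 'Storage',
    'metadata': {'name': 'shared-fs'},
    'spec': {
        'type': 'gcsfuse',            # free-form string per the schema
        'cluster': 'demo-a4-cluster',
        'auto_mount': True,
        'mount_point': '/mnt/shared',
        'readonly': False,
        'manifest': 'storage-manifest.yaml',
        'pv': 'shared-fs-pv',
        'pvc': 'shared-fs-pvc',
    },
}
print(yaml.safe_dump(storage, sort_keys=False))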
25 changes: 16 additions & 9 deletions src/xpk/commands/batch.py
@@ -14,18 +14,26 @@
 limitations under the License.
 """

+import re
 from argparse import Namespace

-from ..core.cluster import create_xpk_k8s_service_account
+from ..core.cluster import (
+    create_xpk_k8s_service_account,
+    get_cluster_credentials,
+)
 from ..core.commands import run_command_for_value
 from ..core.gcloud_context import add_zone_and_project
+from ..core.kjob import (
+    AppProfileDefaults,
+    JobTemplateDefaults,
+    Kueue_TAS_annotation,
+    get_storage_annotations,
+    prepare_kjob,
+)
 from ..core.kueue import LOCAL_QUEUE_NAME
 from ..utils.console import xpk_exit, xpk_print
-from .common import set_cluster_command
-from ..core.kjob import AppProfileDefaults, JobTemplateDefaults, prepare_kjob, Kueue_TAS_annotation, get_storage_annotations
-from .kjob_common import add_gpu_networking_annotations_to_command
 from .kind import set_local_cluster_command
-import re
+from .kjob_common import add_gpu_networking_annotations_to_command


 def batch(args: Namespace) -> None:
@@ -38,12 +38,11 @@ def batch(args: Namespace) -> None:
   """
   if not args.kind_cluster:
     add_zone_and_project(args)
-    set_cluster_command_code = set_cluster_command(args)
+    get_cluster_credentials(args)
   else:
     set_cluster_command_code = set_local_cluster_command(args)
-
-  if set_cluster_command_code != 0:
-    xpk_exit(set_cluster_command_code)
+    if set_cluster_command_code != 0:
+      xpk_exit(set_cluster_command_code)

   err_code = prepare_kjob(args)
   if err_code > 0:
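The substantive change is swapping `set_cluster_command` for `get_cluster_credentials`, which the commit log says was moved into a core module. Its body is not shown in this diff; a plausible reading (an assumption, not the actual xpk source) is that it wraps the usual gcloud call and exits on failure, which is why the explicit error-code check disappears from the non-kind branch:

# Assumed sketch of get_cluster_credentials -- NOT the actual xpk source,
# which this diff does not include. Guessing it wraps
# `gcloud container clusters get-credentials` and exits on failure.
import subprocess
import sys


def get_cluster_credentials(args) -> None:
  """Fetch kubeconfig entries for args.cluster (assumed behavior)."""
  result = subprocess.run([
      'gcloud', 'container', 'clusters', 'get-credentials', args.cluster,
      f'--zone={args.zone}', f'--project={args.project}',
  ])
  if result.returncode != 0:
    print('Failed to fetch cluster credentials', file=sys.stderr)
    sys.exit(result.returncode)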
27 changes: 22 additions & 5 deletions src/xpk/commands/cluster_gcluster.py
@@ -16,26 +16,27 @@

 import os

-from ..core.remote_state.remote_state_client import RemoteStateClient
-from ..core.remote_state.fuse_remote_state import FuseStateClient
 from ..core.blueprint.blueprint_generator import (
     BlueprintGenerator,
     BlueprintGeneratorOutput,
     a3mega_device_type,
     a3ultra_device_type,
+    a4_device_type,
     supported_device_types,
 )
-from ..core.commands import run_command_for_value
 from ..core.capacity import get_capacity_type
+from ..core.cluster import get_cluster_credentials
+from ..core.commands import run_command_for_value
 from ..core.docker_manager import DockerManager
 from ..core.gcloud_context import zone_to_region
 from ..core.gcluster_manager import GclusterManager
+from ..core.kjob import apply_kjob_crds, prepare_kjob
+from ..core.remote_state.fuse_remote_state import FuseStateClient
+from ..core.remote_state.remote_state_client import RemoteStateClient
 from ..utils.console import xpk_exit, xpk_print
 from ..utils.file import ensure_directory_exists
 from ..utils.network import all_IPs_cidr
 from ..utils.objects import hash_string
-from ..core.cluster import get_cluster_credentials
-from ..core.kjob import apply_kjob_crds, prepare_kjob

 blueprints_path = os.path.abspath('xpkclusters/blueprints')
 gcluster_working_dir = os.path.abspath('xpkclusters/gcluster-out')
@@ -266,4 +267,20 @@ def generate_blueprint(
         system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
         gcs_bucket=args.cluster_state_gcs_bucket,
     )
+  if args.device_type == a4_device_type:
+    num_nodes = args.num_nodes if args.num_nodes is not None else 2
+    return bpg.generate_a4_blueprint(
+        blueprint_name=blueprint_name,
+        prefix=prefix,
+        cluster_name=args.cluster,
+        region=zone_to_region(args.zone),
+        project_id=args.project,
+        zone=args.zone,
+        auth_cidr=all_IPs_cidr,
+        num_nodes=num_nodes,
+        reservation=args.reservation if args.reservation else None,
+        capacity_type=capacity_type,
+        system_node_pool_machine_type=args.default_pool_cpu_machine_type,
+        system_node_pool_min_node_count=args.default_pool_cpu_num_nodes,
+    )
   return None
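One detail worth flagging: unlike the a3ultra branch above it, the A4 branch defaults `num_nodes` to 2 when the flag is omitted and passes no `gcs_bucket`. The A4 device-type string itself never appears in this diff; the `b200-8` key in the A4 ConfigMap template suggests it, but that is an inference. A toy demonstration of the defaulting:

# Toy sketch of the A4 branch's num_nodes fallback. The 'b200-8'
# device-type value is inferred from the ConfigMap template, not this diff.
from argparse import Namespace

for args in (Namespace(num_nodes=None), Namespace(num_nodes=16)):
  num_nodes = args.num_nodes if args.num_nodes is not None else 2
  print(args, '->', num_nodes)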
6 changes: 2 additions & 4 deletions src/xpk/commands/info.py
@@ -20,10 +20,10 @@

 from tabulate import tabulate

 from ..core.commands import run_command_for_value
+from ..core.cluster import get_cluster_credentials
 from ..core.gcloud_context import add_zone_and_project
 from ..core.kueue import verify_kueuectl
 from ..utils.console import xpk_exit, xpk_print
-from .common import set_cluster_command

 table_fmt = 'plain'

@@ -37,9 +37,7 @@ def info(args: Namespace) -> None:
     None
   """
   add_zone_and_project(args)
-  set_cluster_command_code = set_cluster_command(args)
-  if set_cluster_command_code != 0:
-    xpk_exit(set_cluster_command_code)
+  get_cluster_credentials(args)

   verify_kueuectl(args)
   lq, cq = bool(args.localqueue), bool(args.clusterqueue)