
Commit 18b91f7

Merge pull request #4 from openshift-psap/yaml-gen
Added YAML Generation Script/Template
2 parents 12c2327 + e328cc9

File tree

2 files changed: +329 additions, -0 deletions
Lines changed: 220 additions & 0 deletions
@@ -0,0 +1,220 @@
apiVersion: mcad.ibm.com/v1beta1
kind: AppWrapper
metadata:
  name: aw-kuberay-glue
  namespace: default
spec:
  priority: 9
  resources:
    Items: []
    GenericItems:
    - replicas: 1
      custompodresources:
      - replicas: 4
        requests:
          cpu: 2
          memory: 12G
          nvidia.com/gpu: 1
        limits:
          cpu: 2
          memory: 12G
          nvidia.com/gpu: 1
      generictemplate:
        # This config demonstrates KubeRay's Ray autoscaler integration.
        # The resource requests and limits in this config are too small for production!
        # For an example with more realistic resource configuration, see
        # ray-cluster.autoscaler.large.yaml.
        apiVersion: ray.io/v1alpha1
        kind: RayCluster
        metadata:
          labels:
            appwrapper.mcad.ibm.com: "aw-kuberay-glue"
            controller-tools.k8s.io: "1.0"
          # A unique identifier for the head node and workers of this cluster.
          name: glue-cluster
          # finalizers:
          # - kubernetes
        spec:
          # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
          rayVersion: '1.12.0'
          # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
          # Ray autoscaler integration is supported only for Ray versions >= 1.11.0
          # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
          enableInTreeAutoscaling: false
          # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
          # The example configuration shown below represents the DEFAULT values.
          # (You may delete autoscalerOptions if the defaults are suitable.)
          autoscalerOptions:
            # upscalingMode is "Default" or "Aggressive."
            # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
            # Default: Upscaling is not rate-limited.
            # Aggressive: An alias for Default; upscaling is not rate-limited.
            upscalingMode: Default
            # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
            idleTimeoutSeconds: 60
            # image optionally overrides the autoscaler's container image.
            # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
            # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
            ## image: "my-repo/my-custom-autoscaler-image:tag"
            # imagePullPolicy optionally overrides the autoscaler container's image pull policy.
            imagePullPolicy: Always
            # resources specifies optional resource request and limit overrides for the autoscaler container.
            # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
            resources:
              limits:
                cpu: "500m"
                memory: "512Mi"
              requests:
                cpu: "500m"
                memory: "512Mi"
          ######################headGroupSpec#################################
          # head group template and specs, (perhaps 'group' is not needed in the name)
          headGroupSpec:
            # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
            serviceType: ClusterIP
            # logical group name, for this called head-group, also can be functional
            # pod type head or worker
            # rayNodeType: head # Not needed since it is under the headgroup
            # the following params are used to complete the ray start: ray start --head --block ...
            rayStartParams:
              # Flag "no-monitor" will be automatically set when autoscaling is enabled.
              dashboard-host: '0.0.0.0'
              block: 'true'
              # num-cpus: '1' # can be auto-completed from the limits
              # Use `resources` to optionally specify custom resource annotations for the Ray node.
              # The value of `resources` is a string-integer mapping.
              # Currently, `resources` must be provided in the specific format demonstrated below:
              # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
              num-gpus: '0'
            #pod template
            template:
              spec:
                affinity:
                  nodeAffinity:
                    requiredDuringSchedulingIgnoredDuringExecution:
                      nodeSelectorTerms:
                      - matchExpressions:
                        - key: role
                          operator: In
                          values:
                          - "aw-kuberay-glue"
                containers:
                # The Ray head pod
                - name: ray-head
                  image: asm582/codeflare-tl-aws:latest
                  env:
                  - name: AWS_ACCESS_KEY_ID
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: AWS_ACCESS_KEY_ID
                  - name: AWS_SECRET_ACCESS_KEY
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: AWS_SECRET_ACCESS_KEY
                  - name: ENDPOINT_URL
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: ENDPOINT_URL
                  imagePullPolicy: Always
                  ports:
                  - containerPort: 6379
                    name: gcs
                  - containerPort: 8265
                    name: dashboard
                  - containerPort: 10001
                    name: client
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
                  resources:
                    limits:
                      cpu: "2"
                      memory: "12G"
                      nvidia.com/gpu: "0"
                    requests:
                      cpu: "2"
                      memory: "12G"
                      nvidia.com/gpu: "0"
          workerGroupSpecs:
          # the pod replicas in this group typed worker
          - replicas: 3
            minReplicas: 3
            maxReplicas: 3
            # logical group name, for this called small-group, also can be functional
            groupName: small-group
            # if worker pods need to be added, we can simply increment the replicas
            # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
            # the operator will remove pods from the list until the number of replicas is satisfied
            # when a pod is confirmed to be deleted, its name will be removed from the list below
            #scaleStrategy:
            #  workersToDelete:
            #  - raycluster-complete-worker-small-group-bdtwh
            #  - raycluster-complete-worker-small-group-hv457
            #  - raycluster-complete-worker-small-group-k8tj7
            # the following params are used to complete the ray start: ray start --block ...
            rayStartParams:
              block: 'true'
              num-gpus: '1'
            #pod template
            template:
              metadata:
                labels:
                  key: value
                # annotations for pod
                annotations:
                  key: value
                # finalizers:
                # - kubernetes
              spec:
                affinity:
                  nodeAffinity:
                    requiredDuringSchedulingIgnoredDuringExecution:
                      nodeSelectorTerms:
                      - matchExpressions:
                        - key: role
                          operator: In
                          values:
                          - "aw-kuberay-glue"
                initContainers:
                # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
                - name: init-myservice
                  image: busybox:1.28
                  command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
                containers:
                - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
                  image: asm582/codeflare-tl-aws:latest
                  env:
                  - name: AWS_ACCESS_KEY_ID
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: AWS_ACCESS_KEY_ID
                  - name: AWS_SECRET_ACCESS_KEY
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: AWS_SECRET_ACCESS_KEY
                  - name: ENDPOINT_URL
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: ENDPOINT_URL
                  # environment variables to set in the container. Optional.
                  # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
                  resources:
                    limits:
                      cpu: "2"
                      memory: "12G"
                      nvidia.com/gpu: "1"
                    requests:
                      cpu: "2"
                      memory: "12G"
                      nvidia.com/gpu: "1"
Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
import yaml
import sys
import argparse
import uuid

def readTemplate(template):
    with open(template, "r") as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)

def gen_names():
    gen_id = str(uuid.uuid4())
    appwrapper_name = "appwrapper-" + gen_id
    cluster_name = "cluster-" + gen_id
    return appwrapper_name, cluster_name

def update_names(yaml, item, appwrapper_name, cluster_name):
    metadata = yaml.get("metadata")
    metadata["name"] = appwrapper_name
    lower_meta = item.get("generictemplate", {}).get("metadata")
    lower_meta["labels"]["appwrapper.mcad.ibm.com"] = appwrapper_name
    lower_meta["name"] = cluster_name

def updateCustompodresources(item, cpu, memory, gpu, workers):
    if 'custompodresources' in item.keys():
        custompodresources = item.get('custompodresources')
        for resource in custompodresources:
            for k, v in resource.items():
                if k == "replicas":
                    resource[k] = workers
                if k == "requests" or k == "limits":
                    for spec, _ in v.items():
                        if spec == "cpu":
                            resource[k][spec] = cpu
                        if spec == "memory":
                            resource[k][spec] = str(memory) + "G"
                        if spec == "nvidia.com/gpu":
                            resource[k][spec] = gpu
    else:
        sys.exit("Error: malformed template")

def update_affinity(spec, appwrapper_name):
    node_selector_terms = spec.get("affinity").get("nodeAffinity").get("requiredDuringSchedulingIgnoredDuringExecution").get("nodeSelectorTerms")
    node_selector_terms[0]["matchExpressions"][0]["values"][0] = appwrapper_name

def update_resources(spec, cpu, memory, gpu):
    container = spec.get("containers")
    for resource in container:
        requests = resource.get('resources').get('requests')
        if requests is not None:
            requests["cpu"] = cpu
            requests["memory"] = str(memory) + "G"
            requests["nvidia.com/gpu"] = gpu
        limits = resource.get('resources').get('limits')
        if limits is not None:
            limits["cpu"] = cpu
            limits["memory"] = str(memory) + "G"
            limits["nvidia.com/gpu"] = gpu

def update_nodes(item, appwrapper_name, cpu, memory, gpu, workers):
    if "generictemplate" in item.keys():
        head = item.get("generictemplate").get("spec").get("headGroupSpec")
        worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0]

        # Head counts as first worker
        worker["replicas"] = workers - 1
        worker["minReplicas"] = workers - 1
        worker["maxReplicas"] = workers - 1

        for comp in [head, worker]:
            spec = comp.get("template").get("spec")
            update_affinity(spec, appwrapper_name)
            update_resources(spec, cpu, memory, gpu)

def generateAppwrapper(cpu, memory, gpu, workers, template):
    user_yaml = readTemplate(template)
    appwrapper_name, cluster_name = gen_names()
    resources = user_yaml.get("spec", "resources")
    item = resources["resources"].get("GenericItems")[0]
    update_names(user_yaml, item, appwrapper_name, cluster_name)
    updateCustompodresources(item, cpu, memory, gpu, workers)
    update_nodes(item, appwrapper_name, cpu, memory, gpu, workers)
    writeUserAppwrapper(user_yaml, appwrapper_name)

def writeUserAppwrapper(user_yaml, appwrapper_name):
    with open(f'{appwrapper_name}.yaml', 'w') as outfile:
        yaml.dump(user_yaml, outfile, default_flow_style=False)

def main():
    parser = argparse.ArgumentParser(description='Generate user AppWrapper')
    parser.add_argument("--cpu", type=int, required=True, help="number of CPU(s) in a worker required for running job")
    parser.add_argument("--memory", required=True, help="RAM required in a worker for running job")
    parser.add_argument("--gpu", type=int, required=True, help="GPU(s) required in a worker for running job")
    parser.add_argument("--workers", type=int, required=True, help="How many workers are required in the cluster")
    parser.add_argument("--template", required=True, help="Template AppWrapper yaml file")

    args = parser.parse_args()
    cpu = args.cpu
    memory = args.memory
    gpu = args.gpu
    workers = args.workers
    template = args.template

    generateAppwrapper(cpu, memory, gpu, workers, template)

if __name__ == "__main__":
    main()
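
For reference, a minimal sketch of how the generator might be driven programmatically rather than through the --cpu/--memory/--gpu/--workers/--template command-line flags above. The module name generate_appwrapper and the template filename aw-kuberay-glue.template.yaml are assumptions for illustration, not part of this commit.

# Hypothetical usage sketch: the module name and template filename are
# assumptions, not part of this commit.
from generate_appwrapper import generateAppwrapper

# Writes appwrapper-<uuid>.yaml to the current directory: the AppWrapper and
# RayCluster names are regenerated, worker replicas are set to workers - 1
# (the head counts as the first worker), and the head/worker requests and
# limits are set to the given cpu, memory (in G), and gpu values.
generateAppwrapper(cpu=2, memory=12, gpu=1, workers=5,
                   template="aw-kuberay-glue.template.yaml")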
