Skip to content

Commit

Permalink
Fix creation of aw/job with same name in different namespaces
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristianZaccaria committed Nov 14, 2023
1 parent bbb6871 commit 0693e5d
Show file tree
Hide file tree
Showing 35 changed files with 168 additions and 142 deletions.
2 changes: 1 addition & 1 deletion doc/usage/tutorial.md
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ Delete the first `AppWrapper` job.

```bash
$ kubectl delete -f aw-01.yaml
appwrapper.workload.codeflare.dev "0001-aw-generic-deployment-1" deleted
workload.codeflare.dev/appwrapper "0001-aw-generic-deployment-1" deleted
```

Check the pods status of the `AppWrapper` jobs. The new pods from the second `AppWrapper` job: `0002-aw-generic-deployment-2` job should now be deployed and running.
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ require (
k8s.io/client-go v0.26.2
k8s.io/klog/v2 v2.90.1
k8s.io/metrics v0.26.2
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5
sigs.k8s.io/custom-metrics-apiserver v0.0.0
sigs.k8s.io/structured-merge-diff/v4 v4.2.3
)

replace sigs.k8s.io/custom-metrics-apiserver => sigs.k8s.io/custom-metrics-apiserver v1.25.1-0.20230306170449-63d8c93851f3
Expand Down Expand Up @@ -107,9 +109,7 @@ require (
k8s.io/component-base v0.26.2 // indirect
k8s.io/kms v0.26.2 // indirect
k8s.io/kube-openapi v0.0.0-20230303024457-afdc3dddf62d // indirect
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5 // indirect
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.35 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
)
2 changes: 1 addition & 1 deletion pkg/apis/controller/v1beta1/appwrapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ const AppWrapperPlural string = "appwrappers"

// AppWrapperAnnotationKey is the annotation key of Pod to identify
// which AppWrapper it belongs to.
const AppWrapperAnnotationKey = "appwrapper.mcad.ibm.com/appwrapper-name"
const AppWrapperAnnotationKey = "workload.codeflare.dev/appwrapper-name"

// +genclient
// +kubebuilder:object:root=true
Expand Down
4 changes: 2 additions & 2 deletions pkg/controller/queuejob/queuejob_controller_ex.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ func GetQueueJobKey(obj interface{}) (string, error) {
// UpdateQueueJobStatus was part of pod informer, this is now a method of queuejob_controller file.
// This change is done in an effort to simplify the controller and enable to move to controller runtime.
func (qjm *XController) UpdateQueueJobStatus(queuejob *arbv1.AppWrapper) error {
labelSelector := fmt.Sprintf("%s=%s", "appwrapper.mcad.ibm.com", queuejob.Name)
labelSelector := fmt.Sprintf("%s=%s", "workload.codeflare.dev/appwrapper", queuejob.Name)
pods, errt := qjm.clients.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{LabelSelector: labelSelector})
if errt != nil {
return errt
Expand Down Expand Up @@ -208,7 +208,7 @@ func (qjm *XController) allocatableCapacity() *clusterstateapi.Resource {
klog.Errorf("[allocatableCapacity] Error listing pods %v", err)
}
for _, pod := range podList.Items {
if _, ok := pod.GetLabels()["appwrappers.mcad.ibm.com"]; !ok && pod.Status.Phase != v1.PodFailed && pod.Status.Phase != v1.PodSucceeded {
if _, ok := pod.GetLabels()["workload.codeflare.dev/appwrappers"]; !ok && pod.Status.Phase != v1.PodFailed && pod.Status.Phase != v1.PodSucceeded {
for _, container := range pod.Spec.Containers {
usedResource := clusterstateapi.NewResource(container.Resources.Requests)
capacity.Sub(usedResource)
Expand Down
55 changes: 36 additions & 19 deletions pkg/controller/queuejobresources/genericresource/genericresource.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"encoding/json"
"fmt"
"math"
"math/rand"
"runtime/debug"
"strings"
"time"
Expand All @@ -43,8 +44,9 @@ import (
"k8s.io/client-go/restmapper"
)

var appwrapperJobName = "appwrapper.mcad.ibm.com"
var resourceName = "resourceName"
var appwrapperJobLabelName = "workload.codeflare.dev/appwrapper"
var appwrapperJobLabelNamespace = "workload.codeflare.dev/appwrapper-namespace"
var resourceName = "workload.codeflare.dev/resourceName"
var appWrapperKind = arbv1.SchemeGroupVersion.WithKind("AppWrapper")

type GenericResources struct {
Expand Down Expand Up @@ -72,6 +74,18 @@ func join(strs ...string) string {
return result
}


func GetRandomString(n int) string {
var letters = []rune("abcdefghijklmnopqrstuvwxyz0123456789")

rand.Seed(time.Now().UnixNano())
b := make([]rune, n)
for i := range b {
b[i] = letters[rand.Intn(len(letters))]
}
return string(b)
}

func (gr *GenericResources) Cleanup(aw *arbv1.AppWrapper, awr *arbv1.AppWrapperGenericResource) (genericResourceName string, groupversionkind *schema.GroupVersionKind, erro error) {
var err error
err = nil
Expand Down Expand Up @@ -166,7 +180,7 @@ func (gr *GenericResources) Cleanup(aw *arbv1.AppWrapper, awr *arbv1.AppWrapperG
}

// Get the resource to see if it exists in the AppWrapper namespace
labelSelector := fmt.Sprintf("%s=%s, %s=%s", appwrapperJobName, aw.Name, resourceName, unstruct.GetName())
labelSelector := fmt.Sprintf("%s=%s, %s=%s, %s=%s", appwrapperJobLabelName, aw.Name, appwrapperJobLabelNamespace, aw.Namespace, resourceName, unstruct.GetName())
inEtcd, err := dclient.Resource(rsrc).Namespace(aw.Namespace).List(context.Background(), metav1.ListOptions{LabelSelector: labelSelector})
if err != nil {
return name, gvk, err
Expand All @@ -175,9 +189,10 @@ func (gr *GenericResources) Cleanup(aw *arbv1.AppWrapper, awr *arbv1.AppWrapperG
// Check to see if object already exists in etcd, if not, create the object.
if inEtcd != nil || len(inEtcd.Items) > 0 {
newName := name
if len(newName) > 63 {
newName = newName[:63]
if len(newName) > 60 {
newName = newName[:60]
}
newName += GetRandomString(3)

err = deleteObject(namespaced, namespace, newName, rsrc, dclient)
if err != nil {
Expand All @@ -187,7 +202,7 @@ func (gr *GenericResources) Cleanup(aw *arbv1.AppWrapper, awr *arbv1.AppWrapperG
return name, gvk, err
}
} else {
klog.Warningf("[Cleanup] %s/%s not found using label selector: %s.\n", name, namespace, labelSelector)
klog.Warningf("[Cleanup] %s/%s not found using label selector: %s.\n", namespace, name, labelSelector)
}

return name, gvk, err
Expand Down Expand Up @@ -297,18 +312,19 @@ func (gr *GenericResources) SyncQueueJob(aw *arbv1.AppWrapper, awr *arbv1.AppWra
} else {
labels = unstruct.GetLabels()
}
labels[appwrapperJobName] = aw.Name
labels[appwrapperJobLabelName] = aw.Name
labels[appwrapperJobLabelNamespace] = aw.Namespace
labels[resourceName] = unstruct.GetName()
unstruct.SetLabels(labels)

// Add labels to pod template if one exists.
podTemplateFound := addLabelsToPodTemplateField(&unstruct, labels)
if !podTemplateFound {
klog.V(4).Infof("[SyncQueueJob] No pod template spec exists for resource: %s to add labels.", name)
klog.V(4).Infof("[SyncQueueJob] No pod template spec exists for resource: %s/%s to add labels.", namespace, name)
}

// Get the resource to see if it exists
labelSelector := fmt.Sprintf("%s=%s, %s=%s", appwrapperJobName, aw.Name, resourceName, unstruct.GetName())
// Get the resource to see if it exists
labelSelector := fmt.Sprintf("%s=%s, %s=%s, %s=%s", appwrapperJobLabelName, aw.Name, appwrapperJobLabelNamespace, aw.Namespace, resourceName, unstruct.GetName())
inEtcd, err := dclient.Resource(rsrc).List(context.Background(), metav1.ListOptions{LabelSelector: labelSelector})
if err != nil {
return []*v1.Pod{}, err
Expand All @@ -317,9 +333,10 @@ func (gr *GenericResources) SyncQueueJob(aw *arbv1.AppWrapper, awr *arbv1.AppWra
// Check to see if object already exists in etcd, if not, create the object.
if inEtcd == nil || len(inEtcd.Items) < 1 {
newName := name
if len(newName) > 63 {
newName = newName[:63]
if len(newName) > 60 {
newName = newName[:60]
}
newName += GetRandomString(3)
unstruct.SetName(newName)
//Asumption object is always namespaced
//Refer to comment on line 238
Expand All @@ -329,7 +346,7 @@ func (gr *GenericResources) SyncQueueJob(aw *arbv1.AppWrapper, awr *arbv1.AppWra
if errors.IsAlreadyExists(err) {
klog.V(4).Infof("%v\n", err.Error())
} else {
klog.Errorf("Error creating the object `%v`, the error is `%v`", newName, errors.ReasonForError(err))
klog.Errorf("Error creating the object `%s/%s`, the error is `%v`", namespace, newName, errors.ReasonForError(err))
return []*v1.Pod{}, err
}
}
Expand Down Expand Up @@ -499,7 +516,7 @@ func deleteObject(namespaced bool, namespace string, name string, rsrc schema.Gr
}

if err != nil && !errors.IsNotFound(err) {
klog.Errorf("[deleteObject] Error deleting the object `%v`, the error is `%v`.", name, errors.ReasonForError(err))
klog.Errorf("[deleteObject] Error deleting the object `%v`, in namespace %v, the error is `%v`.", name, namespace, errors.ReasonForError(err))
return err
} else {
klog.V(4).Infof("[deleteObject] Resource `%v` deleted.\n", name)
Expand Down Expand Up @@ -531,7 +548,7 @@ func GetListOfPodResourcesFromOneGenericItem(awr *arbv1.AppWrapperGenericResourc
klog.V(8).Infof("[GetListOfPodResourcesFromOneGenericItem] Requested total allocation resource from 1 pod `%v`.\n", podTotalresource)
}

// Addd individual pods to results
// Add individual pods to results
var replicaCount int = int(replicas)
for i := 0; i < replicaCount; i++ {
podResourcesList = append(podResourcesList, podTotalresource)
Expand Down Expand Up @@ -623,7 +640,7 @@ func getContainerResources(container v1.Container, replicas float64) *clustersta
}

// returns status of an item present in etcd
func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResource, namespace string, appwrapperName string, genericItemName string) (completed bool) {
func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResource, appwrapperNamespace string, appwrapperName string, genericItemName string) (completed bool) {
dd := gr.clients.Discovery()
apigroups, err := restmapper.GetAPIGroupResources(dd)
if err != nil {
Expand Down Expand Up @@ -654,8 +671,8 @@ func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResourc
return false
}

labelSelector := fmt.Sprintf("%s=%s", appwrapperJobName, appwrapperName)
inEtcd, err := dclient.Resource(rsrc).Namespace(namespace).List(context.Background(), metav1.ListOptions{LabelSelector: labelSelector})
labelSelector := fmt.Sprintf("%s=%s, %s=%s", appwrapperJobLabelName, appwrapperName, appwrapperJobLabelNamespace, appwrapperNamespace)
inEtcd, err := dclient.Resource(rsrc).Namespace(appwrapperNamespace).List(context.Background(), metav1.ListOptions{LabelSelector: labelSelector})
if err != nil {
klog.Errorf("[IsItemCompleted] Error listing object: %v", err)
return false
Expand All @@ -675,7 +692,7 @@ func (gr *GenericResources) IsItemCompleted(awgr *arbv1.AppWrapperGenericResourc
}
}
if !validAwOwnerRef {
klog.Warningf("[IsItemCompleted] Item owner name %v does match appwrappper name %v in namespace %v", unstructuredObjectName, appwrapperName, namespace)
klog.Warningf("[IsItemCompleted] Item owner name %v does match appwrappper name %v in namespace %v", unstructuredObjectName, appwrapperName, appwrapperNamespace)
continue
}

Expand Down
4 changes: 2 additions & 2 deletions test/e2e-kuttl-borrowing/steps/03-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ spec:
name: my-job-1
namespace: test
labels:
appwrapper.mcad.ibm.com: my-job-1
workload.codeflare.dev/appwrapper: my-job-1
spec:
parallelism: 1
completions: 1
Expand All @@ -40,7 +40,7 @@ spec:
name: my-job-1
namespace: test
labels:
appwrapper.mcad.ibm.com: my-job-1
workload.codeflare.dev/appwrapper: my-job-1
spec:
terminationGracePeriodSeconds: 1
restartPolicy: Never
Expand Down
4 changes: 2 additions & 2 deletions test/e2e-kuttl-borrowing/steps/04-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ spec:
name: my-job-2
namespace: test
labels:
appwrapper.mcad.ibm.com: my-job-2
workload.codeflare.dev/appwrapper: my-job-2
spec:
parallelism: 1
completions: 1
Expand All @@ -40,7 +40,7 @@ spec:
name: my-job-2
namespace: test
labels:
appwrapper.mcad.ibm.com: my-job-2
workload.codeflare.dev/appwrapper: my-job-2
spec:
terminationGracePeriodSeconds: 1
restartPolicy: Never
Expand Down
5 changes: 3 additions & 2 deletions test/e2e-kuttl-deployment-01/steps/01-assert.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ metadata:
namespace: start-up
labels:
app: no-quota-deployment-01
appwrapper.mcad.ibm.com: no-quota-deployment-01
resourceName: no-quota-deployment-01
workload.codeflare.dev/appwrapper: no-quota-deployment-01
workload.codeflare.dev/appwrapper-namespace: start-up
workload.codeflare.dev/resourceName: no-quota-deployment-01
status:
availableReplicas: 1
observedGeneration: 1
Expand Down
4 changes: 2 additions & 2 deletions test/e2e-kuttl-deployment-01/steps/02-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ spec:
name: no-quota-job-02
namespace: start-up
labels:
appwrapper.mcad.ibm.com: no-quota-job-02
workload.codeflare.dev/appwrapper: no-quota-job-02
spec:
parallelism: 1
completions: 1
Expand All @@ -36,7 +36,7 @@ spec:
name: no-quota-job-1
namespace: start-up
labels:
appwrapper.mcad.ibm.com: no-quota-job-02
workload.codeflare.dev/appwrapper: no-quota-job-02
spec:
terminationGracePeriodSeconds: 1
restartPolicy: Never
Expand Down
35 changes: 19 additions & 16 deletions test/e2e-kuttl-deployment-01/steps/03-assert.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,31 @@ metadata:
name: hold-completion-job-03-01
namespace: start-up
labels:
appwrapper.mcad.ibm.com: hold-completion-job-03
resourceName: hold-completion-job-03-01
workload.codeflare.dev/appwrapper: hold-completion-job-03
workload.codeflare.dev/appwrapper-namespace: start-up
workload.codeflare.dev/resourceName: hold-completion-job-03-01
status:
conditions:
- status: "True"
type: Complete
succeeded: 1
---
apiVersion: v1
kind: Pod
metadata:
apiVersion: v1
kind: Pod
metadata:
namespace: start-up
labels:
appwrapper.mcad.ibm.com: hold-completion-job-03
job-name: hold-completion-job-03-01
resourceName: hold-completion-job-03-01
labels:
workload.codeflare.dev/appwrapper: hold-completion-job-03
workload.codeflare.dev/appwrapper-namespace: start-up
job-name: hold-completion-job-03-01
workload.codeflare.dev/resourceName: hold-completion-job-03-01
---
apiVersion: v1
kind: Pod
metadata:
apiVersion: v1
kind: Pod
metadata:
namespace: start-up
labels:
appwrapper.mcad.ibm.com: hold-completion-job-03
job-name: hold-completion-job-03-02
resourceName: hold-completion-job-03-02
labels:
workload.codeflare.dev/appwrapper: hold-completion-job-03
workload.codeflare.dev/appwrapper-namespace: start-up
job-name: hold-completion-job-03-02
workload.codeflare.dev/resourceName: hold-completion-job-03-02
8 changes: 4 additions & 4 deletions test/e2e-kuttl-deployment-01/steps/03-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ spec:
name: hold-completion-job-03-01
namespace: start-up
labels:
appwrapper.mcad.ibm.com: hold-completion-job-03
workload.codeflare.dev/appwrapper: hold-completion-job-03
spec:
parallelism: 1
completions: 1
Expand All @@ -40,7 +40,7 @@ spec:
name: hold-completion-job-03-01
namespace: start-up
labels:
appwrapper.mcad.ibm.com: hold-completion-job-03
workload.codeflare.dev/appwrapper: hold-completion-job-03
spec:
terminationGracePeriodSeconds: 1
restartPolicy: Never
Expand Down Expand Up @@ -80,7 +80,7 @@ spec:
name: hold-completion-job-03-02
namespace: start-up
labels:
appwrapper.mcad.ibm.com: hold-completion-job-03
workload.codeflare.dev/appwrapper: hold-completion-job-03
spec:
parallelism: 1
completions: 1
Expand All @@ -89,7 +89,7 @@ spec:
name: hold-completion-job-03-02
namespace: start-up
labels:
appwrapper.mcad.ibm.com: hold-completion-job-03
workload.codeflare.dev/appwrapper: hold-completion-job-03
spec:
terminationGracePeriodSeconds: 1
restartPolicy: Never
Expand Down
4 changes: 2 additions & 2 deletions test/e2e-kuttl-deployment-01/steps/06-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ spec:
name: no-quota-job-06
namespace: start-up
labels:
appwrapper.mcad.ibm.com: no-quota-job-06
workload.codeflare.dev/appwrapper: no-quota-job-06
spec:
parallelism: 1
completions: 1
Expand All @@ -40,7 +40,7 @@ spec:
name: no-quota-job-06
namespace: start-up
labels:
appwrapper.mcad.ibm.com: no-quota-job-06
workload.codeflare.dev/appwrapper: no-quota-job-06
spec:
terminationGracePeriodSeconds: 1
restartPolicy: Never
Expand Down
Loading

0 comments on commit 0693e5d

Please sign in to comment.