Skip to content

Commit

Permalink
fix: Ignore pods awaiting garbage collection during topology calculat… (
Browse files Browse the repository at this point in the history
  • Loading branch information
ellistarn authored Oct 27, 2023
1 parent fb39cb4 commit 14dda61
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 4 deletions.
4 changes: 0 additions & 4 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Save tag as an environment variable
run: |
tag=$(git describe --tags --exact-match)
echo "TAG=$tag" >> $GITHUB_ENV
- name: Create Github Release
uses: "marvinpinto/action-automatic-releases@latest"
with:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,21 @@ var _ = Describe("Topology", func() {
ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(2))
})

It("should ignore pods if node does not exist", func() {
topology := []v1.TopologySpreadConstraint{{
TopologyKey: v1.LabelTopologyZone,
WhenUnsatisfiable: v1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: labels},
MaxSkew: 1,
}}
podAwaitingGC := test.Pod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology, NodeName: "does-not-exist"})
ExpectApplied(ctx, env.Client, provisioner, podAwaitingGC)
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov,
test.UnschedulablePods(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Labels: labels}, TopologySpreadConstraints: topology}, 4)...,
)
ExpectSkew(ctx, env.Client, "default", &topology[0]).To(ConsistOf(1, 1, 2))
})

Context("Zonal", func() {
It("should balance pods across zones (match labels)", func() {
topology := []v1.TopologySpreadConstraint{{
Expand Down
12 changes: 12 additions & 0 deletions pkg/controllers/provisioning/scheduling/topology.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ import (
"github.com/aws/karpenter-core/pkg/scheduling"
"github.com/aws/karpenter-core/pkg/utils/functional"

"k8s.io/apimachinery/pkg/api/errors"

"go.uber.org/multierr"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -229,6 +231,8 @@ func (t *Topology) updateInverseAntiAffinity(ctx context.Context, pod *v1.Pod, d

// countDomains initializes the topology group by registereding any well known domains and performing pod counts
// against the cluster for any existing pods.
//
//nolint:gocyclo
func (t *Topology) countDomains(ctx context.Context, tg *TopologyGroup) error {
podList := &v1.PodList{}

Expand All @@ -252,6 +256,14 @@ func (t *Topology) countDomains(ctx context.Context, tg *TopologyGroup) error {
}
node := &v1.Node{}
if err := t.kubeClient.Get(ctx, types.NamespacedName{Name: p.Spec.NodeName}, node); err != nil {
// Pods that cannot be evicted can be leaked in the API Server after
// a Node is removed. Since pod bindings are immutable, these pods
// cannot be recovered, and will be deleted by the pod lifecycle
// garbage collector. These pods are not running, and should not
// impact future topology calculations.
if errors.IsNotFound(err) {
continue
}
return fmt.Errorf("getting node %s, %w", p.Spec.NodeName, err)
}
domain, ok := node.Labels[tg.Key]
Expand Down

0 comments on commit 14dda61

Please sign in to comment.