Skip to content

Commit 3678f44

Browse files
committed
Alternate approach to solving aws#8482, based on ideas from moko-poi in PR aws#8547
1 parent 1c6fbc5 commit 3678f44

File tree

6 files changed

+55
-70
lines changed

6 files changed

+55
-70
lines changed

pkg/cache/suite_test.go

Lines changed: 3 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ import (
1919
"testing"
2020

2121
ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
22-
"github.com/samber/lo"
2322
karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1"
2423

2524
. "github.com/onsi/ginkgo/v2"
@@ -50,58 +49,17 @@ var _ = Describe("Cache", func() {
5049
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse())
5150

5251
// m5.large on-demand should return that it's unavailable when we mark it
53-
unavailableOfferingCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)
54-
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue())
55-
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse())
56-
57-
// m5.xlarge shouldn't return that it's unavailable when marking an unrelated instance type
58-
unavailableOfferingCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1b", karpv1.CapacityTypeOnDemand)
59-
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue())
60-
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse())
61-
62-
// m5.xlarge spot should return that it's unavailable when we mark it
63-
unavailableOfferingCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)
64-
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue())
65-
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeTrue())
66-
})
67-
It("should mark offerings as unavailable when calling MarkUnavailableForFleetErr", func() {
68-
// offerings should initially not be marked as unavailable
69-
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeFalse())
70-
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse())
71-
72-
// m5.large on-demand should return that it's unavailable when we mark it
73-
unavailableOfferingCache.MarkUnavailableForFleetErr(ctx, ec2types.CreateFleetError{
74-
LaunchTemplateAndOverrides: &ec2types.LaunchTemplateAndOverridesResponse{
75-
Overrides: &ec2types.FleetLaunchTemplateOverrides{
76-
InstanceType: ec2types.InstanceTypeM5Large,
77-
AvailabilityZone: lo.ToPtr("test-zone-1a"),
78-
},
79-
},
80-
}, karpv1.CapacityTypeOnDemand)
52+
unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)
8153
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue())
8254
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse())
8355

8456
// m5.xlarge shouldn't return that it's unavailable when marking an unrelated instance type
85-
unavailableOfferingCache.MarkUnavailableForFleetErr(ctx, ec2types.CreateFleetError{
86-
LaunchTemplateAndOverrides: &ec2types.LaunchTemplateAndOverridesResponse{
87-
Overrides: &ec2types.FleetLaunchTemplateOverrides{
88-
InstanceType: ec2types.InstanceTypeM5Large,
89-
AvailabilityZone: lo.ToPtr("test-zone-1b"),
90-
},
91-
},
92-
}, karpv1.CapacityTypeOnDemand)
57+
unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1b", karpv1.CapacityTypeOnDemand)
9358
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue())
9459
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse())
9560

9661
// m5.xlarge spot should return that it's unavailable when we mark it
97-
unavailableOfferingCache.MarkUnavailableForFleetErr(ctx, ec2types.CreateFleetError{
98-
LaunchTemplateAndOverrides: &ec2types.LaunchTemplateAndOverridesResponse{
99-
Overrides: &ec2types.FleetLaunchTemplateOverrides{
100-
InstanceType: ec2types.InstanceTypeM5Xlarge,
101-
AvailabilityZone: lo.ToPtr("test-zone-1b"),
102-
},
103-
},
104-
}, karpv1.CapacityTypeSpot)
62+
unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)
10563
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue())
10664
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeTrue())
10765
})

pkg/cache/unavailableofferings.go

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,9 @@ import (
2121
"sync"
2222
"sync/atomic"
2323

24-
"github.com/aws/aws-sdk-go-v2/aws"
2524
ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
26-
"github.com/samber/lo"
2725

2826
"github.com/patrickmn/go-cache"
29-
"sigs.k8s.io/controller-runtime/pkg/log"
3027
)
3128

3229
// UnavailableOfferings stores any offerings that return ICE (insufficient capacity errors) when
@@ -90,26 +87,34 @@ func (u *UnavailableOfferings) IsUnavailable(instanceType ec2types.InstanceType,
9087
}
9188

9289
// MarkUnavailable communicates recently observed temporary capacity shortages in the provided offerings
93-
func (u *UnavailableOfferings) MarkUnavailable(ctx context.Context, unavailableReason string, instanceType ec2types.InstanceType, zone, capacityType string) {
90+
func (u *UnavailableOfferings) MarkUnavailable(ctx context.Context, instanceType ec2types.InstanceType, zone, capacityType string) {
9491
// even if the key is already in the cache, we still need to call Set to extend the cached entry's TTL
95-
log.FromContext(ctx).WithValues(
96-
"reason", unavailableReason,
97-
"instance-type", instanceType,
98-
"zone", zone,
99-
"capacity-type", capacityType,
100-
"ttl", UnavailableOfferingsTTL).V(1).Info("removing offering from offerings")
92+
/*
93+
logValues := []interface{}{
94+
"instance-type", instanceType,
95+
"zone", zone,
96+
"capacity-type", capacityType,
97+
"ttl", UnavailableOfferingsTTL,
98+
}
99+
100+
// Add "reason" and "fleet-id" if provided
101+
unavailableKeys := []string{"reason", "fleet-id"}
102+
for _, key := range unavailableKeys {
103+
_, ok := unavailableReason[key]
104+
if ok {
105+
logValues = append(logValues, key, unavailableReason[key])
106+
}
107+
108+
}
109+
110+
log.FromContext(ctx).WithValues(logValues...).V(1).Info("removing offering from offerings")
111+
*/
101112
u.offeringCache.SetDefault(u.key(instanceType, zone, capacityType), struct{}{})
102113
u.offeringCacheSeqNumMu.Lock()
103114
u.offeringCacheSeqNum[instanceType]++
104115
u.offeringCacheSeqNumMu.Unlock()
105116
}
106117

107-
func (u *UnavailableOfferings) MarkUnavailableForFleetErr(ctx context.Context, fleetErr ec2types.CreateFleetError, capacityType string) {
108-
instanceType := fleetErr.LaunchTemplateAndOverrides.Overrides.InstanceType
109-
zone := aws.ToString(fleetErr.LaunchTemplateAndOverrides.Overrides.AvailabilityZone)
110-
u.MarkUnavailable(ctx, lo.FromPtr(fleetErr.ErrorCode), instanceType, zone, capacityType)
111-
}
112-
113118
func (u *UnavailableOfferings) MarkCapacityTypeUnavailable(capacityType string) {
114119
u.capacityTypeCache.SetDefault(capacityType, struct{}{})
115120
u.capacityTypeCacheSeqNum.Add(1)

pkg/controllers/interruption/controller.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,14 @@ func (c *Controller) handleNodeClaim(ctx context.Context, msg messages.Message,
220220
zone := nodeClaim.Labels[corev1.LabelTopologyZone]
221221
instanceType := nodeClaim.Labels[corev1.LabelInstanceTypeStable]
222222
if zone != "" && instanceType != "" {
223-
c.unavailableOfferingsCache.MarkUnavailable(ctx, string(msg.Kind()), ec2types.InstanceType(instanceType), zone, karpv1.CapacityTypeSpot)
223+
log.FromContext(ctx).WithValues(
224+
"reason", string(msg.Kind()),
225+
"instance-type", instanceType,
226+
"zone", zone,
227+
"capacity-type", karpv1.CapacityTypeSpot,
228+
"ttl", cache.UnavailableOfferingsTTL,
229+
).V(1).Info("removing offering from offerings")
230+
c.unavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceType(instanceType), zone, karpv1.CapacityTypeSpot)
224231
}
225232
}
226233
if action != NoAction {

pkg/providers/amifamily/bootstrap/bottlerocketsettings.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@ type BottlerocketKubernetes struct {
8686
SingleProcessOOMKill *bool `toml:"single-process-oom-kill,omitempty"`
8787
ContainerLogMaxWorkers *int `toml:"container-log-max-workers,omitempty"`
8888
ContainerLogMonitorInterval *string `toml:"container-log-monitor-interval,omitempty"`
89+
HostnameOverrideSource *string `toml:"hostname-override-source,omitempty"`
90+
VerbosityLevel *uint32 `toml:"log-level,omitempty"`
8991
}
9092
type BottlerocketStaticPod struct {
9193
Enabled *bool `toml:"enabled,omitempty"`

pkg/providers/instance/instance.go

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ func (p *DefaultProvider) launchInstance(
350350
}
351351
return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("creating fleet request, %w", err), reason, fmt.Sprintf("Error creating fleet request: %s", message))
352352
}
353-
p.updateUnavailableOfferingsCache(ctx, createFleetOutput.Errors, capacityType, nodeClaim, instanceTypes)
353+
p.updateUnavailableOfferingsCache(ctx, createFleetOutput.Errors, capacityType, nodeClaim, instanceTypes, aws.ToString(createFleetOutput.FleetId))
354354
if len(createFleetOutput.Instances) == 0 || len(createFleetOutput.Instances[0].InstanceIds) == 0 {
355355
requestID, _ := awsmiddleware.GetRequestIDMetadata(createFleetOutput.ResultMetadata)
356356
return ec2types.CreateFleetInstance{}, serrors.Wrap(
@@ -481,6 +481,7 @@ func (p *DefaultProvider) updateUnavailableOfferingsCache(
481481
capacityType string,
482482
nodeClaim *karpv1.NodeClaim,
483483
instanceTypes []*cloudprovider.InstanceType,
484+
fleetID string,
484485
) {
485486
for _, err := range errs {
486487
zone := lo.FromPtr(err.LaunchTemplateAndOverrides.Overrides.AvailabilityZone)
@@ -492,10 +493,22 @@ func (p *DefaultProvider) updateUnavailableOfferingsCache(
492493
if capacityType != karpv1.CapacityTypeReserved {
493494
for _, err := range errs {
494495
if awserrors.IsUnfulfillableCapacity(err) {
495-
p.unavailableOfferings.MarkUnavailableForFleetErr(ctx, err, capacityType)
496+
instanceType := err.LaunchTemplateAndOverrides.Overrides.InstanceType
497+
zone := aws.ToString(err.LaunchTemplateAndOverrides.Overrides.AvailabilityZone)
498+
reason := lo.FromPtr(err.ErrorCode)
499+
log.FromContext(ctx).WithValues(
500+
"reason", reason,
501+
"instance-type", instanceType,
502+
"zone", zone,
503+
"capacity-type", karpv1.CapacityTypeSpot,
504+
"ttl", awscache.UnavailableOfferingsTTL,
505+
"fleet-id", fleetID,
506+
).V(1).Info("removing offering from offerings")
507+
p.unavailableOfferings.MarkUnavailable(ctx, instanceType, zone, capacityType)
496508
}
497509
if awserrors.IsServiceLinkedRoleCreationNotPermitted(err) {
498510
p.unavailableOfferings.MarkCapacityTypeUnavailable(karpv1.CapacityTypeSpot)
511+
499512
p.recorder.Publish(SpotServiceLinkedRoleCreationFailure(nodeClaim))
500513
}
501514
}

pkg/providers/instancetype/suite_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2279,8 +2279,8 @@ var _ = Describe("InstanceTypeProvider", func() {
22792279
Expect(m5InstanceType.Offerings.Available()).To(HaveLen(6))
22802280

22812281
// Mark spot m5.xlarge instance as unavailable in a few zones, nothing should change
2282-
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Xlarge, "test-zone-1a", karpv1.CapacityTypeSpot)
2283-
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)
2282+
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Xlarge, "test-zone-1a", karpv1.CapacityTypeSpot)
2283+
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)
22842284
Expect(err).ToNot(HaveOccurred())
22852285
m5InstanceType, ok = lo.Find(instanceTypes, func(it *corecloudprovider.InstanceType) bool {
22862286
return it.Name == string(ec2types.InstanceTypeM5Large)
@@ -2289,7 +2289,7 @@ var _ = Describe("InstanceTypeProvider", func() {
22892289
Expect(m5InstanceType.Offerings.Available()).To(HaveLen(6))
22902290

22912291
// Mark spot m5.large instance in test-zone-1a as unavailable
2292-
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeSpot)
2292+
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeSpot)
22932293
instanceTypes, err = cloudProvider.GetInstanceTypes(ctx, nodePool)
22942294
Expect(err).ToNot(HaveOccurred())
22952295
m5InstanceType, ok = lo.Find(instanceTypes, func(it *corecloudprovider.InstanceType) bool {
@@ -2303,8 +2303,8 @@ var _ = Describe("InstanceTypeProvider", func() {
23032303
}))[0].Available).To(BeFalse())
23042304

23052305
// Mark on-demand m5.large instance in test-zone-1b and test-zone-1c as unavailable
2306-
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1b", karpv1.CapacityTypeOnDemand)
2307-
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1c", karpv1.CapacityTypeOnDemand)
2306+
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1b", karpv1.CapacityTypeOnDemand)
2307+
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1c", karpv1.CapacityTypeOnDemand)
23082308

23092309
instanceTypes, err = cloudProvider.GetInstanceTypes(ctx, nodePool)
23102310
Expect(err).ToNot(HaveOccurred())
@@ -2795,7 +2795,7 @@ var _ = Describe("InstanceTypeProvider", func() {
27952795
list1, err := cloudProvider.GetInstanceTypes(ctx, nodePool)
27962796
Expect(err).ToNot(HaveOccurred())
27972797

2798-
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", "m5.xlarge", "test-zone-1a", karpv1.CapacityTypeSpot)
2798+
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "m5.xlarge", "test-zone-1a", karpv1.CapacityTypeSpot)
27992799
list2, err := cloudProvider.GetInstanceTypes(ctx, nodePool)
28002800
Expect(err).ToNot(HaveOccurred())
28012801

0 commit comments

Comments
 (0)