Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 15 additions & 31 deletions pkg/cache/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (
"testing"

ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
"github.com/samber/lo"
karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1"

. "github.com/onsi/ginkgo/v2"
Expand Down Expand Up @@ -50,58 +49,43 @@ var _ = Describe("Cache", func() {
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse())

// m5.large on-demand should return that it's unavailable when we mark it
unavailableOfferingCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)
unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "test"})
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue())
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse())

// m5.xlarge shouldn't return that it's unavailable when marking an unrelated instance type
unavailableOfferingCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1b", karpv1.CapacityTypeOnDemand)
unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1b", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "test"})
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue())
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse())

// m5.xlarge spot should return that it's unavailable when we mark it
unavailableOfferingCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)
unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot, map[string]string{"reason": "test"})
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue())
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeTrue())
})
It("should mark offerings as unavailable when calling MarkUnavailableForFleetErr", func() {
It("should mark offerings as unavailable with fleet error reasons", func() {
// offerings should initially not be marked as unavailable
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeFalse())
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse())

// m5.large on-demand should return that it's unavailable when we mark it
unavailableOfferingCache.MarkUnavailableForFleetErr(ctx, ec2types.CreateFleetError{
LaunchTemplateAndOverrides: &ec2types.LaunchTemplateAndOverridesResponse{
Overrides: &ec2types.FleetLaunchTemplateOverrides{
InstanceType: ec2types.InstanceTypeM5Large,
AvailabilityZone: lo.ToPtr("test-zone-1a"),
},
},
}, karpv1.CapacityTypeOnDemand)
unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand, map[string]string{
"reason": "InsufficientInstanceCapacity",
})
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue())
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse())

// m5.xlarge shouldn't return that it's unavailable when marking an unrelated instance type
unavailableOfferingCache.MarkUnavailableForFleetErr(ctx, ec2types.CreateFleetError{
LaunchTemplateAndOverrides: &ec2types.LaunchTemplateAndOverridesResponse{
Overrides: &ec2types.FleetLaunchTemplateOverrides{
InstanceType: ec2types.InstanceTypeM5Large,
AvailabilityZone: lo.ToPtr("test-zone-1b"),
},
},
}, karpv1.CapacityTypeOnDemand)
unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1b", karpv1.CapacityTypeOnDemand, map[string]string{
"reason": "InsufficientInstanceCapacity",
})
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue())
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse())

// m5.xlarge spot should return that it's unavailable when we mark it
unavailableOfferingCache.MarkUnavailableForFleetErr(ctx, ec2types.CreateFleetError{
LaunchTemplateAndOverrides: &ec2types.LaunchTemplateAndOverridesResponse{
Overrides: &ec2types.FleetLaunchTemplateOverrides{
InstanceType: ec2types.InstanceTypeM5Xlarge,
AvailabilityZone: lo.ToPtr("test-zone-1b"),
},
},
}, karpv1.CapacityTypeSpot)
unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot, map[string]string{
"reason": "InsufficientInstanceCapacity",
})
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue())
Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeTrue())
})
Expand Down Expand Up @@ -146,12 +130,12 @@ var _ = Describe("Cache", func() {
Expect(unavailableOfferingCache.SeqNum(ec2types.InstanceTypeM5Xlarge)).To(BeNumerically("==", 0))

// marking m5.large as unavailable should increase the sequence number for that instance type but not others
unavailableOfferingCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)
unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "test"})
Expect(unavailableOfferingCache.SeqNum(ec2types.InstanceTypeM5Large)).To(BeNumerically("==", 1))
Expect(unavailableOfferingCache.SeqNum(ec2types.InstanceTypeM5Xlarge)).To(BeNumerically("==", 0))

// marking m5.xlarge as unavailable should increase the sequence number for that instance type but not others
unavailableOfferingCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Xlarge, "test-zone-1a", karpv1.CapacityTypeOnDemand)
unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Xlarge, "test-zone-1a", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "test"})
Expect(unavailableOfferingCache.SeqNum(ec2types.InstanceTypeM5Large)).To(BeNumerically("==", 1))
Expect(unavailableOfferingCache.SeqNum(ec2types.InstanceTypeM5Xlarge)).To(BeNumerically("==", 1))

Expand Down
26 changes: 13 additions & 13 deletions pkg/cache/unavailableofferings.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,10 @@ import (
"sync"
"sync/atomic"

"github.com/aws/aws-sdk-go-v2/aws"
ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
"github.com/samber/lo"
"sigs.k8s.io/controller-runtime/pkg/log"

"github.com/patrickmn/go-cache"
"sigs.k8s.io/controller-runtime/pkg/log"
)

// UnavailableOfferings stores any offerings that return ICE (insufficient capacity errors) when
Expand Down Expand Up @@ -90,26 +88,28 @@ func (u *UnavailableOfferings) IsUnavailable(instanceType ec2types.InstanceType,
}

// MarkUnavailable communicates recently observed temporary capacity shortages in the provided offerings
func (u *UnavailableOfferings) MarkUnavailable(ctx context.Context, unavailableReason string, instanceType ec2types.InstanceType, zone, capacityType string) {
func (u *UnavailableOfferings) MarkUnavailable(ctx context.Context, instanceType ec2types.InstanceType, zone, capacityType string, unavailableReason map[string]string) {
// even if the key is already in the cache, we still need to call Set to extend the cached entry's TTL
log.FromContext(ctx).WithValues(
"reason", unavailableReason,
logValues := []interface{}{
"reason", unavailableReason["reason"],
"instance-type", instanceType,
"zone", zone,
"capacity-type", capacityType,
"ttl", UnavailableOfferingsTTL).V(1).Info("removing offering from offerings")
"ttl", UnavailableOfferingsTTL,
}
// Add fleetID if provided
key := "fleet-id"
_, ok := unavailableReason[key]
if ok {
logValues = append(logValues, key, unavailableReason[key])
}
log.FromContext(ctx).WithValues(logValues...).V(1).Info("removing offering from offerings")
u.offeringCache.SetDefault(u.key(instanceType, zone, capacityType), struct{}{})
u.offeringCacheSeqNumMu.Lock()
u.offeringCacheSeqNum[instanceType]++
u.offeringCacheSeqNumMu.Unlock()
}

func (u *UnavailableOfferings) MarkUnavailableForFleetErr(ctx context.Context, fleetErr ec2types.CreateFleetError, capacityType string) {
instanceType := fleetErr.LaunchTemplateAndOverrides.Overrides.InstanceType
zone := aws.ToString(fleetErr.LaunchTemplateAndOverrides.Overrides.AvailabilityZone)
u.MarkUnavailable(ctx, lo.FromPtr(fleetErr.ErrorCode), instanceType, zone, capacityType)
}

func (u *UnavailableOfferings) MarkCapacityTypeUnavailable(capacityType string) {
u.capacityTypeCache.SetDefault(capacityType, struct{}{})
u.capacityTypeCacheSeqNum.Add(1)
Expand Down
5 changes: 4 additions & 1 deletion pkg/controllers/interruption/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,10 @@ func (c *Controller) handleNodeClaim(ctx context.Context, msg messages.Message,
zone := nodeClaim.Labels[corev1.LabelTopologyZone]
instanceType := nodeClaim.Labels[corev1.LabelInstanceTypeStable]
if zone != "" && instanceType != "" {
c.unavailableOfferingsCache.MarkUnavailable(ctx, string(msg.Kind()), ec2types.InstanceType(instanceType), zone, karpv1.CapacityTypeSpot)
unavailableReason := map[string]string{
"reason": string(msg.Kind()),
}
c.unavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceType(instanceType), zone, karpv1.CapacityTypeSpot, unavailableReason)
}
}
if action != NoAction {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ type BottlerocketKubernetes struct {
SingleProcessOOMKill *bool `toml:"single-process-oom-kill,omitempty"`
ContainerLogMaxWorkers *int `toml:"container-log-max-workers,omitempty"`
ContainerLogMonitorInterval *string `toml:"container-log-monitor-interval,omitempty"`
HostnameOverrideSource *string `toml:"hostname-override-source,omitempty"`
VerbosityLevel *uint32 `toml:"log-level,omitempty"`
}
type BottlerocketStaticPod struct {
Expand Down
14 changes: 12 additions & 2 deletions pkg/providers/instance/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ func (p *DefaultProvider) launchInstance(
}
return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("creating fleet request, %w", err), reason, fmt.Sprintf("Error creating fleet request: %s", message))
}
p.updateUnavailableOfferingsCache(ctx, createFleetOutput.Errors, capacityType, nodeClaim, instanceTypes)
p.updateUnavailableOfferingsCache(ctx, createFleetOutput.Errors, capacityType, nodeClaim, instanceTypes, aws.ToString(createFleetOutput.FleetId))
if len(createFleetOutput.Instances) == 0 || len(createFleetOutput.Instances[0].InstanceIds) == 0 {
requestID, _ := awsmiddleware.GetRequestIDMetadata(createFleetOutput.ResultMetadata)
return ec2types.CreateFleetInstance{}, serrors.Wrap(
Expand Down Expand Up @@ -481,6 +481,7 @@ func (p *DefaultProvider) updateUnavailableOfferingsCache(
capacityType string,
nodeClaim *karpv1.NodeClaim,
instanceTypes []*cloudprovider.InstanceType,
fleetID string,
) {
for _, err := range errs {
zone := lo.FromPtr(err.LaunchTemplateAndOverrides.Overrides.AvailabilityZone)
Expand All @@ -492,10 +493,19 @@ func (p *DefaultProvider) updateUnavailableOfferingsCache(
if capacityType != karpv1.CapacityTypeReserved {
for _, err := range errs {
if awserrors.IsUnfulfillableCapacity(err) {
p.unavailableOfferings.MarkUnavailableForFleetErr(ctx, err, capacityType)
instanceType := err.LaunchTemplateAndOverrides.Overrides.InstanceType
zone := aws.ToString(err.LaunchTemplateAndOverrides.Overrides.AvailabilityZone)
unavailableReason := map[string]string{
"reason": lo.FromPtr(err.ErrorCode),
}
if fleetID != "" {
unavailableReason["fleet-id"] = fleetID
}
p.unavailableOfferings.MarkUnavailable(ctx, instanceType, zone, capacityType, unavailableReason)
}
if awserrors.IsServiceLinkedRoleCreationNotPermitted(err) {
p.unavailableOfferings.MarkCapacityTypeUnavailable(karpv1.CapacityTypeSpot)

p.recorder.Publish(SpotServiceLinkedRoleCreationFailure(nodeClaim))
}
}
Expand Down
12 changes: 6 additions & 6 deletions pkg/providers/instancetype/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2279,8 +2279,8 @@ var _ = Describe("InstanceTypeProvider", func() {
Expect(m5InstanceType.Offerings.Available()).To(HaveLen(6))

// Mark spot m5.xlarge instance as unavailable in a few zones, nothing should change
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Xlarge, "test-zone-1a", karpv1.CapacityTypeSpot)
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Xlarge, "test-zone-1a", karpv1.CapacityTypeSpot, map[string]string{"reason": "test"})
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot, map[string]string{"reason": "test"})
Expect(err).ToNot(HaveOccurred())
m5InstanceType, ok = lo.Find(instanceTypes, func(it *corecloudprovider.InstanceType) bool {
return it.Name == string(ec2types.InstanceTypeM5Large)
Expand All @@ -2289,7 +2289,7 @@ var _ = Describe("InstanceTypeProvider", func() {
Expect(m5InstanceType.Offerings.Available()).To(HaveLen(6))

// Mark spot m5.large instance in test-zone-1a as unavailable
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeSpot)
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeSpot, map[string]string{"reason": "test"})
instanceTypes, err = cloudProvider.GetInstanceTypes(ctx, nodePool)
Expect(err).ToNot(HaveOccurred())
m5InstanceType, ok = lo.Find(instanceTypes, func(it *corecloudprovider.InstanceType) bool {
Expand All @@ -2303,8 +2303,8 @@ var _ = Describe("InstanceTypeProvider", func() {
}))[0].Available).To(BeFalse())

// Mark on-demand m5.large instance in test-zone-1b and test-zone-1c as unavailable
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1b", karpv1.CapacityTypeOnDemand)
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1c", karpv1.CapacityTypeOnDemand)
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1b", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "test"})
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1c", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "test"})

instanceTypes, err = cloudProvider.GetInstanceTypes(ctx, nodePool)
Expect(err).ToNot(HaveOccurred())
Expand Down Expand Up @@ -2795,7 +2795,7 @@ var _ = Describe("InstanceTypeProvider", func() {
list1, err := cloudProvider.GetInstanceTypes(ctx, nodePool)
Expect(err).ToNot(HaveOccurred())

awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", "m5.xlarge", "test-zone-1a", karpv1.CapacityTypeSpot)
awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "m5.xlarge", "test-zone-1a", karpv1.CapacityTypeSpot, map[string]string{"reason": "test"})
list2, err := cloudProvider.GetInstanceTypes(ctx, nodePool)
Expect(err).ToNot(HaveOccurred())

Expand Down