diff --git a/pkg/cache/suite_test.go b/pkg/cache/suite_test.go index ec9797e7687d..8351700e7ed9 100644 --- a/pkg/cache/suite_test.go +++ b/pkg/cache/suite_test.go @@ -19,7 +19,6 @@ import ( "testing" ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" - "github.com/samber/lo" karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" . "github.com/onsi/ginkgo/v2" @@ -50,58 +49,43 @@ var _ = Describe("Cache", func() { Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse()) // m5.large on-demand should return that it's unavailable when we mark it - unavailableOfferingCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand) + unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "test"}) Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue()) Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse()) // m5.xlarge shouldn't return that it's unavailable when marking an unrelated instance type - unavailableOfferingCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1b", karpv1.CapacityTypeOnDemand) + unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1b", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "test"}) Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue()) Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse()) // m5.xlarge spot should return that it's unavailable when we mark it - unavailableOfferingCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot) + unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot, map[string]string{"reason": "test"}) Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue()) Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeTrue()) }) - It("should mark offerings as unavailable when calling MarkUnavailableForFleetErr", func() { + It("should mark offerings as unavailable with fleet error reasons", func() { // offerings should initially not be marked as unavailable Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeFalse()) Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse()) // m5.large on-demand should return that it's unavailable when we mark it - unavailableOfferingCache.MarkUnavailableForFleetErr(ctx, ec2types.CreateFleetError{ - LaunchTemplateAndOverrides: &ec2types.LaunchTemplateAndOverridesResponse{ - Overrides: &ec2types.FleetLaunchTemplateOverrides{ - InstanceType: ec2types.InstanceTypeM5Large, - AvailabilityZone: lo.ToPtr("test-zone-1a"), - }, - }, - }, karpv1.CapacityTypeOnDemand) + unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand, map[string]string{ + "reason": "InsufficientInstanceCapacity", + }) Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue()) Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse()) // m5.xlarge shouldn't return that it's unavailable when marking an unrelated instance type - unavailableOfferingCache.MarkUnavailableForFleetErr(ctx, ec2types.CreateFleetError{ - LaunchTemplateAndOverrides: &ec2types.LaunchTemplateAndOverridesResponse{ - Overrides: &ec2types.FleetLaunchTemplateOverrides{ - InstanceType: ec2types.InstanceTypeM5Large, - AvailabilityZone: lo.ToPtr("test-zone-1b"), - }, - }, - }, karpv1.CapacityTypeOnDemand) + unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1b", karpv1.CapacityTypeOnDemand, map[string]string{ + "reason": "InsufficientInstanceCapacity", + }) Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue()) Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeFalse()) // m5.xlarge spot should return that it's unavailable when we mark it - unavailableOfferingCache.MarkUnavailableForFleetErr(ctx, ec2types.CreateFleetError{ - LaunchTemplateAndOverrides: &ec2types.LaunchTemplateAndOverridesResponse{ - Overrides: &ec2types.FleetLaunchTemplateOverrides{ - InstanceType: ec2types.InstanceTypeM5Xlarge, - AvailabilityZone: lo.ToPtr("test-zone-1b"), - }, - }, - }, karpv1.CapacityTypeSpot) + unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot, map[string]string{ + "reason": "InsufficientInstanceCapacity", + }) Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand)).To(BeTrue()) Expect(unavailableOfferingCache.IsUnavailable(ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot)).To(BeTrue()) }) @@ -146,12 +130,12 @@ var _ = Describe("Cache", func() { Expect(unavailableOfferingCache.SeqNum(ec2types.InstanceTypeM5Xlarge)).To(BeNumerically("==", 0)) // marking m5.large as unavailable should increase the sequence number for that instance type but not others - unavailableOfferingCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand) + unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "test"}) Expect(unavailableOfferingCache.SeqNum(ec2types.InstanceTypeM5Large)).To(BeNumerically("==", 1)) Expect(unavailableOfferingCache.SeqNum(ec2types.InstanceTypeM5Xlarge)).To(BeNumerically("==", 0)) // marking m5.xlarge as unavailable should increase the sequence number for that instance type but not others - unavailableOfferingCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Xlarge, "test-zone-1a", karpv1.CapacityTypeOnDemand) + unavailableOfferingCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Xlarge, "test-zone-1a", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "test"}) Expect(unavailableOfferingCache.SeqNum(ec2types.InstanceTypeM5Large)).To(BeNumerically("==", 1)) Expect(unavailableOfferingCache.SeqNum(ec2types.InstanceTypeM5Xlarge)).To(BeNumerically("==", 1)) diff --git a/pkg/cache/unavailableofferings.go b/pkg/cache/unavailableofferings.go index fcf9141b435c..2f78caaca183 100644 --- a/pkg/cache/unavailableofferings.go +++ b/pkg/cache/unavailableofferings.go @@ -21,12 +21,10 @@ import ( "sync" "sync/atomic" - "github.com/aws/aws-sdk-go-v2/aws" ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" - "github.com/samber/lo" + "sigs.k8s.io/controller-runtime/pkg/log" "github.com/patrickmn/go-cache" - "sigs.k8s.io/controller-runtime/pkg/log" ) // UnavailableOfferings stores any offerings that return ICE (insufficient capacity errors) when @@ -90,26 +88,28 @@ func (u *UnavailableOfferings) IsUnavailable(instanceType ec2types.InstanceType, } // MarkUnavailable communicates recently observed temporary capacity shortages in the provided offerings -func (u *UnavailableOfferings) MarkUnavailable(ctx context.Context, unavailableReason string, instanceType ec2types.InstanceType, zone, capacityType string) { +func (u *UnavailableOfferings) MarkUnavailable(ctx context.Context, instanceType ec2types.InstanceType, zone, capacityType string, unavailableReason map[string]string) { // even if the key is already in the cache, we still need to call Set to extend the cached entry's TTL - log.FromContext(ctx).WithValues( - "reason", unavailableReason, + logValues := []interface{}{ + "reason", unavailableReason["reason"], "instance-type", instanceType, "zone", zone, "capacity-type", capacityType, - "ttl", UnavailableOfferingsTTL).V(1).Info("removing offering from offerings") + "ttl", UnavailableOfferingsTTL, + } + // Add fleetID if provided + key := "fleet-id" + _, ok := unavailableReason[key] + if ok { + logValues = append(logValues, key, unavailableReason[key]) + } + log.FromContext(ctx).WithValues(logValues...).V(1).Info("removing offering from offerings") u.offeringCache.SetDefault(u.key(instanceType, zone, capacityType), struct{}{}) u.offeringCacheSeqNumMu.Lock() u.offeringCacheSeqNum[instanceType]++ u.offeringCacheSeqNumMu.Unlock() } -func (u *UnavailableOfferings) MarkUnavailableForFleetErr(ctx context.Context, fleetErr ec2types.CreateFleetError, capacityType string) { - instanceType := fleetErr.LaunchTemplateAndOverrides.Overrides.InstanceType - zone := aws.ToString(fleetErr.LaunchTemplateAndOverrides.Overrides.AvailabilityZone) - u.MarkUnavailable(ctx, lo.FromPtr(fleetErr.ErrorCode), instanceType, zone, capacityType) -} - func (u *UnavailableOfferings) MarkCapacityTypeUnavailable(capacityType string) { u.capacityTypeCache.SetDefault(capacityType, struct{}{}) u.capacityTypeCacheSeqNum.Add(1) diff --git a/pkg/controllers/interruption/controller.go b/pkg/controllers/interruption/controller.go index 97439baa2ad8..e2f937d57b62 100644 --- a/pkg/controllers/interruption/controller.go +++ b/pkg/controllers/interruption/controller.go @@ -220,7 +220,10 @@ func (c *Controller) handleNodeClaim(ctx context.Context, msg messages.Message, zone := nodeClaim.Labels[corev1.LabelTopologyZone] instanceType := nodeClaim.Labels[corev1.LabelInstanceTypeStable] if zone != "" && instanceType != "" { - c.unavailableOfferingsCache.MarkUnavailable(ctx, string(msg.Kind()), ec2types.InstanceType(instanceType), zone, karpv1.CapacityTypeSpot) + unavailableReason := map[string]string{ + "reason": string(msg.Kind()), + } + c.unavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceType(instanceType), zone, karpv1.CapacityTypeSpot, unavailableReason) } } if action != NoAction { diff --git a/pkg/providers/amifamily/bootstrap/bottlerocketsettings.go b/pkg/providers/amifamily/bootstrap/bottlerocketsettings.go index a790aedbb2bb..501fb07bf1b5 100644 --- a/pkg/providers/amifamily/bootstrap/bottlerocketsettings.go +++ b/pkg/providers/amifamily/bootstrap/bottlerocketsettings.go @@ -86,6 +86,7 @@ type BottlerocketKubernetes struct { SingleProcessOOMKill *bool `toml:"single-process-oom-kill,omitempty"` ContainerLogMaxWorkers *int `toml:"container-log-max-workers,omitempty"` ContainerLogMonitorInterval *string `toml:"container-log-monitor-interval,omitempty"` + HostnameOverrideSource *string `toml:"hostname-override-source,omitempty"` VerbosityLevel *uint32 `toml:"log-level,omitempty"` } type BottlerocketStaticPod struct { diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index d93275484fbf..a7747468f306 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -350,7 +350,7 @@ func (p *DefaultProvider) launchInstance( } return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("creating fleet request, %w", err), reason, fmt.Sprintf("Error creating fleet request: %s", message)) } - p.updateUnavailableOfferingsCache(ctx, createFleetOutput.Errors, capacityType, nodeClaim, instanceTypes) + p.updateUnavailableOfferingsCache(ctx, createFleetOutput.Errors, capacityType, nodeClaim, instanceTypes, aws.ToString(createFleetOutput.FleetId)) if len(createFleetOutput.Instances) == 0 || len(createFleetOutput.Instances[0].InstanceIds) == 0 { requestID, _ := awsmiddleware.GetRequestIDMetadata(createFleetOutput.ResultMetadata) return ec2types.CreateFleetInstance{}, serrors.Wrap( @@ -481,6 +481,7 @@ func (p *DefaultProvider) updateUnavailableOfferingsCache( capacityType string, nodeClaim *karpv1.NodeClaim, instanceTypes []*cloudprovider.InstanceType, + fleetID string, ) { for _, err := range errs { zone := lo.FromPtr(err.LaunchTemplateAndOverrides.Overrides.AvailabilityZone) @@ -492,10 +493,19 @@ func (p *DefaultProvider) updateUnavailableOfferingsCache( if capacityType != karpv1.CapacityTypeReserved { for _, err := range errs { if awserrors.IsUnfulfillableCapacity(err) { - p.unavailableOfferings.MarkUnavailableForFleetErr(ctx, err, capacityType) + instanceType := err.LaunchTemplateAndOverrides.Overrides.InstanceType + zone := aws.ToString(err.LaunchTemplateAndOverrides.Overrides.AvailabilityZone) + unavailableReason := map[string]string{ + "reason": lo.FromPtr(err.ErrorCode), + } + if fleetID != "" { + unavailableReason["fleet-id"] = fleetID + } + p.unavailableOfferings.MarkUnavailable(ctx, instanceType, zone, capacityType, unavailableReason) } if awserrors.IsServiceLinkedRoleCreationNotPermitted(err) { p.unavailableOfferings.MarkCapacityTypeUnavailable(karpv1.CapacityTypeSpot) + p.recorder.Publish(SpotServiceLinkedRoleCreationFailure(nodeClaim)) } } diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index eac01380a7a2..fe53ff49f251 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -2279,8 +2279,8 @@ var _ = Describe("InstanceTypeProvider", func() { Expect(m5InstanceType.Offerings.Available()).To(HaveLen(6)) // Mark spot m5.xlarge instance as unavailable in a few zones, nothing should change - awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Xlarge, "test-zone-1a", karpv1.CapacityTypeSpot) - awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot) + awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Xlarge, "test-zone-1a", karpv1.CapacityTypeSpot, map[string]string{"reason": "test"}) + awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Xlarge, "test-zone-1b", karpv1.CapacityTypeSpot, map[string]string{"reason": "test"}) Expect(err).ToNot(HaveOccurred()) m5InstanceType, ok = lo.Find(instanceTypes, func(it *corecloudprovider.InstanceType) bool { return it.Name == string(ec2types.InstanceTypeM5Large) @@ -2289,7 +2289,7 @@ var _ = Describe("InstanceTypeProvider", func() { Expect(m5InstanceType.Offerings.Available()).To(HaveLen(6)) // Mark spot m5.large instance in test-zone-1a as unavailable - awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeSpot) + awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1a", karpv1.CapacityTypeSpot, map[string]string{"reason": "test"}) instanceTypes, err = cloudProvider.GetInstanceTypes(ctx, nodePool) Expect(err).ToNot(HaveOccurred()) m5InstanceType, ok = lo.Find(instanceTypes, func(it *corecloudprovider.InstanceType) bool { @@ -2303,8 +2303,8 @@ var _ = Describe("InstanceTypeProvider", func() { }))[0].Available).To(BeFalse()) // Mark on-demand m5.large instance in test-zone-1b and test-zone-1c as unavailable - awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1b", karpv1.CapacityTypeOnDemand) - awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", ec2types.InstanceTypeM5Large, "test-zone-1c", karpv1.CapacityTypeOnDemand) + awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1b", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "test"}) + awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, ec2types.InstanceTypeM5Large, "test-zone-1c", karpv1.CapacityTypeOnDemand, map[string]string{"reason": "test"}) instanceTypes, err = cloudProvider.GetInstanceTypes(ctx, nodePool) Expect(err).ToNot(HaveOccurred()) @@ -2795,7 +2795,7 @@ var _ = Describe("InstanceTypeProvider", func() { list1, err := cloudProvider.GetInstanceTypes(ctx, nodePool) Expect(err).ToNot(HaveOccurred()) - awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "test", "m5.xlarge", "test-zone-1a", karpv1.CapacityTypeSpot) + awsEnv.UnavailableOfferingsCache.MarkUnavailable(ctx, "m5.xlarge", "test-zone-1a", karpv1.CapacityTypeSpot, map[string]string{"reason": "test"}) list2, err := cloudProvider.GetInstanceTypes(ctx, nodePool) Expect(err).ToNot(HaveOccurred())