Skip to content

Commit 8aed1d0

Browse files
authored
Ensure we do the SecurityGroup swap for instance cleanup on KMS failures (#292)
1 parent 8a6139a commit 8aed1d0

File tree

2 files changed

+66
-52
lines changed

2 files changed

+66
-52
lines changed

pkg/verifier/aws/aws_verifier.go

Lines changed: 59 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,6 @@ const (
6969
// This corresponds with the quay tag: v0.1.90-f2e86a9
7070
networkValidatorImage = "quay.io/app-sre/osd-network-verifier@sha256:137bf177c2e87732b2692c1af39d3b79b2f84c7f0ee9254df4ea4412dddfab1e"
7171
networkValidatorRepo = "quay.io/app-sre/osd-network-verifier"
72-
userdataEndVerifier = "USERDATA END"
73-
prepulledImageMessage = "Warning: could not pull the specified docker image, will try to use the prepulled one"
7472
invalidKMSCode = "Client.InvalidKMSKey.InvalidState"
7573
)
7674

@@ -79,7 +77,7 @@ type AwsVerifier struct {
7977
AwsClient *aws.Client
8078
Logger ocmlog.Logger
8179
Output output.Output
82-
// This cache is only to be used inside of describeInstanceType() to minimize nil ptr error risk
80+
// This cache is only to be used inside describeInstanceType() to minimize nil ptr error risk
8381
cachedInstanceTypeInfo *ec2Types.InstanceTypeInfo
8482
}
8583

@@ -158,7 +156,7 @@ func (a *AwsVerifier) describeInstanceType(ctx context.Context, instanceType str
158156
return a.cachedInstanceTypeInfo, nil
159157
}
160158

161-
// instanceTypeUsesNitro asks the AWS API whether or not the provided instanceType uses the "Nitro"
159+
// instanceTypeUsesNitro asks the AWS API whether the provided instanceType uses the "Nitro"
162160
// hypervisor. Nitro is the only hypervisor supporting serial console output, which we need to
163161
// collect in order to gather probe results
164162
func (a *AwsVerifier) instanceTypeUsesNitro(ctx context.Context, instanceType string) (bool, error) {
@@ -169,7 +167,7 @@ func (a *AwsVerifier) instanceTypeUsesNitro(ctx context.Context, instanceType st
169167
}
170168

171169
// Return true if instance type uses nitro
172-
return (instanceTypeInfo.Hypervisor == ec2Types.InstanceTypeHypervisorNitro), nil
170+
return instanceTypeInfo.Hypervisor == ec2Types.InstanceTypeHypervisorNitro, nil
173171
}
174172

175173
// instanceTypeArchitecture asks the AWS API about the CPU architecture(s) supported by the provided
@@ -256,6 +254,7 @@ type createEC2InstanceInput struct {
256254
tags map[string]string
257255
ctx context.Context
258256
keyPair string
257+
vpcID string
259258
}
260259

261260
func (a *AwsVerifier) createEC2Instance(input createEC2InstanceInput) (string, error) {
@@ -339,16 +338,16 @@ func (a *AwsVerifier) createEC2Instance(input createEC2InstanceInput) (string, e
339338

340339
// Wait up to 5 minutes for the instance to be running
341340
waiter := ec2.NewInstanceRunningWaiter(a.AwsClient)
342-
if err := waiter.Wait(input.ctx, &ec2.DescribeInstancesInput{InstanceIds: []string{instanceID}}, 2*time.Minute); err != nil {
343-
resp, err := a.AwsClient.DescribeInstances(context.TODO(), &ec2.DescribeInstancesInput{
341+
if err := waiter.Wait(input.ctx, &ec2.DescribeInstancesInput{InstanceIds: []string{instanceID}}, 5*time.Minute); err != nil {
342+
resp, err := a.AwsClient.DescribeInstances(input.ctx, &ec2.DescribeInstancesInput{
344343
InstanceIds: []string{instanceID},
345344
})
346345
if err != nil {
347346
fmt.Println("Warning: Waiter Describe instances failure.")
348347
}
349348

350349
var stateCode string
351-
if resp.Reservations[0].Instances[0].StateReason.Code != nil {
350+
if resp != nil && resp.Reservations[0].Instances[0].StateReason.Code != nil {
352351
stateCode = *resp.Reservations[0].Instances[0].StateReason.Code
353352
}
354353

@@ -357,6 +356,17 @@ func (a *AwsVerifier) createEC2Instance(input createEC2InstanceInput) (string, e
357356
waiterErr = handledErrors.NewKmsError("encountered issue accessing KMS key when launching instance.")
358357
}
359358

359+
// Switch the instance SecurityGroup to the default before terminating to avoid a cleanup race condition. This is
360+
// handled by the normal cleanup process, except in this specific case where we fail early because of KMS issues.
361+
defaultSecurityGroupID := a.fetchVpcDefaultSecurityGroup(input.ctx, input.vpcID)
362+
if defaultSecurityGroupID != "" {
363+
// Replace the SecurityGroup attached to the instance with the default one for the VPC to allow for graceful
364+
// termination of the network-verifier created temporary SecurityGroup. If we hit an error, we ignore it
365+
// and continue with normal termination of the instance.
366+
_ = a.modifyInstanceSecurityGroup(input.ctx, instanceID, defaultSecurityGroupID)
367+
a.Logger.Info(input.ctx, "Modified the instance to use the default security group")
368+
}
369+
360370
if err := a.AwsClient.TerminateEC2Instance(input.ctx, instanceID); err != nil {
361371
return instanceID, handledErrors.NewGenericError(err)
362372
}
@@ -491,16 +501,16 @@ func (a *AwsVerifier) CreateSecurityGroup(ctx context.Context, tags map[string]s
491501

492502
a.Logger.Info(ctx, "Created security group with ID: %s", *output.GroupId)
493503

494-
input_rules := &ec2.AuthorizeSecurityGroupEgressInput{
504+
inputRules := &ec2.AuthorizeSecurityGroupEgressInput{
495505
GroupId: output.GroupId,
496506
IpPermissions: defaultIpPermissions,
497507
}
498508

499-
if _, err := a.AwsClient.AuthorizeSecurityGroupEgress(ctx, input_rules); err != nil {
509+
if _, err := a.AwsClient.AuthorizeSecurityGroupEgress(ctx, inputRules); err != nil {
500510
return &ec2.CreateSecurityGroupOutput{}, err
501511
}
502512

503-
revoke_default_egress := &ec2.RevokeSecurityGroupEgressInput{
513+
revokeDefaultEgress := &ec2.RevokeSecurityGroupEgressInput{
504514
GroupId: output.GroupId,
505515
IpPermissions: []ec2Types.IpPermission{
506516
{
@@ -516,7 +526,7 @@ func (a *AwsVerifier) CreateSecurityGroup(ctx context.Context, tags map[string]s
516526
},
517527
}
518528

519-
if _, err := a.AwsClient.RevokeSecurityGroupEgress(ctx, revoke_default_egress); err != nil {
529+
if _, err := a.AwsClient.RevokeSecurityGroupEgress(ctx, revokeDefaultEgress); err != nil {
520530
return &ec2.CreateSecurityGroupOutput{}, err
521531
}
522532

@@ -694,3 +704,40 @@ func (a *AwsVerifier) GetVpcIdFromSubnetId(ctx context.Context, vpcSubnetID stri
694704
}
695705
return vpcId, nil
696706
}
707+
708+
// fetchVpcDefaultSecurityGroup will return either the 'default' SG ID, or an empty string if not found/an error is hit
709+
func (a *AwsVerifier) fetchVpcDefaultSecurityGroup(ctx context.Context, vpcId string) string {
710+
describeSGOutput, err := a.AwsClient.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{
711+
Filters: []ec2Types.Filter{
712+
{
713+
Name: awsTools.String("vpc-id"),
714+
Values: []string{vpcId},
715+
},
716+
{
717+
Name: awsTools.String("group-name"),
718+
Values: []string{"default"},
719+
},
720+
},
721+
})
722+
723+
if err != nil {
724+
return ""
725+
}
726+
727+
for _, SG := range describeSGOutput.SecurityGroups {
728+
if *SG.GroupName == "default" {
729+
return *SG.GroupId
730+
}
731+
}
732+
733+
return ""
734+
}
735+
736+
func (a *AwsVerifier) modifyInstanceSecurityGroup(ctx context.Context, instanceID string, securityGroupID string) error {
737+
_, err := a.AwsClient.ModifyInstanceAttribute(ctx, &ec2.ModifyInstanceAttributeInput{
738+
InstanceId: &instanceID,
739+
Groups: []string{securityGroupID},
740+
})
741+
742+
return err
743+
}

pkg/verifier/aws/entry_point.go

Lines changed: 7 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,6 @@ func (a *AwsVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.O
180180

181181
// If security group not given, create a temporary one
182182
if len(vei.AWS.SecurityGroupIDs) == 0 || vei.ForceTempSecurityGroup {
183-
184183
createSecurityGroupOutput, err := a.CreateSecurityGroup(vei.Ctx, vei.Tags, "osd-network-verifier", vpcId)
185184
if err != nil {
186185
return a.Output.AddError(err)
@@ -208,7 +207,6 @@ func (a *AwsVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.O
208207
return a.Output.AddError(err)
209208
}
210209
}
211-
212210
}
213211

214212
// Create EC2 instance
@@ -224,6 +222,7 @@ func (a *AwsVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.O
224222
securityGroupIDs: vei.AWS.SecurityGroupIDs,
225223
tempSecurityGroupID: vei.AWS.TempSecurityGroup,
226224
keyPair: vei.ImportKeyPair,
225+
vpcID: vpcId,
227226
})
228227
if err != nil {
229228
return a.Output.AddError(err)
@@ -246,48 +245,16 @@ func (a *AwsVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.O
246245

247246
// Terminate the EC2 instance (unless user requests otherwise)
248247
if !vei.SkipInstanceTermination {
249-
// Replaced the SGs attached to the network-verifier-instance by the default SG in order to allow
250-
// deletion of temporary SGs created
251-
252-
// Getting a list of the SGs for the current VPC of our instance
253-
var defaultSecurityGroupID = ""
254-
describeSGOutput, err := a.AwsClient.DescribeSecurityGroups(vei.Ctx, &ec2.DescribeSecurityGroupsInput{
255-
Filters: []ec2Types.Filter{
256-
{
257-
Name: awsTools.String("vpc-id"),
258-
Values: []string{vpcId},
259-
},
260-
{
261-
Name: awsTools.String("group-name"),
262-
Values: []string{"default"},
263-
},
264-
},
265-
})
266-
if err != nil {
267-
a.Output.AddError(err)
268-
a.Logger.Info(vei.Ctx, "Unable to describe security groups. Falling back to slower cloud resource cleanup method.")
269-
270-
}
271-
272-
if describeSGOutput != nil {
273-
274-
//Fetch default Security Group ID.
275-
for _, SG := range describeSGOutput.SecurityGroups {
276-
if *SG.GroupName == "default" {
277-
defaultSecurityGroupID = *SG.GroupId
278-
}
279-
}
280-
281-
//Replacing the SGs attach to instance by the default one. This is to clean the SGs created in case the instance
282-
//termination times out
283-
_, err = a.AwsClient.ModifyInstanceAttribute(vei.Ctx, &ec2.ModifyInstanceAttributeInput{
284-
InstanceId: &instanceID,
285-
Groups: []string{defaultSecurityGroupID},
286-
})
248+
// Replace the SecurityGroup attached to the instance with the default one for the VPC to allow for graceful
249+
// termination of the network-verifier created temporary SecurityGroup
250+
defaultSecurityGroupID := a.fetchVpcDefaultSecurityGroup(vei.Ctx, vpcId)
251+
if defaultSecurityGroupID != "" {
252+
err = a.modifyInstanceSecurityGroup(vei.Ctx, instanceID, defaultSecurityGroupID)
287253
if err != nil {
288254
a.Logger.Info(vei.Ctx, "Unable to detach instance from security group. Falling back to slower cloud resource cleanup method.")
289255
a.writeDebugLogs(vei.Ctx, fmt.Sprintf("Fell back to slower cloud resource cleanup because faster method (network interface detatchment) blocked by AWS: %s.", err))
290256
}
257+
a.Logger.Info(vei.Ctx, "Modified the instance to use the default security group")
291258
}
292259

293260
a.Logger.Info(vei.Ctx, "Deleting instance with ID: %s", instanceID)

0 commit comments

Comments
 (0)