Skip to content

Commit 07de11e

Browse files
ThaSamineelanjan00uditgauravbjokyispeakc0de
authored
Fix: handle pagination in ssm describeInstanceInformation & API Rate Limit (#738)
* Fix: handle pagination in ssm describe Signed-off-by: Sami Shabaneh <[email protected]> * implement exponential backoff with jitter for API rate limiting Signed-off-by: Sami Shabaneh <[email protected]> * Refactor Signed-off-by: Sami Shabaneh <[email protected]> * Update pkg/cloud/aws/ssm/ssm-operations.go Co-authored-by: Neelanjan Manna <[email protected]> Signed-off-by: Sami Shabaneh <[email protected]> * fixup Signed-off-by: Sami Shabaneh <[email protected]> * Update pkg/cloud/aws/ssm/ssm-operations.go Co-authored-by: Udit Gaurav <[email protected]> Signed-off-by: Sami Shabaneh <[email protected]> * Fix: include error message from stderr if container-kill fails (#740) (#741) Signed-off-by: Björn Kylberg <[email protected]> Signed-off-by: Sami Shabaneh <[email protected]> * fix(logs): Fix the error logs for container-kill fault (#745) Signed-off-by: Shubham Chaudhary <[email protected]> Signed-off-by: Sami Shabaneh <[email protected]> * fix(container-kill): Fixed the container stop command timeout issue (#747) Signed-off-by: Shubham Chaudhary <[email protected]> Signed-off-by: Sami Shabaneh <[email protected]> * feat: Add a rds-instance-stop chaos fault (#710) * feat: Add a rds-instance-stop chaos fault Signed-off-by: Jongwoo Han <[email protected]> --------- Signed-off-by: Jongwoo Han <[email protected]> Signed-off-by: Sami Shabaneh <[email protected]> * Update pkg/cloud/aws/ssm/ssm-operations.go Signed-off-by: Sami Shabaneh <[email protected]> * fix go fmt ./... Signed-off-by: Udit Gaurav <[email protected]> Signed-off-by: Sami Shabaneh <[email protected]> * Filter instances on api call Signed-off-by: Sami Shabaneh <[email protected]> * fixes lint Signed-off-by: Udit Gaurav <[email protected]> --------- Signed-off-by: Sami Shabaneh <[email protected]> Signed-off-by: Björn Kylberg <[email protected]> Signed-off-by: Shubham Chaudhary <[email protected]> Signed-off-by: Jongwoo Han <[email protected]> Signed-off-by: Udit Gaurav <[email protected]> Co-authored-by: Neelanjan Manna <[email protected]> Co-authored-by: Udit Gaurav <[email protected]> Co-authored-by: Björn Kylberg <[email protected]> Co-authored-by: Shubham Chaudhary <[email protected]> Co-authored-by: Jongwoo Han <[email protected]> Co-authored-by: Udit Gaurav <[email protected]>
1 parent 5c22472 commit 07de11e

File tree

1 file changed

+101
-20
lines changed

1 file changed

+101
-20
lines changed

pkg/cloud/aws/ssm/ssm-operations.go

Lines changed: 101 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,17 @@ package ssm
22

33
import (
44
"fmt"
5+
"math"
6+
"math/rand"
57
"strconv"
68
"strings"
79
"time"
810

911
"github.com/aws/aws-sdk-go/aws"
12+
"github.com/aws/aws-sdk-go/aws/awserr"
13+
"github.com/aws/aws-sdk-go/aws/request"
1014
"github.com/aws/aws-sdk-go/service/ssm"
15+
1116
experimentTypes "github.com/litmuschaos/litmus-go/pkg/aws-ssm/aws-ssm-chaos/types"
1217
"github.com/litmuschaos/litmus-go/pkg/cerrors"
1318
"github.com/litmuschaos/litmus-go/pkg/cloud/aws/common"
@@ -23,6 +28,14 @@ const (
2328
DefaultSSMDocsDirectory = "LitmusChaos-AWS-SSM-Docs.yml"
2429
)
2530

31+
// awsErrHasCode checks if an AWS error has a specific error code
32+
func awsErrHasCode(err error, code string) bool {
33+
if aerr, ok := err.(awserr.Error); ok {
34+
return aerr.Code() == code
35+
}
36+
return false
37+
}
38+
2639
// SendSSMCommand will create and add the ssm document in aws service monitoring docs.
2740
func SendSSMCommand(experimentsDetails *experimentTypes.ExperimentDetails, ec2InstanceID []string) (string, error) {
2841

@@ -126,48 +139,116 @@ func getSSMCommandStatus(commandID, ec2InstanceID, region string) (string, error
126139
return *cmdOutput.Status, nil
127140
}
128141

129-
// CheckInstanceInformation will check if the instance has permission to do smm api calls
142+
// CheckInstanceInformation checks if the instance has permission to do SSM API calls,
130143
func CheckInstanceInformation(experimentsDetails *experimentTypes.ExperimentDetails) error {
131-
132144
var instanceIDList []string
145+
var input *ssm.DescribeInstanceInformationInput
146+
133147
switch {
134148
case experimentsDetails.EC2InstanceID != "":
149+
// If specific instance IDs are provided, use instance ID filter
135150
instanceIDList = strings.Split(experimentsDetails.EC2InstanceID, ",")
151+
152+
input = &ssm.DescribeInstanceInformationInput{
153+
Filters: []*ssm.InstanceInformationStringFilter{
154+
{
155+
Key: aws.String("InstanceIds"),
156+
Values: aws.StringSlice(instanceIDList),
157+
},
158+
},
159+
}
136160
default:
161+
// If using tags, first verify we have valid targets
137162
if err := CheckTargetInstanceStatus(experimentsDetails); err != nil {
138163
return stacktrace.Propagate(err, "failed to check target instance(s) status")
139164
}
140165
instanceIDList = experimentsDetails.TargetInstanceIDList
141166

167+
// For filtering by instance IDs that we collected from tags
168+
input = &ssm.DescribeInstanceInformationInput{
169+
Filters: []*ssm.InstanceInformationStringFilter{
170+
{
171+
Key: aws.String("InstanceIds"),
172+
Values: aws.StringSlice(instanceIDList),
173+
},
174+
},
175+
}
142176
}
177+
143178
sesh := common.GetAWSSession(experimentsDetails.Region)
144179
ssmClient := ssm.New(sesh)
145-
for _, ec2ID := range instanceIDList {
146-
res, err := ssmClient.DescribeInstanceInformation(&ssm.DescribeInstanceInformationInput{})
180+
var (
181+
foundInstances = make(map[string]bool)
182+
err error
183+
maxRetries = 5
184+
maxRetryDuration = time.Second * 30
185+
startTime = time.Now()
186+
)
187+
188+
for attempt := 0; attempt < maxRetries; attempt++ {
189+
if time.Since(startTime) > maxRetryDuration {
190+
break
191+
}
192+
193+
err = ssmClient.DescribeInstanceInformationPages(input,
194+
func(page *ssm.DescribeInstanceInformationOutput, lastPage bool) bool {
195+
for _, instanceDetails := range page.InstanceInformationList {
196+
if instanceDetails.InstanceId != nil {
197+
foundInstances[*instanceDetails.InstanceId] = true
198+
}
199+
}
200+
return true // continue to next page
201+
})
202+
147203
if err != nil {
204+
awsErr := common.CheckAWSError(err)
205+
if request.IsErrorThrottle(err) ||
206+
awsErrHasCode(awsErr, "ThrottlingException") ||
207+
awsErrHasCode(awsErr, "RequestThrottledException") ||
208+
awsErrHasCode(awsErr, "Throttling") ||
209+
awsErrHasCode(awsErr, "TooManyRequestsException") ||
210+
awsErrHasCode(awsErr, "RequestLimitExceeded") {
211+
212+
// Calculate exponential backoff with jitter
213+
backoffTime := time.Duration(math.Pow(2, float64(attempt))) * time.Second
214+
rnd := rand.New(rand.NewSource(time.Now().UnixNano()))
215+
jitter := time.Duration(rnd.Intn(1000)) * time.Millisecond
216+
sleepTime := backoffTime + jitter
217+
218+
log.Infof("AWS API rate limit hit, retrying in %v (attempt %d/%d)", sleepTime, attempt+1, maxRetries)
219+
time.Sleep(sleepTime)
220+
continue
221+
}
222+
148223
return cerrors.Error{
149224
ErrorCode: cerrors.ErrorTypeChaosInject,
150-
Reason: fmt.Sprintf("failed to get instance information: %v", common.CheckAWSError(err).Error()),
151-
Target: fmt.Sprintf("{EC2 Instance ID: %v, Region: %v}", ec2ID, experimentsDetails.Region),
225+
Reason: fmt.Sprintf("failed to get instance information: %v", awsErr.Error()),
226+
Target: fmt.Sprintf("{Region: %v}", experimentsDetails.Region),
152227
}
153228
}
154-
isInstanceFound := false
155-
if len(res.InstanceInformationList) != 0 {
156-
for _, instanceDetails := range res.InstanceInformationList {
157-
if *instanceDetails.InstanceId == ec2ID {
158-
isInstanceFound = true
159-
break
160-
}
161-
}
162-
if !isInstanceFound {
163-
return cerrors.Error{
164-
ErrorCode: cerrors.ErrorTypeChaosInject,
165-
Reason: fmt.Sprintf("the instance %v might not have suitable permission or IAM attached to it. Run command `aws ssm describe-instance-information` to check for available instances", ec2ID),
166-
Target: fmt.Sprintf("{EC2 Instance ID: %v, Region: %v}", ec2ID, experimentsDetails.Region),
167-
}
229+
230+
break
231+
}
232+
233+
if err != nil {
234+
return cerrors.Error{
235+
ErrorCode: cerrors.ErrorTypeChaosInject,
236+
Reason: fmt.Sprintf("failed to get instance information after retries: %v", common.CheckAWSError(err).Error()),
237+
Target: fmt.Sprintf("{Region: %v}", experimentsDetails.Region),
238+
}
239+
}
240+
241+
// Validate that each target instance is present.
242+
for _, ec2ID := range instanceIDList {
243+
if _, exists := foundInstances[ec2ID]; !exists {
244+
return cerrors.Error{
245+
ErrorCode: cerrors.ErrorTypeChaosInject,
246+
Reason: fmt.Sprintf("the instance %v might not have suitable permission or IAM attached to it. Run command `aws ssm describe-instance-information` to check for available instances", ec2ID),
247+
Target: fmt.Sprintf("{EC2 Instance ID: %v, Region: %v}", ec2ID, experimentsDetails.Region),
168248
}
169249
}
170250
}
251+
171252
log.Info("[Info]: The target instance have permission to perform SSM API calls")
172253
return nil
173254
}

0 commit comments

Comments
 (0)