@@ -2,12 +2,17 @@ package ssm
22
33import (
44 "fmt"
5+ "math"
6+ "math/rand"
57 "strconv"
68 "strings"
79 "time"
810
911 "github.com/aws/aws-sdk-go/aws"
12+ "github.com/aws/aws-sdk-go/aws/awserr"
13+ "github.com/aws/aws-sdk-go/aws/request"
1014 "github.com/aws/aws-sdk-go/service/ssm"
15+
1116 experimentTypes "github.com/litmuschaos/litmus-go/pkg/aws-ssm/aws-ssm-chaos/types"
1217 "github.com/litmuschaos/litmus-go/pkg/cerrors"
1318 "github.com/litmuschaos/litmus-go/pkg/cloud/aws/common"
@@ -23,6 +28,14 @@ const (
2328 DefaultSSMDocsDirectory = "LitmusChaos-AWS-SSM-Docs.yml"
2429)
2530
31+ // awsErrHasCode checks if an AWS error has a specific error code
32+ func awsErrHasCode (err error , code string ) bool {
33+ if aerr , ok := err .(awserr.Error ); ok {
34+ return aerr .Code () == code
35+ }
36+ return false
37+ }
38+
2639// SendSSMCommand will create and add the ssm document in aws service monitoring docs.
2740func SendSSMCommand (experimentsDetails * experimentTypes.ExperimentDetails , ec2InstanceID []string ) (string , error ) {
2841
@@ -126,48 +139,116 @@ func getSSMCommandStatus(commandID, ec2InstanceID, region string) (string, error
126139 return * cmdOutput .Status , nil
127140}
128141
129- // CheckInstanceInformation will check if the instance has permission to do smm api calls
142+ // CheckInstanceInformation checks if the instance has permission to do SSM API calls,
130143func CheckInstanceInformation (experimentsDetails * experimentTypes.ExperimentDetails ) error {
131-
132144 var instanceIDList []string
145+ var input * ssm.DescribeInstanceInformationInput
146+
133147 switch {
134148 case experimentsDetails .EC2InstanceID != "" :
149+ // If specific instance IDs are provided, use instance ID filter
135150 instanceIDList = strings .Split (experimentsDetails .EC2InstanceID , "," )
151+
152+ input = & ssm.DescribeInstanceInformationInput {
153+ Filters : []* ssm.InstanceInformationStringFilter {
154+ {
155+ Key : aws .String ("InstanceIds" ),
156+ Values : aws .StringSlice (instanceIDList ),
157+ },
158+ },
159+ }
136160 default :
161+ // If using tags, first verify we have valid targets
137162 if err := CheckTargetInstanceStatus (experimentsDetails ); err != nil {
138163 return stacktrace .Propagate (err , "failed to check target instance(s) status" )
139164 }
140165 instanceIDList = experimentsDetails .TargetInstanceIDList
141166
167+ // For filtering by instance IDs that we collected from tags
168+ input = & ssm.DescribeInstanceInformationInput {
169+ Filters : []* ssm.InstanceInformationStringFilter {
170+ {
171+ Key : aws .String ("InstanceIds" ),
172+ Values : aws .StringSlice (instanceIDList ),
173+ },
174+ },
175+ }
142176 }
177+
143178 sesh := common .GetAWSSession (experimentsDetails .Region )
144179 ssmClient := ssm .New (sesh )
145- for _ , ec2ID := range instanceIDList {
146- res , err := ssmClient .DescribeInstanceInformation (& ssm.DescribeInstanceInformationInput {})
180+ var (
181+ foundInstances = make (map [string ]bool )
182+ err error
183+ maxRetries = 5
184+ maxRetryDuration = time .Second * 30
185+ startTime = time .Now ()
186+ )
187+
188+ for attempt := 0 ; attempt < maxRetries ; attempt ++ {
189+ if time .Since (startTime ) > maxRetryDuration {
190+ break
191+ }
192+
193+ err = ssmClient .DescribeInstanceInformationPages (input ,
194+ func (page * ssm.DescribeInstanceInformationOutput , lastPage bool ) bool {
195+ for _ , instanceDetails := range page .InstanceInformationList {
196+ if instanceDetails .InstanceId != nil {
197+ foundInstances [* instanceDetails .InstanceId ] = true
198+ }
199+ }
200+ return true // continue to next page
201+ })
202+
147203 if err != nil {
204+ awsErr := common .CheckAWSError (err )
205+ if request .IsErrorThrottle (err ) ||
206+ awsErrHasCode (awsErr , "ThrottlingException" ) ||
207+ awsErrHasCode (awsErr , "RequestThrottledException" ) ||
208+ awsErrHasCode (awsErr , "Throttling" ) ||
209+ awsErrHasCode (awsErr , "TooManyRequestsException" ) ||
210+ awsErrHasCode (awsErr , "RequestLimitExceeded" ) {
211+
212+ // Calculate exponential backoff with jitter
213+ backoffTime := time .Duration (math .Pow (2 , float64 (attempt ))) * time .Second
214+ rnd := rand .New (rand .NewSource (time .Now ().UnixNano ()))
215+ jitter := time .Duration (rnd .Intn (1000 )) * time .Millisecond
216+ sleepTime := backoffTime + jitter
217+
218+ log .Infof ("AWS API rate limit hit, retrying in %v (attempt %d/%d)" , sleepTime , attempt + 1 , maxRetries )
219+ time .Sleep (sleepTime )
220+ continue
221+ }
222+
148223 return cerrors.Error {
149224 ErrorCode : cerrors .ErrorTypeChaosInject ,
150- Reason : fmt .Sprintf ("failed to get instance information: %v" , common . CheckAWSError ( err ) .Error ()),
151- Target : fmt .Sprintf ("{EC2 Instance ID: %v, Region: %v}" , ec2ID , experimentsDetails .Region ),
225+ Reason : fmt .Sprintf ("failed to get instance information: %v" , awsErr .Error ()),
226+ Target : fmt .Sprintf ("{Region: %v}" , experimentsDetails .Region ),
152227 }
153228 }
154- isInstanceFound := false
155- if len (res .InstanceInformationList ) != 0 {
156- for _ , instanceDetails := range res .InstanceInformationList {
157- if * instanceDetails .InstanceId == ec2ID {
158- isInstanceFound = true
159- break
160- }
161- }
162- if ! isInstanceFound {
163- return cerrors.Error {
164- ErrorCode : cerrors .ErrorTypeChaosInject ,
165- Reason : fmt .Sprintf ("the instance %v might not have suitable permission or IAM attached to it. Run command `aws ssm describe-instance-information` to check for available instances" , ec2ID ),
166- Target : fmt .Sprintf ("{EC2 Instance ID: %v, Region: %v}" , ec2ID , experimentsDetails .Region ),
167- }
229+
230+ break
231+ }
232+
233+ if err != nil {
234+ return cerrors.Error {
235+ ErrorCode : cerrors .ErrorTypeChaosInject ,
236+ Reason : fmt .Sprintf ("failed to get instance information after retries: %v" , common .CheckAWSError (err ).Error ()),
237+ Target : fmt .Sprintf ("{Region: %v}" , experimentsDetails .Region ),
238+ }
239+ }
240+
241+ // Validate that each target instance is present.
242+ for _ , ec2ID := range instanceIDList {
243+ if _ , exists := foundInstances [ec2ID ]; ! exists {
244+ return cerrors.Error {
245+ ErrorCode : cerrors .ErrorTypeChaosInject ,
246+ Reason : fmt .Sprintf ("the instance %v might not have suitable permission or IAM attached to it. Run command `aws ssm describe-instance-information` to check for available instances" , ec2ID ),
247+ Target : fmt .Sprintf ("{EC2 Instance ID: %v, Region: %v}" , ec2ID , experimentsDetails .Region ),
168248 }
169249 }
170250 }
251+
171252 log .Info ("[Info]: The target instance have permission to perform SSM API calls" )
172253 return nil
173254}
0 commit comments