-
Notifications
You must be signed in to change notification settings - Fork 283
/
Copy pathsles4sap_publiccloud.pm
1392 lines (1093 loc) · 47.9 KB
/
sles4sap_publiccloud.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# SUSE's openQA tests
#
# Copyright SUSE LLC
# SPDX-License-Identifier: FSFAP
#
# Summary: Library used for SLES4SAP publiccloud deployment and tests
#
# Note: Subroutines executing commands on remote host (using "run_cmd" or "run_ssh_command") require
# to have $self->{my_instance} defined.
# $self->{my_instance} defines what is the target instance to execute code on. It is acquired from
# data located in "@instances" and produced by deployment test modules.
package sles4sap_publiccloud;
use base 'publiccloud::basetest';
use strict;
use JSON;
use warnings FATAL => 'all';
use Exporter 'import';
use Scalar::Util qw(looks_like_number);
use List::MoreUtils qw(uniq);
use Carp qw(croak);
use YAML::PP;
use testapi;
use utils qw(file_content_replace);
use serial_terminal qw(serial_term_prompt);
use version_utils qw(check_version is_sle);
use hacluster;
use qesapdeployment;
use publiccloud::utils;
use publiccloud::provider;
use publiccloud::ssh_interactive qw(select_host_console);
use publiccloud::instance;
use sles4sap;
use saputils;
# Public API of this module: every symbol listed here is exported into any
# test module that does `use sles4sap_publiccloud;`.
our @EXPORT = qw(
  run_cmd
  get_promoted_hostname
  is_hana_resource_running
  stop_hana
  start_hana
  check_takeover
  get_replication_info
  is_hana_online
  get_hana_topology
  enable_replication
  cleanup_resource
  get_promoted_instance
  wait_for_sync
  wait_for_pacemaker
  cloud_file_content_replace
  change_sbd_service_timeout
  setup_sbd_delay_publiccloud
  sbd_delay_formula
  create_instance_data
  deployment_name
  delete_network_peering
  create_playbook_section_list
  create_hana_vars_section
  azure_fencing_agents_playbook_args
  display_full_status
  list_cluster_nodes
  sles4sap_cleanup
  is_hana_database_online
  get_hana_database_status
  is_primary_node_online
  pacemaker_version
  saphanasr_showAttr_version
  get_hana_site_names
  wait_for_cluster
  wait_for_zypper
  wait_for_idle
);
=head1 DESCRIPTION
Package with common methods and default or constant values for sles4sap tests in the cloud
=head2 run_cmd
run_cmd(cmd => 'command', [runas => 'user', timeout => 60]);
Runs a command C<cmd> via ssh in the given VM and log the output.
All commands are executed through C<sudo>.
If 'runas' defined, command will be executed as specified user,
otherwise it will be executed as root.
=over
=item B<cmd> - command string to be executed remotely
=item B<timeout> - command execution timeout
=item B<title> - used in record_info
=item B<runas> - pre-pend the command with su to execute it as specific user
=item B<...> - pass through all other arguments supported by run_ssh_command
=back
=cut
sub run_cmd {
    my ($self, %args) = @_;
    croak("Argument <cmd> missing") unless $args{cmd};
    croak("\$self->{my_instance} is not defined. Check module Description for details")
      unless $self->{my_instance};

    my $timeout = bmwqemu::scale_timeout($args{timeout} // 60);
    # Default the record_info title to the first word of the command.
    my $title = $args{title} // $args{cmd};
    $title =~ s/[[:blank:]].+// unless defined $args{title};
    my $remote_cmd = defined($args{runas}) ? "su - $args{runas} -c '$args{cmd}'" : "$args{cmd}";
    # Remaining %args keys are forwarded to run_ssh_command; the consumed ones
    # must be removed, otherwise SSH commands get executed under the wrong user.
    delete @args{qw(cmd title timeout runas)};

    $self->{my_instance}->wait_for_ssh(timeout => $timeout);
    my $output = $self->{my_instance}->run_ssh_command(cmd => "sudo $remote_cmd", timeout => $timeout, %args);
    record_info("$title output - $self->{my_instance}->{instance_id}", $output)
      unless ($timeout == 0 or $args{quiet} or $args{rc_only});
    return $output;
}
=head2 get_promoted_hostname()
get_promoted_hostname();
Checks and returns hostname of HANA promoted node according to crm shell output.
=cut
sub get_promoted_hostname {
    my ($self) = @_;
    # SAPHanaSR-angi setups use the 'mst' resource prefix, classic SAPHanaSR uses 'msl'.
    my $resource_prefix = get_var('USE_SAP_HANA_SR_ANGI') ? "mst" : "msl";
    my $hana_resource = join("_",
        $resource_prefix,
        "SAPHanaCtl",
        get_required_var("INSTANCE_SID"),
        "HDB" . get_required_var("INSTANCE_ID"));
    my $resource_output = $self->run_cmd(cmd => "crm resource status " . $hana_resource, quiet => 1);
    record_info("crm out", $resource_output);
    # Expect exactly one host reported as Master in the crm output.
    my @master = $resource_output =~ /:\s(\S+)\sMaster/g;
    unless (scalar @master == 1) {
        diag("Master database not found or command returned abnormal output.\n
Check 'crm resource status' command output below:\n");
        diag($resource_output);
        die("Master database was not found, check autoinst.log");
    }
    return join("", @master);
}
=head2 sles4sap_cleanup
Clean up Network peering and qesap deployment
This method does not internally die and try to execute
terraform destroy in any case.
Return 0 if no internal error.
=over
=item B<cleanup_called> - flag to indicate cleanup status
=item B<network_peering_present> - flag to indicate network peering presence
=item B<ansible_present> - flag to indicate ansible has been executed as part of the deployment
=back
=cut
sub sles4sap_cleanup {
    my ($self, %args) = @_;
    # Log the cleanup flags handed over by the caller.
    record_info(
        'Cleanup',
        join(' ',
            'cleanup_called:', $args{cleanup_called} // 'undefined',
            'network_peering_present:', $args{network_peering_present} // 'undefined',
            'ansible_present:', $args{ansible_present} // 'undefined'));
    # Do not run destroy if already executed
    return 0 if ($args{cleanup_called});
    # If there's an open ssh connection to the VMs, return to host console first
    select_host_console(force => 1);
    # ETX is the same as pressing Ctrl-C on a terminal,
    # make sure the serial terminal is NOT blocked
    type_string('', terminate_with => 'ETX');
    # Collect cluster and deployment logs before anything gets destroyed.
    qesap_cluster_logs();
    qesap_upload_logs();
    upload_logs('/var/tmp/ssh_sut.log', failok => 1, log_name => 'ssh_sut_log.txt');
    if ($args{network_peering_present}) {
        delete_network_peering();
    }
    my @cmd_list;
    # Only run the Ansible de-register if Ansible has been executed
    push(@cmd_list, 'ansible') if ($args{ansible_present});
    # Regardless of Ansible result, Terraform destroy
    # must be executed.
    push(@cmd_list, 'terraform');
    my $ret = 0;
    for my $command (@cmd_list) {
        record_info('Cleanup', "Executing $command cleanup");
        # 3 attempts for both terraform and ansible cleanup
        for (1 .. 3) {
            my @cleanup_cmd_rc = qesap_execute(
                verbose => '--verbose',
                logname => join('_', 'qesap', $command, 'destroy', $_, 'log.txt'),
                cmd => $command,
                cmd_options => '-d',
                timeout => 1200
            );
            if ($cleanup_cmd_rc[0] == 0) {
                diag(ucfirst($command) . " cleanup attempt # $_ PASSED.");
                record_info("Clean $command",
                    ucfirst($command) . ' cleanup PASSED.');
                last;
            }
            else {
                diag(ucfirst($command) . " cleanup attempt # $_ FAILED.");
                sleep 10;
            }
            # After the third failed attempt, flag the failure but keep going so
            # the remaining commands in @cmd_list still get their chance to run.
            record_info(
                'Cleanup FAILED',
                "Cleanup $command FAILED",
                result => 'fail'
            ) if $_ == 3 && $cleanup_cmd_rc[0];
            $ret = 1 if $_ == 3 && $cleanup_cmd_rc[0];
        }
    }
    record_info('Cleanup finished', "ret:$ret");
    return $ret;
}
=head2 get_hana_topology
Parses command output, returns hash of hashes containing values for each host.
=cut
sub get_hana_topology {
    my ($self) = @_;
    # Wait for the cluster to settle so SAPHanaSR-showAttr reports stable data.
    $self->wait_for_idle(timeout => 240);
    my $showattr_out = $self->run_cmd(cmd => 'SAPHanaSR-showAttr --format=script', quiet => 1);
    return calculate_hana_topology(input => $showattr_out);
}
=head2 is_hana_online
is_hana_online([timeout => 120, wait_for_start => 'false']);
Check if hana DB is online.
=over
=item B<wait_for_start> - Define 'wait_for_start' to wait for DB to start.
=item B<timeout> - timeout for the wait of the online state
=back
=cut
sub is_hana_online {
    my ($self, %args) = @_;
    $args{wait_for_start} //= 0;
    my $timeout = bmwqemu::scale_timeout($args{timeout} // 120);
    my $started_at = time;
    my $passes_in_a_row = 0;
    my $db_status;
    # Require three consecutive 'online' readings before declaring the DB up.
    until ($passes_in_a_row == 3) {
        $db_status = $self->get_replication_info()->{online} eq "true" ? 1 : 0;
        # Without wait_for_start, return the current state immediately.
        return $db_status unless $args{wait_for_start};
        # A single failed reading resets the streak.
        $passes_in_a_row = $db_status ? $passes_in_a_row + 1 : 0;
        die("DB did not start within defined timeout: $timeout s") if (time - $started_at > $timeout);
        sleep 10;
    }
    return $db_status;
}
=head2 is_hana_resource_running
is_hana_resource_running([quiet => 0]);
Checks if resource msl_SAPHanaCtl_* is running on given node.
=over
=item B<quiet> - if set, returns the value without recording info (default: 0)
=back
=cut
sub is_hana_resource_running {
    my ($self, %args) = @_;
    $args{quiet} //= 0;
    my $hostname = $self->{my_instance}->{instance_id};
    # SAPHanaSR-angi setups use the 'mst' resource prefix, classic SAPHanaSR uses 'msl'.
    my $resource_prefix = get_var('USE_SAP_HANA_SR_ANGI') ? "mst" : "msl";
    my $hana_resource = join("_",
        $resource_prefix,
        "SAPHanaCtl",
        get_required_var("INSTANCE_SID"),
        "HDB" . get_required_var("INSTANCE_ID"));
    my $resource_output = $self->run_cmd(cmd => "crm resource status " . $hana_resource, quiet => 1);
    my $is_running = $resource_output =~ /is running on: \Q$hostname\E/ ? 1 : 0;
    unless ($args{quiet}) {
        record_info("Node status", $is_running
              ? "$hana_resource is running on $hostname"
              : "$hana_resource is NOT running on $hostname");
    }
    return $is_running;
}
=head2 wait_hana_node_up
wait_hana_node_up($my_instance, [timeout => 900]);
Waits until 'is_system_running' returns successfully on the target instance.
=over
=item B<instance> - the instance the test needs to wait for
=item B<timeout> - how much time to wait for before aborting
=back
=cut
sub wait_hana_node_up {
    my ($instance, %args) = @_;
    $args{timeout} //= 900;
    my $deadline = time() + $args{timeout};
    my $out;
    # Poll 'systemctl is-system-running' until it reports 'running' or we time out.
    until (time() >= $deadline) {
        $out = $instance->run_ssh_command(
            cmd => "sudo systemctl is-system-running",
            timeout => 5,
            proceed_on_failure => 1);
        return if ($out =~ m/running/);
        if ($out =~ m/degraded/) {
            # A boot degraded only by guestregister is a known issue (bsc#1238152);
            # restarting the service is the documented workaround.
            my $failed_service = $instance->run_ssh_command(cmd => 'sudo systemctl --failed', timeout => 600, proceed_on_failure => 1);
            if ($out =~ /degraded/ && $failed_service =~ /guestregister/) {
                record_soft_failure('bsc#1238152 - Restart guestregister service');
                $instance->run_ssh_command(cmd => "sudo systemctl restart guestregister.service", timeout => 600, proceed_on_failure => 1);
            }
        }
        record_info("WAIT_FOR_SYSTEM", "System state: $out");
        sleep 10;
    }
    # Timed out: log the failed units before dying, to aid debugging.
    $instance->run_ssh_command(
        cmd => 'sudo systemctl --failed',
        proceed_on_failure => 1);
    die "Timeout reached. is_system_running returns \"$out\"";
}
=head2 stop_hana
stop_hana([timeout => $timeout, method => $method]);
Stops HANA database using default or specified method.
"stop" - stops database using "HDB stop" command.
"kill" - kills database processes using "HDB -kill" command.
"crash" - crashes entire OS using "/proc/sysrq-trigger" method.
=over
=item B<method> - Allow to specify a specific stop method
=item B<timeout> - only used for stop and kill
=back
=cut
sub stop_hana {
    my ($self, %args) = @_;
    $args{method} //= 'stop';
    my $timeout = bmwqemu::scale_timeout($args{timeout} // 300);
    # Map of supported stop methods to the actual command executed on the SUT.
    my %commands = (
        stop => 'HDB stop',
        kill => 'HDB kill -x',
        # echo b > /proc/sysrq-trigger is for crashing the remote node
        # This also work in conjunction with ssh -fn arguments
        crash => 'sudo su -c "echo b > /proc/sysrq-trigger &"'
    );
    croak("HANA stop method '$args{method}' unknown.") unless $commands{$args{method}};
    my $cmd = $commands{$args{method}};
    # Wait for data sync before stopping DB
    $self->wait_for_sync();
    record_info("Stopping HANA", "CMD:$cmd");
    if ($args{method} eq "crash") {
        # Crash needs to be executed as root and wait for host reboot
        # Ensure the remote node is in a normal state before to trigger the crash
        $self->{my_instance}->wait_for_ssh(timeout => $timeout, scan_ssh_host_key => 1);
        # Flush filesystem buffers so the crash loses as little as possible.
        $self->{my_instance}->run_ssh_command(cmd => "sudo su -c sync", timeout => $timeout);
        # Create a local copy of ssh_opts and extend it for the crash command.
        # Extension is on top of values defined in the current instance class $self->{my_instance}->ssh_opts
        # which in HanaSR tests are set with default values in sles4sap_publiccloud_basetest::set_cli_ssh_opts.
        # -f requests ssh to go to background just before command execution
        # -n is about stdin redirection and it is needed by -f to work
        my $crash_ssh_opts = $self->{my_instance}->ssh_opts . ' -fn -o ServerAliveInterval=2';
        $self->{my_instance}->run_ssh_command(
            cmd => $cmd,
            # This timeout is to ensure the run_ssh_command is executed in a reasonable amount of time.
            # It is not about how much time the crash is expected to take in the SUT.
            # Also consider that internally run_ssh_command is using this value for two different guard mechanisms.
            timeout => 10,
            # This test does not care about output,
            # setting this in conjunction with timeout >0 result in the internal implementation of
            # run_ssh_command to use script_run
            rc_only => 1,
            ssh_opts => $crash_ssh_opts);
        # Wait till ssh port 22 disappear
        record_info('Wait ssh disappear', 'START');
        my $start_time = time();
        my $exit_code;
        my $nc_error_occurrence = 0;
        # Probe port 22 until nc fails 5 times (node went down) or 900s elapsed.
        while (((time() - $start_time) < 900) and ($nc_error_occurrence < 5)) {
            $exit_code = script_run('nc -vz -w 1 ' . $self->{my_instance}->{public_ip} . ' 22', quiet => 1);
            $nc_error_occurrence += 1 if (!defined($exit_code) or $exit_code ne 0);
        }
        my $end_msg = join("\n",
            'END',
            "started at $start_time",
            'elapsed:' . (time() - $start_time),
            "nc_error_occurrence:$nc_error_occurrence",
            "last exit_code:$exit_code");
        record_info('Wait ssh disappear', $end_msg);
        # wait for node to be ready
        wait_hana_node_up($self->{my_instance}, timeout => 900);
        record_info("Wait ssh is back again");
    }
    else {
        # 'stop' and 'kill' are executed as the <sid>adm user via run_cmd.
        my $sapadmin = lc(get_required_var('INSTANCE_SID')) . 'adm';
        $self->run_cmd(cmd => $cmd, runas => $sapadmin, timeout => $timeout);
        $self->{my_instance}->wait_for_ssh(username => 'cloudadmin', scan_ssh_host_key => 1);
    }
}
=head2 start_hana
start_hana();
Start HANA DB using "HDB start" command
=cut
sub start_hana {
    my ($self) = @_;
    # Run 'HDB start' as the <sid>adm user on the current instance.
    my $sidadm = get_required_var("SAP_SIDADM");
    $self->run_cmd(cmd => "HDB start", runas => $sidadm);
}
=head2 cleanup_resource
cleanup_resource([timeout => 60]);
Cleanup resource 'msl_SAPHanaCtl_*', wait for DB start automatically.
=over
=item B<timeout> - timeout for waiting resource to start
=back
=cut
sub cleanup_resource {
    my ($self, %args) = @_;
    my $timeout = bmwqemu::scale_timeout($args{timeout} // 300);
    $self->run_cmd(cmd => "crm resource cleanup");
    # Poll until the HANA resource is reported running on this node.
    my $deadline = time + $timeout;
    until ($self->is_hana_resource_running()) {
        if (time > $deadline) {
            record_info("Cluster status", $self->run_cmd(cmd => $crm_mon_cmd));
            die("Resource did not start within defined timeout. ($timeout sec).");
        }
        sleep 30;
    }
}
=head2 check_takeover
check_takeover();
Checks takeover status and waits for finish until successful or reaches timeout.
=cut
# Poll cluster topology until another node reports sync_state 'PRIM',
# i.e. the takeover from the fenced node completed. Dies if the fenced
# node's DB is still online, or after 40 polling rounds (30s apart).
# Returns 1 on success.
sub check_takeover {
    my ($self) = @_;
    my $hostname = $self->{my_instance}->{instance_id};
    # Preconditions: the fenced node must be fully offline.
    die("Database on the fenced node '$hostname' is not offline") if ($self->is_hana_database_online);
    die("System replication '$hostname' is not offline") if ($self->is_primary_node_online);
    my $retry_count = 0;
  TAKEOVER_LOOP: while (1) {
        my $topology = $self->get_hana_topology();
        $retry_count++;
        while (my ($entry, $host_entry) = each %$topology) {
            # Use plain arrow dereference instead of the confusing %$href{key}
            # key/value-slice syntax the original relied on.
            foreach (qw(vhost sync_state)) {
                die("Missing '$_' field in topology output") unless defined($host_entry->{$_});
            }
            my $vhost = $host_entry->{vhost};
            my $sync_state = $host_entry->{sync_state};
            record_info("Cluster Host", join("\n",
                    "vhost: $vhost compared with $hostname",
                    "sync_state: $sync_state compared with PRIM"));
            # Takeover is done once a *different* host became primary.
            if ($vhost ne $hostname && $sync_state eq "PRIM") {
                # Message fix: close the quote around the node name.
                record_info("Takeover status:", "Takeover complete to node '$vhost'");
                last TAKEOVER_LOOP;
            }
        }
        die("Test failed: takeover failed to complete.") if ($retry_count > 40);
        sleep 30;
    }
    return 1;
}
=head2 enable_replication
enable_replication([site_name => 'site_a']);
Enables replication on fenced database. Database needs to be offline.
=over
=item B<site_name> - site name of the site to register
=back
=cut
# Re-register a fenced node as system-replication secondary using
# 'hdbnsutil -sr_register'. The database on the node must be offline.
# Required argument: site_name - site name to register under.
sub enable_replication {
    my ($self, %args) = @_;
    croak("Argument <site_name> missing") unless $args{site_name};
    my $hostname = $self->{my_instance}->{instance_id};
    # Preconditions: the fenced node must be fully offline.
    die("Database on the fenced node '$hostname' is not offline") if ($self->is_hana_database_online);
    die("System replication '$hostname' is not offline") if ($self->is_primary_node_online);
    my $topology = $self->get_hana_topology();
    # Use plain arrow dereference instead of the confusing %$href{key}
    # key/value-slice syntax the original relied on.
    my $host_entry = $topology->{$hostname};
    foreach (qw(vhost remoteHost srmode op_mode)) {
        die "Missing '$_' field in topology output" unless defined($host_entry->{$_});
    }
    my $cmd = join(' ', 'hdbnsutil -sr_register',
        '--name=' . $args{site_name},
        '--remoteHost=' . $host_entry->{remoteHost},
        '--remoteInstance=00',
        '--replicationMode=' . $host_entry->{srmode},
        '--operationMode=' . $host_entry->{op_mode});
    record_info('CMD Run', $cmd);
    $self->run_cmd(cmd => $cmd, runas => get_required_var("SAP_SIDADM"));
}
=head2 get_replication_info
get_replication_info();
Parses "hdbnsutil -sr_state" command output.
Returns hash of found values converted to lowercase and replaces spaces to underscores.
=cut
# Parse 'hdbnsutil -sr_state' output into a hashref: keys are lowercased
# with spaces replaced by underscores, values are the text after the colon.
sub get_replication_info {
    my ($self) = @_;
    my $output_cmd = $self->run_cmd(cmd => "hdbnsutil -sr_state| grep -E :[^\^]", runas => get_required_var("SAP_SIDADM"));
    record_info("replication info", $output_cmd);
    # Create a hash from hdbnsutil output, convert to lowercase with underscore instead of space.
    # Character class fixed from [A-z] to [A-Za-z]: the former also matched
    # the punctuation characters between 'Z' and 'a' ([ \ ] ^ _ `).
    my %out = $output_cmd =~ /^?\s?([\/A-Za-z\s]*\S+):\s(\S+)\n/g;
    %out = map { $_ =~ s/\s/_/g; lc $_ } %out;
    return \%out;
}
=head2 get_promoted_instance
get_promoted_instance();
Retrieves hostname from currently promoted (Master) database and returns instance data from $self->{instances}.
=cut
# Find which instance in $self->{instances} is currently the promoted
# (Master) HANA node and return that instance object. Dies if none found.
sub get_promoted_instance {
    my ($self) = @_;
    my $instances = $self->{instances};
    my $promoted;
    # Identify Site A (Master) and Site B
    foreach my $instance (@$instances) {
        $self->{my_instance} = $instance;
        my $instance_id = $instance->{'instance_id'};
        # Skip instances without HANA db
        next if ($instance_id !~ m/vmhana/);
        my $promoted_id = $self->get_promoted_hostname();
        $promoted = $instance if ($instance_id eq $promoted_id);
    }
    # Only a defined-ness check here: the original additionally compared
    # $promoted eq "undef", which under 'use warnings FATAL => "all"' dies
    # with an unrelated "uninitialized value" error whenever $promoted is
    # undef, masking the intended error message below.
    if (!defined($promoted)) {
        die("Failed to identify Hana 'PROMOTED' node");
    }
    return $promoted;
}
=head2 wait_for_sync
wait_for_sync([timeout => $timeout]);
Wait for replica site to sync data with primary.
Checks "SAPHanaSR-showAttr" output and ensures replica site has "sync_state" "SOK && PRIM" and no SFAIL.
Continue after expected output matched N times continually to make sure cluster is synced.
Expected conditions:
- Both primary and replica must be online.
- primary must have sync_state 'PRIM'
- primary must have clone_state 'PROMOTED'
- replica must have sync_state 'SOK' - this means data is in sync
- replica must have clone_state 'DEMOTED'
- site order does not matter
=over
=item B<timeout> - timeout for waiting sync state
=back
=cut
sub wait_for_sync {
    my ($self, %args) = @_;
    my $timeout = bmwqemu::scale_timeout($args{timeout} // 900);
    # Pacemaker >= 2.1.7 reports node state as a number, older versions as 'online'.
    my $online_str = check_version('>=2.1.7', $self->pacemaker_version()) ? '[1-9]+' : 'online';
    record_info('Sync wait', "Waiting for data sync between nodes. online_str=$online_str timeout=$timeout");
    # The topology check must pass 5 times in a row before we trust the sync state.
    my $consecutive_ok = 0;
    my $deadline = time + $timeout;
    while (time < $deadline) {
        # A failed topology validation resets the streak to zero.
        $consecutive_ok = check_hana_topology(input => $self->get_hana_topology(), node_state_match => $online_str) ? $consecutive_ok + 1 : 0;
        last if $consecutive_ok == 5;
        sleep 30;
    }
    if ($consecutive_ok < 5) {
        record_info("Cluster status", $self->run_cmd(cmd => $crm_mon_cmd));
        record_info("Sync FAIL", "Host replication status: " . $self->run_cmd(cmd => 'SAPHanaSR-showAttr'));
        die("Replication SYNC did not finish within defined timeout. ($timeout sec).");
    }
}
=head2 wait_for_pacemaker
wait_for_pacemaker([timeout => $timeout]);
Checks status of pacemaker via systemd 'is-active' command and waits for startup.
=over
=item B<timeout> - timeout for waiting for pacemaker service
=back
=cut
sub wait_for_pacemaker {
    my ($self, %args) = @_;
    my $timeout = bmwqemu::scale_timeout($args{timeout} // 300);
    my $deadline = time + $timeout;
    my $systemd_cmd = "systemctl --no-pager is-active pacemaker";
    my $pacemaker_state = "";
    # Poll 'is-active' every 15s until pacemaker is active or we run out of time.
    until ($pacemaker_state eq "active") {
        sleep 15;
        $pacemaker_state = $self->run_cmd(cmd => $systemd_cmd, proceed_on_failure => 1);
        if (time > $deadline) {
            record_info("Pacemaker status", $self->run_cmd(cmd => "systemctl --no-pager status pacemaker"));
            die("Pacemaker did not start within defined timeout");
        }
    }
    return 1;
}
=head2 change_sbd_service_timeout
$self->change_sbd_service_timeout(service_timeout => '30');
Overrides timeout for sbd systemd service to a value provided by argument.
This is done by creating or changing file "/etc/systemd/system/sbd.service.d/sbd_delay_start.conf"
=over
=item B<service_timeout> - value for the TimeoutSec setting
=back
=cut
# Override TimeoutSec of the sbd systemd service via a drop-in file
# /etc/systemd/system/sbd.service.d/sbd_delay_start.conf.
# Required argument: service_timeout - value (seconds) for TimeoutSec.
# Note: the empty prototype '()' was removed - prototypes are ignored on
# method calls and would only break direct calls with arguments.
sub change_sbd_service_timeout {
    my ($self, %args) = @_;
    croak("Argument <service_timeout> missing") unless $args{service_timeout};
    my $service_override_dir = "/etc/systemd/system/sbd.service.d/";
    my $service_override_filename = "sbd_delay_start.conf";
    my $service_override_path = $service_override_dir . $service_override_filename;
    # 'test -e; echo $?' prints 0 when the drop-in file already exists.
    my $file_exists = $self->run_cmd(cmd => join(" ", "test", "-e", $service_override_path, ";echo", "\$?"),
        proceed_on_failure => 1,
        quiet => 1);
    # bash return code has inverted value: 0 = file exists
    if (!$file_exists) {
        # File exists: just rewrite the TimeoutSec line in place.
        $self->cloud_file_content_replace(filename => $service_override_path,
            search_pattern => '^TimeoutSec=.*',
            replace_with => "TimeoutSec=$args{service_timeout}");
    }
    else {
        # File missing: create the drop-in directory and write it line by line.
        my @content = ('[Service]', "TimeoutSec=$args{service_timeout}");
        $self->run_cmd(cmd => join(" ", "mkdir", "-p", $service_override_dir), quiet => 1);
        $self->run_cmd(cmd => join(" ", "bash", "-c", "\"echo", "'$_'", ">>", $service_override_path, "\""), quiet => 1) foreach @content;
    }
    record_info("Systemd SBD", "Systemd unit timeout for 'sbd.service' set to '$args{service_timeout}'");
}
=head2 setup_sbd_delay_publiccloud
$self->setup_sbd_delay_publiccloud();
Set (activate or deactivate) SBD_DELAY_START setting in /etc/sysconfig/sbd.
Delay is used in case of cluster VM joining cluster too quickly after fencing operation.
For more information check sbd man page.
Setting is changed via OpenQA parameter: HA_SBD_START_DELAY
Possible values:
"no" - do not set and turn off SBD delay time
"yes" - sets default SBD value which is calculated from a formula
"<number of seconds>" - sets specific delay in seconds
Returns integer representing wait time.
=cut
# Configure SBD_DELAY_START in /etc/sysconfig/sbd from the openQA setting
# HA_SBD_START_DELAY ('yes', 'no' or seconds); also bumps the sbd systemd
# service timeout above the delay. Returns the delay value used.
# Note: the empty prototype '()' was removed - prototypes are ignored on
# method calls and would only break direct calls with arguments.
sub setup_sbd_delay_publiccloud {
    my ($self) = @_;
    my $delay = get_var('HA_SBD_START_DELAY') // '';
    if ($delay eq '') {
        record_info('SBD delay', 'Skipping, parameter without value');
        # Ensure service timeout is higher than sbd delay time
        $delay = $self->sbd_delay_formula();
        $self->change_sbd_service_timeout(service_timeout => $delay + 30);
    }
    else {
        # Strip a trailing 's' unit ("30s" -> "30"); the lookbehind keeps the
        # literal values 'yes' and 'no' intact.
        $delay =~ s/(?<![ye])s//g;
        croak("<\$set_delay> value must be either 'yes', 'no' or an integer. Got value: $delay")
          unless looks_like_number($delay) or grep /^$delay$/, qw(yes no);
        $self->cloud_file_content_replace(filename => '/etc/sysconfig/sbd', search_pattern => '^SBD_DELAY_START=.*', replace_with => "SBD_DELAY_START=$delay");
        # service timeout must be higher that startup delay
        $self->change_sbd_service_timeout(service_timeout => $self->sbd_delay_formula() + 30);
        record_info('SBD delay', "SBD delay set to: $delay");
    }
    return $delay;
}
=head2 sbd_delay_formula
$self->sbd_delay_formula();
return calculated sbd delay
=cut
# Collect cluster timing parameters from the SUT and return the SBD start
# delay computed by hacluster::calculate_sbd_start_delay.
# Note: the empty prototype '()' was removed - prototypes are ignored on
# method calls and served no purpose here.
sub sbd_delay_formula {
    my ($self) = @_;
    # all commands below ($corosync_token, $corosync_consensus...)
    # are defined and imported from lib/hacluster.pm
    my %params = (
        'corosync_token' => $self->run_cmd(cmd => $corosync_token),
        'corosync_consensus' => $self->run_cmd(cmd => $corosync_consensus),
        'sbd_watchdog_timeout' => $self->run_cmd(cmd => $sbd_watchdog_timeout),
        'sbd_delay_start' => $self->run_cmd(cmd => $sbd_delay_start),
        # Default FENCING_MECHANISM to '' before 'eq': get_var may return undef
        # and warnings are FATAL in this module, so a bare comparison could die.
        'pcmk_delay_max' => (get_var('FENCING_MECHANISM') // '') eq 'sbd' ?
          $self->run_cmd(cmd => $pcmk_delay_max) : 30
    );
    my $calculated_delay = calculate_sbd_start_delay(\%params);
    record_info('SBD wait', "Calculated SBD start delay: $calculated_delay");
    return $calculated_delay;
}
=head2 cloud_file_content_replace
cloud_file_content_replace(filename => $filename, search_pattern => $search_pattern, replace_with => $replace_with);
Replaces file content direct on PC SUT. Similar to lib/utils.pm file_content_replace()
=over
=item B<filename> - file location
=item B<search_pattern> - search pattern
=item B<replace_with> - string to replace
=back
=cut
# In-place sed substitution on a file on the public-cloud SUT.
# Required arguments: filename, search_pattern, replace_with.
# Fixes the corrupted final sprintf argument (was '$args(unknown)') to
# $args{filename}; also drops the pointless empty prototype '()'.
# Caveat: pattern and replacement are spliced verbatim into "sed -E 's/.../.../g'",
# so they must not contain unescaped '/' characters.
sub cloud_file_content_replace {
    my ($self, %args) = @_;
    foreach (qw(filename search_pattern replace_with)) {
        croak("Argument < $_ > missing") unless $args{$_};
    }
    $self->run_cmd(cmd => sprintf("sed -E 's/%s/%s/g' -i %s", $args{search_pattern}, $args{replace_with}, $args{filename}), quiet => 1);
}
=head2 create_instance_data
Create and populate a list of publiccloud::instance and publiccloud::provider compatible
class instances.
=over
=item B<provider> - Instance of PC object "provider", the one usually created by provider_factory()
=back
=cut
sub create_instance_data {
    my (%args) = @_;
    croak("Argument <provider> missing") unless $args{provider};
    my $class_type = ref($args{provider});
    croak("Unexpected class type [$class_type]") unless $class_type =~ /^publiccloud::(azure|ec2|gce)/;
    # Read the qe-sap-deployment Ansible inventory and turn each host entry
    # into a publiccloud::instance object.
    my $inventory_file = qesap_get_inventory(provider => get_required_var('PUBLIC_CLOUD_PROVIDER'));
    my $inventory_raw = script_output("cat $inventory_file");
    my $children = YAML::PP->new->load_string($inventory_raw)->{all}{children};
    my @instances;
    for my $type_label (keys %$children) {
        my $hosts = $children->{$type_label}{hosts};
        for my $vm_label (keys %$hosts) {
            push @instances, publiccloud::instance->new(
                public_ip => $hosts->{$vm_label}->{ansible_host},
                instance_id => $vm_label,
                username => get_required_var('PUBLIC_CLOUD_USER'),
                ssh_key => get_ssh_private_key_path(),
                provider => $args{provider},
                region => $args{provider}->provider_client->region,
                type => get_required_var('PUBLIC_CLOUD_INSTANCE_TYPE'),
                image_id => $args{provider}->get_image_id());
        }
    }
    publiccloud::instances::set_instances(@instances);
    return \@instances;
}
=head2 deployment_name
Return a string to be used as value for the deployment_name variable
in the qe-sap-deployment.
=cut
sub deployment_name {
    # Derive the deployment name from the resource group setting (default 'qesaposd').
    my $resource_group = get_var('PUBLIC_CLOUD_RESOURCE_GROUP', 'qesaposd');
    return qesap_calculate_deployment_name($resource_group);
}
=head2 delete_network_peering
Delete network peering between SUT created with qe-sap-deployment
and the IBS Mirror. Function is generic over all the Cloud Providers
=cut
sub delete_network_peering {
    record_info('Peering cleanup', 'Executing peering cleanup (if peering is present)');
    if (is_azure) {
        # Check that required vars are available before deleting the peering
        my $source_group = qesap_az_get_resource_group();
        if (get_var('IBSM_RG')) {
            qesap_az_vnet_peering_delete(source_group => $source_group, target_group => get_var('IBSM_RG'));
        }
        else {
            record_info('No peering', 'No peering exists, peering destruction skipped');
        }
    }
    elsif (is_ec2) {
        # On AWS the peering lives as a transit gateway VPC attachment.
        qesap_aws_delete_transit_gateway_vpc_attachment(name => deployment_name() . '*');
    }
}
=head2 create_playbook_section_list
Detects HANA/HA scenario from function arguments and returns a list of ansible playbooks to include
in the "ansible: create:" section of config.yaml file.
=over
=item B<ha_enabled> - Enable the installation of HANA and the cluster configuration
=item B<registration> - select registration mode, possible values are
* registercloudguest (default)
* suseconnect
* noreg skip scheduling of register.yaml at all
=item B<scc_code> - registration code
=item B<ltss> - name and reg_code for LTSS extension to register.
This argument is a two element comma separated list string.
Like: 'SLES-LTSS-Extended-Security/12.5/x86_64,123456789'
First string before the comma has to be a valid SCC extension name, later used by Ansible
as argument for SUSEConnect or registercloudguest argument.
Second string has to be valid registration code for the particular LTSS extension.
=item B<fencing> - select fencing mechanism
=item B<fence_type> - select Azure native fencing mechanism. Only two accepted values 'spn' or 'msi'. This argument is only applicable to Azure. (optional)
=item B<spn_application_id> - Application ID for the SPN Azure native fencing agent.This argument is only applicable to Azure configured with native fencing of type SPN. (optional)
=item B<spn_application_password> - password for the SPN Azure native fencing agent.This argument is only applicable to Azure configured with native fencing of type SPN. (optional)
=item B<ptf_files> - list of PTF files (optional)
=item B<ptf_token> - SAS token to access the PTF files (optional)
=item B<ptf_account> - name of the account for the ptf container (optional)
=item B<ptf_container> - name of the container for PTF files (optional)
=back
=cut
sub create_playbook_section_list {
    my (%args) = @_;
    # Defaults: HA scenario, registercloudguest registration, SBD fencing.
    $args{ha_enabled} //= 1;
    $args{registration} //= 'registercloudguest';
    $args{fencing} //= 'sbd';
    $args{scc_code} //= '';
    # Azure native fencing needs fence_type; SPN fencing additionally needs credentials.
    if ($args{fencing} eq 'native' and is_azure) {
        croak "Argument <fence_type> missing" unless $args{fence_type};
    }
    if ($args{fencing} eq 'native' and is_azure and $args{fence_type} eq 'spn') {
        croak "Argument <spn_application_id> missing" unless $args{spn_application_id};
        croak "Argument <spn_application_password> missing" unless $args{spn_application_password};
    }
    my @playbook_list;
    unless ($args{registration} eq 'noreg') {
        # Registration playbook with its command-line extra vars.
        my @reg_args = ('registration.yaml');
        push @reg_args, "-e reg_code=$args{scc_code} -e email_address=''";
        push @reg_args, '-e use_suseconnect=true' if ($args{registration} eq 'suseconnect');
        push @reg_args, qesap_ansible_reg_module(reg => $args{ltss}) if ($args{ltss});
        # Add registration module as first element
        push @playbook_list, join(' ', @reg_args);
        # Add "fully patch system" module after registration module and before test start/configuration modules.
        # Temporary moved inside noreg condition to avoid test without Ansible to fails.
        # To be properly addressed in the caller and fully-patch-system can be placed back out of the if.
        push @playbook_list, 'fully-patch-system.yaml';
    }
    # Add playbook to download and install PTFs, if any
    if ($args{ptf_files} && $args{ptf_token} && $args{ptf_container} && $args{ptf_account}) {
        push @playbook_list, join(' ',
            'ptf_installation.yaml',
            "-e ptf_files=$args{ptf_files}",
            "-e sas_token='$args{ptf_token}'",
            "-e container=$args{ptf_container}",
            "-e storage=$args{ptf_account}");
        # NOTE(review): this playbook is only scheduled together with PTF
        # installation - confirm that is intended and not meant for all runs.
        push @playbook_list, "additional_fence_agent_tasks.yaml";
    }
    my $hana_cluster_playbook = 'sap-hana-cluster.yaml';
    if ($args{fencing} eq 'native' and is_azure) {
        # Prepares Azure native fencing related arguments for 'sap-hana-cluster.yaml' playbook
        my $azure_native_fencing_args = azure_fencing_agents_playbook_args(
            fence_type => $args{fence_type},
            spn_application_id => $args{spn_application_id},
            spn_application_password => $args{spn_application_password}
        );
        $hana_cluster_playbook = join(' ', $hana_cluster_playbook, $azure_native_fencing_args);
    }
    # SLES4SAP/HA related playbooks
    if ($args{ha_enabled}) {
        push @playbook_list, 'pre-cluster.yaml', 'sap-hana-preconfigure.yaml -e use_sapconf=' . get_var('USE_SAPCONF', 'false');
        # SBD preparation playbook only makes sense with SBD fencing.
        push @playbook_list, 'cluster_sbd_prep.yaml' if ($args{fencing} eq 'sbd');
        push @playbook_list, qw(
          sap-hana-storage.yaml
          sap-hana-download-media.yaml
          sap-hana-install.yaml
          sap-hana-system-replication.yaml
          sap-hana-system-replication-hooks.yaml
        );
        push @playbook_list, $hana_cluster_playbook;
    }
    return (\@playbook_list);
}
=head2 azure_fencing_agents_playbook_args
azure_fencing_agents_playbook_args(
fence_type => 'spn'
spn_application_id=>$spn_application_id,
spn_application_password=>$spn_application_password