-
Notifications
You must be signed in to change notification settings - Fork 283
/
Copy pathhacluster.pm
1519 lines (1117 loc) · 45.9 KB
/
hacluster.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# SUSE's openQA tests
#
# Copyright 2016-2020 SUSE LLC
# SPDX-License-Identifier: FSFAP
#
# Summary: Functions for HA Cluster tests
package hacluster;
use base Exporter;
use Exporter;
use strict;
use warnings;
use version_utils qw(is_sle);
use Scalar::Util qw(looks_like_number);
use utils;
use testapi;
use lockapi;
use isotovideo;
use maintenance_smelt qw(get_incident_packages);
use x11utils qw(ensure_unlocked_desktop);
use Utils::Logging qw(export_logs);
use Carp qw(croak);
use Data::Dumper;
our @EXPORT = qw(
$crm_mon_cmd
$softdog_timeout
$join_timeout
$default_timeout
$corosync_token
$corosync_consensus
$sbd_watchdog_timeout
$sbd_delay_start
$pcmk_delay_max
exec_csync
add_file_in_csync
get_cluster_info
get_cluster_name
get_hostname
get_ip
get_my_ip
get_node_to_join
get_node_number
get_node_index
is_node
add_to_known_hosts
choose_node
save_state
is_package_installed
check_rsc
ensure_process_running
ensure_resource_running
ensure_dlm_running
write_tag
read_tag
block_device_real_path
lvm_add_filter
lvm_remove_filter
rsc_cleanup
ha_export_logs
check_cluster_state
wait_until_resources_stopped
wait_until_resources_started
wait_for_idle_cluster
get_lun
check_device_available
set_lvm_config
add_lock_mgr
pre_run_hook
post_run_hook
post_fail_hook
test_flags
is_not_maintenance_update
activate_ntp
script_output_retry_check
calculate_sbd_start_delay
setup_sbd_delay
set_sbd_service_timeout
collect_sbd_delay_parameters
check_iscsi_failure
cluster_status_matches_regex
crm_wait_for_maintenance
crm_check_resource_location
generate_lun_list
show_cluster_parameter
set_cluster_parameter
);
=head1 SYNOPSIS
Library with common methods and default values for High Availability
Extension (HA or HAE) tests.
=cut
=head2 Global Variables
=over
=item * B<$default_timeout>: default scaled timeout for most operations with SUT
=item * B<$join_timeout>: default scaled timeout for C<ha-cluster-join> calls
=item * B<$softdog_timeout>: default scaled timeout for the B<softdog> watchdog
=item * B<$crm_mon_cmd>: crm_mon (crm monitoring) command
=item * B<$corosync_token>: command to filter the value of C<runtime.config.totem.token> from the output of C<corosync-cmapctl>
=item * B<$corosync_consensus>: command to filter the value of C<runtime.config.totem.consensus> from the output of C<corosync-cmapctl>
=item * B<$sbd_watchdog_timeout>: command to extract the value of C<SBD_WATCHDOG_TIMEOUT> from C</etc/sysconfig/sbd>
=item * B<$sbd_delay_start>: command to extract the value of C<SBD_DELAY_START> from C</etc/sysconfig/sbd>
=item * B<$pcmk_delay_max>: command to get the value of the C<pcmk_delay_max> parameter from the STONITH resource in the cluster configuration.
=back
=cut
# Default commands and scaled timeouts shared by all HA tests.
# See the 'Global Variables' POD section above for per-variable details.
our $crm_mon_cmd = 'crm_mon -R -r -n -N -1';
our $softdog_timeout = bmwqemu::scale_timeout(60);
# Console selected before the test module ran; restored by post_run_hook
our $prev_console;
our $join_timeout = bmwqemu::scale_timeout(60);
our $default_timeout = bmwqemu::scale_timeout(30);
# Shell one-liners extracting runtime values from corosync/sbd/crm config
our $corosync_token = q@corosync-cmapctl | awk -F " = " '/runtime.config.totem.token\s/ {print int($2/1000)}'@;
our $corosync_consensus = q@corosync-cmapctl | awk -F " = " '/runtime.config.totem.consensus\s/ {print int($2/1000)}'@;
our $sbd_watchdog_timeout = q@grep -oP '(?<=^SBD_WATCHDOG_TIMEOUT=)[[:digit:]]+' /etc/sysconfig/sbd@;
our $sbd_delay_start = q@grep -oP '(?<=^SBD_DELAY_START=)([[:digit:]]+|yes|no)+' /etc/sysconfig/sbd@;
our $pcmk_delay_max = q@crm resource param stonith-sbd show pcmk_delay_max| sed 's/[^0-9]*//g'@;
# Private functions
# Extract the first IPv4 dotted-quad found in a string (e.g. the output of
# 'host' or 'ip addr'). Returns the address on success, 0 when none is found.
sub _just_the_ip {
    my $text = shift;
    return $1 if ($text =~ /(\d+\.\d+\.\d+\.\d+)/);
    return 0;
}
# Die (reporting the calling sub's name) when the given value is undef.
# Used to verify that script_run/script_output calls returned something.
sub _test_var_defined {
    my $value = shift;
    return if defined $value;
    die 'A command in ' . (caller(1))[3] . ' did not return a defined value!';
}
# Public functions
=head2 exec_csync
exec_csync();
Runs C<csync2 -vxF> in the SUT, to sync files from SUT to other nodes in the
cluster. Sometimes it is expected that the first call to C<csync2 -vxF> fails,
so this method will run the command twice.
=cut
sub exec_csync {
    # Sometimes we need to run csync2 twice to have all the files updated!
    # The first invocation may fail while initial sync conflicts are being
    # resolved, hence the second run after a short pause.
    assert_script_run 'csync2 -vxF ; sleep 2 ; csync2 -vxF';
}
=head2 add_file_in_csync
add_file_in_csync( value => '/path/to/file', [ conf_file => '/path/to/csync2.cfg' ] );
Adds F</path/to/file> to a csync2 configuration file in SUT. Path to add must be passed
with the named argument B<value>, while csync2 configuration file can be passed on the
named argument B<conf_file> (defaults to F</etc/csync2/csync2.cfg>). Returns true on
success or croaks if command execution fails in SUT.
=cut
sub add_file_in_csync {
    my %args = @_;
    # Default csync2 configuration file unless the caller supplies one
    my $conf_file = $args{conf_file} // '/etc/csync2/csync2.cfg';
    if (defined($conf_file) && defined($args{value})) {
        # Check if conf_file is a valid value (writable file)
        assert_script_run "[[ -w $conf_file ]]";
        # Add the value in conf_file and sync on all nodes.
        # The sed expression inserts an "include <path>;" line just before
        # the closing brace, unless the path is already present in the file.
        assert_script_run "grep -Fq $args{value} $conf_file || sed -i 's|^}\$|include $args{value};\\n}|' $conf_file";
        exec_csync;
    }
    return 1;
}
=head2 get_cluster_info
get_cluster_info();
Returns a hashref containing the info parsed from the CLUSTER_INFOS variable.
This does not reflect the current state of the cluster but the intended steady
state once the LUNs are configured and the nodes have joined.
=cut
# Parse the colon-separated CLUSTER_INFOS setting ("name:nodes:luns") into a
# hashref with keys cluster_name, num_nodes and num_luns. Croaks (via
# get_required_var) when the setting is missing.
sub get_cluster_info {
    my @fields = split(/:/, get_required_var('CLUSTER_INFOS'));
    my %info;
    @info{qw(cluster_name num_nodes num_luns)} = @fields;
    return \%info;
}
=head2 get_cluster_name
get_cluster_name();
Returns the cluster name, as defined in the B<CLUSTER_NAME> setting. Croaks if the
setting is not defined, as it is a mandatory setting for HA tests.
=cut
sub get_cluster_name {
    # Mandatory setting: get_required_var croaks when CLUSTER_NAME is unset
    return get_required_var('CLUSTER_NAME');
}
=head2 get_hostname
get_hostname();
Returns the hostname, as defined in the B<HOSTNAME> setting. Croaks if the setting
is not defined, as it is a mandatory setting for HA tests.
=cut
sub get_hostname {
    # Mandatory setting: get_required_var croaks when HOSTNAME is unset
    return get_required_var('HOSTNAME');
}
=head2 get_node_to_join
get_node_to_join();
Returns the hostname of the node to join, as defined in the B<HA_CLUSTER_JOIN>
setting. Croaks if the setting is not defined, as this setting is mandatory for
all nodes that run C<ha-cluster-join>. As such, avoid scheduling tests that
call this method on nodes that would run C<ha-cluster-init> instead.
=cut
sub get_node_to_join {
    # Mandatory on joining nodes: croaks when HA_CLUSTER_JOIN is unset
    return get_required_var('HA_CLUSTER_JOIN');
}
=head2 get_ip
get_ip( $node_hostname );
Returns the IP address of a node given its hostname, either by calling the
C<host> command in SUT (which in turn would do a DNS query on tests using
support server), or by searching for the host entry in SUT's F</etc/hosts>.
Returns 0 on failure.
=cut
sub get_ip {
    my $node_hostname = shift;
    my $node_ip;
    if (get_var('USE_SUPPORT_SERVER')) {
        # With a support server, resolve the hostname via DNS; retry since
        # the support server's DNS may not be ready yet
        $node_ip = script_output_retry("host -t A $node_hostname", retry => 3, delay => 5);
    }
    else {
        # Without a support server, look the hostname up in /etc/hosts;
        # awk prints the first matching address and exits non-zero otherwise
        $node_ip = script_output("awk 'BEGIN {RET=1} /$node_hostname/ {print \$1; RET=0; exit} END {exit RET}' /etc/hosts");
    }
    # Strip everything but the dotted-quad address (0 when none was found)
    return _just_the_ip($node_ip);
}
=head2 get_my_ip
get_my_ip();
Returns the IP address of SUT or 0 if the address cannot be determined. Special case of C<get_ip()>.
=cut
sub get_my_ip {
    # Network device to query; defaults to eth0
    my $netdevice = get_var('SUT_NETDEVICE', 'eth0');
    # Extract the IPv4 address assigned to the device from 'ip -4 addr' output
    my $node_ip = script_output "ip -4 addr show dev $netdevice | sed -rne '/inet/s/[[:blank:]]*inet ([0-9\\.]*).*/\\1/p'";
    return _just_the_ip($node_ip);
}
=head2 get_node_number
get_node_number();
Returns the number of nodes configured in the cluster.
=cut
sub get_node_number {
    # The crm_mon summary line format changed in 15-SP2, moving the node
    # count to the second field
    my $index = is_sle('15-sp2+') ? 2 : 1;
    return script_output "crm_mon -1 | awk '/ nodes configured/ { print \$$index }'";
}
=head2 get_node_index
get_node_index();
Returns the index number of the SUT. This information is taken from the
node hostnames, so be sure to define proper hostnames in the tests settings,
for example B<alpha-node01>, B<alpha-node02>, etc.
=cut
# Return the SUT's index number, taken from the two trailing digits of its
# hostname (e.g. alpha-node02 -> 2).
# NOTE(review): assumes hostnames end with two digits; otherwise int() of
# the whole hostname is returned (usually 0) — confirm with test settings.
sub get_node_index {
    my $hostname = get_hostname;
    my ($digits) = ($hostname =~ /([0-9][0-9])$/);
    return int($digits // $hostname);
}
=head2 is_node
is_node( $node_number );
Checks whether SUT is the node identified by B<$node_number>. Returns true or false.
This information is matched against the node hostname, so be sure to define proper
hostnames in the tests settings, for example B<alpha-node01>, B<alpha-node02>, etc.
=cut
# True when the SUT's hostname ends with the given node number, which is
# always encoded with two digits in the hostnames (alpha-node01, ...).
sub is_node {
    my $node_number = shift;
    # Node number must be coded with 2 digits
    my $suffix = sprintf("%02d", $node_number);
    # Return true if the hostname carries $suffix at its end
    my $hostname = get_hostname;
    return ($hostname =~ /$suffix$/);
}
=head2 add_to_known_hosts
add_to_known_hosts( $host );
Adds B<$host> to the F<.ssh/known_hosts> file of the current user in SUT.
Croaks if any of the commands to do so fail.
=cut
sub add_to_known_hosts {
    my $host_to_add = shift;
    # Ensure ~/.ssh exists with the permissions sshd requires
    assert_script_run "mkdir -p ~/.ssh";
    assert_script_run "chmod 700 ~/.ssh";
    # Append the host's hashed (-H) key to known_hosts
    assert_script_run "ssh-keyscan -H $host_to_add >> ~/.ssh/known_hosts";
}
=head2 choose_node
choose_node( $node_number );
Returns the hostname of the node identified by B<$node_number>. This information
relies on the node hostnames, so be sure to define proper hostnames in the tests
settings, for example B<alpha-node01>, B<alpha-node02>, etc.
=cut
# Build the hostname of the node identified by $node_number by swapping the
# two-digit suffix of the SUT's own hostname (alpha-node01 -> alpha-node03).
sub choose_node {
    my $node_number = shift;
    my $hostname = get_hostname;
    # Node number must be coded with 2 digits
    my $suffix = sprintf("%02d", $node_number);
    # Replace the current two-digit suffix with the requested one
    $hostname =~ s/(.*)[0-9][0-9]$/$1$suffix/;
    return $hostname;
}
=head2 save_state
save_state();
Prints the cluster configuration and cluster status in SUT, and saves the
screenshot.
=cut
sub save_state {
    # 'yes |' answers any interactive prompt from 'crm configure show'
    script_run 'yes | crm configure show', $default_timeout;
    # Show current cluster status and keep a screenshot as evidence
    assert_script_run "$crm_mon_cmd", $default_timeout;
    save_screenshot;
}
=head2 is_package_installed
is_package_installed( $package );
Checks if B<$package> is installed in SUT. Returns true or false.
=cut
# True when the given RPM package is installed in SUT.
sub is_package_installed {
    my $package = shift;
    # rpm -q exits 0 only when the package is installed
    my $rc = script_run "rpm -q $package";
    _test_var_defined $rc;
    return ($rc == 0);
}
=head2 check_rsc
check_rsc( $resource );
Checks if cluster resource B<$resource> is configured in the cluster. Returns
true or false.
=cut
# True when the resource name appears (as a whole word) in the crm_mon
# status output, i.e. the resource is configured in the cluster.
sub check_rsc {
    my $rsc = shift;
    my $rc = script_run "grep -q '\\<$rsc\\>' <($crm_mon_cmd 2>/dev/null)";
    _test_var_defined $rc;
    return ($rc == 0);
}
=head2 ensure_process_running
ensure_process_running( $process );
Checks for up to B<$default_timeout> seconds whether process B<$process> is
running in SUT. Returns 0 if process is running or croaks on timeout.
=cut
sub ensure_process_running {
    my $process = shift;
    my $starttime = time;
    my $ret = undef;
    # Poll every 5 seconds until the process shows up in 'ps -A'
    # (script_run returns 0 once grep matches, ending the loop)
    while ($ret = script_run "ps -A | grep -q '\\<$process\\>'") {
        my $timerun = time - $starttime;
        if ($timerun < $default_timeout) {
            sleep 5;
        }
        else {
            die "Process '$process' did not start within $default_timeout seconds";
        }
    }
    # script_run need to be defined to ensure a correct exit code
    _test_var_defined $ret;
    return $ret;
}
=head2 ensure_resource_running
ensure_resource_running( $resource, $regexp );
Checks for up to B<$default_timeout> seconds in the output of
C<crm resource status $resource> if a resource B<$resource> is configured in
the cluster; uses B<$regexp> to check. Returns 0 on success or croaks on timeout.
=cut
sub ensure_resource_running {
    my ($rsc, $regex) = @_;
    my $starttime = time;
    my $ret = undef;
    # Poll every 5 seconds until 'crm resource status' output matches $regex
    # (script_run returns 0 once grep matches, ending the loop)
    while ($ret = script_run("grep -E -q '$regex' <(crm resource status $rsc)", $default_timeout)) {
        my $timerun = time - $starttime;
        if ($timerun < $default_timeout) {
            sleep 5;
        }
        else {
            die "Resource '$rsc' did not start within $default_timeout seconds";
        }
    }
    # script_run need to be defined to ensure a correct exit code
    _test_var_defined $ret;
    return $ret;
}
=head2 ensure_dlm_running
ensure_dlm_running();
Checks that the C<dlm> resource is running in the cluster, and that its
associated process (B<dlm_controld>) is running in SUT. Returns 0 if
process is running or croaks on error.
=cut
# Verify the dlm resource is configured in the cluster, then wait for its
# controlling daemon (dlm_controld) to be running. Croaks on failure.
sub ensure_dlm_running {
    check_rsc 'dlm' or die 'dlm is not running';
    return ensure_process_running 'dlm_controld';
}
=head2 write_tag
write_tag( $tag );
Create a cluster-specific file in F</tmp/> of the SUT with B<$tag> as its content.
Returns 0 on success or croaks on failure.
=cut
# Write $tag into the cluster-specific tag file under /tmp.
# Returns true when the shell command succeeded.
sub write_tag {
    my $tag = shift;
    my $tag_file = '/tmp/' . get_cluster_name . '.rsc';
    my $rc = script_run "echo $tag > $tag_file";
    _test_var_defined $rc;
    return ($rc == 0);
}
=head2 read_tag
read_tag();
Read the content of the cluster-specific file created in F</tmp/> with
C<write_tag()>. Returns the content of the file or croaks on failure.
=cut
# Return the content of the cluster-specific tag file written by write_tag().
# script_output croaks when the command fails.
sub read_tag {
    my $tag_file = '/tmp/' . get_cluster_name . '.rsc';
    return script_output "cat $tag_file 2>/dev/null";
}
=head2 block_device_real_path
block_device_real_path( $device );
Returns the real path of the block device specified by B<$device> as shown
by C<realpath -ePL>, or croak on failure.
=cut
sub block_device_real_path {
    my $lun = shift;
    # realpath -ePL resolves symlinks; script_output croaks on failure
    return script_output "realpath -ePL $lun";
}
=head2 lvm_add_filter
lvm_add_filter( $type, $filter );
Add filter B<$filter> of type B<$type> to F</etc/lvm/lvm.conf>.
=cut
sub lvm_add_filter {
    my ($type, $filter) = @_;
    my $lvm_conf = '/etc/lvm/lvm.conf';
    # Prepend "$type|$filter|" as the first entry of the 'filter' array
    # ($type is usually 'a' for accept or 'r' for reject)
    assert_script_run "sed -ie '/^[[:blank:]][[:blank:]]*filter/s;\\[[[:blank:]]*;\\[ \"$type|$filter|\", ;' $lvm_conf";
}
=head2 lvm_remove_filter
lvm_remove_filter( $filter );
Remove filter B<$filter> from F</etc/lvm/lvm.conf>.
=cut
sub lvm_remove_filter {
    my $filter = shift;
    my $lvm_conf = '/etc/lvm/lvm.conf';
    # Delete every occurrence of the filter string from the 'filter' line
    assert_script_run "sed -ie '/^[[:blank:]][[:blank:]]*filter/s;$filter;;' $lvm_conf";
}
=head2 rsc_cleanup
rsc_cleanup( $resource );
Execute a C<crm resource cleanup> on the resource identified by B<$resource>.
=cut
sub rsc_cleanup {
    my $rsc = shift;
    assert_script_run "crm resource cleanup $rsc";
    # Check whether the resource still shows errors after the cleanup
    my $ret = script_run "crm_mon -1 2>/dev/null | grep -Eq \"$rsc.*'not configured'|$rsc.*exit\"";
    if (defined $ret and $ret == 0) {
        # Resource is not cleared, so we need to force cleanup
        # Record a soft failure for this, as a bug is opened
        record_soft_failure 'bsc#1071503';
        assert_script_run "crm_resource -R -r $rsc";
    }
}
=head2 ha_export_logs
ha_export_logs();
Upload HA-relevant logs from SUT. These include: crm configuration, cluster
bootstrap log, corosync configuration, B<crm report>, list of installed packages,
list of iSCSI devices, F</etc/mdadm.conf>, support config and B<y2logs>. If available,
logs from the B<HAWK> test, from B<CTS> and from B<HANA> are also included.
=cut
sub ha_export_logs {
    my $bootstrap_log = '/var/log/ha-cluster-bootstrap.log';
    my $corosync_conf = '/etc/corosync/corosync.conf';
    my $crm_log = '/var/log/crm_report';
    my $packages_list = '/tmp/packages.list';
    my $iscsi_devs = '/tmp/iscsi_devices.list';
    my $mdadm_conf = '/etc/mdadm.conf';
    my $clustername = get_cluster_name;
    # Older crm report versions (before 12-SP4) need -f0 to collect from epoch
    my $report_opt = !is_sle('12-sp4+') ? '-f0' : '';
    my $cts_log = '/tmp/cts_cluster_exerciser.log';
    my @y2logs;
    select_console 'root-console';
    # Extract HA logs and upload them.
    # Touch the files first so later uploads do not fail when they are missing
    script_run "touch $corosync_conf";
    script_run "crm report $report_opt -E $bootstrap_log $crm_log", 300;
    upload_logs("$bootstrap_log", failok => 1);
    # crm report appends a suffix to the file name; grab the latest one
    my $crm_log_name = script_output("ls $crm_log* | tail -1");
    upload_logs("$crm_log_name", failok => 1);
    script_run "crm configure show > /tmp/crm.txt";
    upload_logs('/tmp/crm.txt');
    # Extract YaST logs and upload them
    upload_y2logs(failok => 1) if is_sle('<16');
    # Generate the packages list
    script_run "rpm -qa > $packages_list";
    upload_logs("$packages_list", failok => 1);
    # iSCSI devices and their real paths
    script_run "ls -l /dev/disk/by-path/ > $iscsi_devs";
    upload_logs($iscsi_devs, failok => 1);
    # mdadm conf
    script_run "touch $mdadm_conf";
    upload_logs($mdadm_conf, failok => 1);
    # supportconfig; echo the exit code on the serial console so we can wait for it
    enter_cmd "supportconfig -g -B $clustername; echo DONE-\$?- > /dev/$serialdev";
    my $ret = wait_serial qr/DONE-\d+-/, timeout => 300;
    # Make it softfail for not blocking qem bot auto approvals on 12-SP5
    # Command 'supportconfig' hangs on 12-SP5, wait_serial times out and returns 'undef'
    if (!defined($ret) && is_sle("=12-SP5")) {
        record_soft_failure 'poo#151612';
        # Send 'ctrl-c' to kill 'supportconfig' as it hangs
        send_key('ctrl-c');
    }
    upload_logs("/var/log/scc_$clustername.tgz", failok => 1);
    # pacemaker cts log
    upload_logs($cts_log, failok => 1) if (get_var('PACEMAKER_CTS_TEST_ROLE'));
    # HAWK test logs if present
    upload_logs("/tmp/hawk_test.log", failok => 1);
    upload_logs("/tmp/hawk_test.ret", failok => 1);
    # HANA hdbnsutil logs
    if (check_var('CLUSTER_NAME', 'hana')) {
        script_run 'tar -zcf /tmp/trace.tgz $(find /hana/shared -name nameserver_*.trc)';
        upload_logs('/tmp/trace.tgz', failok => 1);
    }
}
=head2 check_cluster_state
check_cluster_state( [ proceed_on_failure => 1 ] );
Checks the state of the cluster. Calls B<$crm_mon_cmd> and inspects its output checking:
=over 3
=item The current state of the cluster.
=item Inactive resources.
=item S<partition with quorum>
=back
Checks that the reported number of nodes in the output of C<crm node list> and B<$crm_mon_cmd>
is the same.
And runs C<crm_verify -LV>.
With the named argument B<proceed_on_failure> set to 1, the function will use
B<script_run()> and attempt to run all commands in SUT without checking for errors.
Without it, the method uses B<assert_script_run()> and will croak on failure.
=cut
sub check_cluster_state {
    my %args = @_;
    # We may want to check cluster state without stopping the test
    my $cmd = (defined $args{proceed_on_failure} && $args{proceed_on_failure} == 1) ? \&script_run : \&assert_script_run;
    $cmd->("$crm_mon_cmd");
    if (is_sle '12-sp3+') {
        # Add sleep as command 'crm_mon' outputs 'Inactive resources:' instead of 'no inactive resources' on 12-sp5
        sleep 5;
        $cmd->("$crm_mon_cmd | grep -i 'no inactive resources'");
    }
    # The cluster must have a quorate partition
    $cmd->('crm_mon -1 | grep \'partition with quorum\'');
    # In older versions, node names in crm node list output are followed by ": normal". In newer ones by ": member"
    # Cross-check the node count reported by crm_mon against crm node list
    $cmd->(q/crm_mon -s | grep "$(crm node list | grep -E -c ': member|: normal') nodes online"/);
    # As some options may be deprecated, test shouldn't die on 'crm_verify'
    if (get_var('HDDVERSION')) {
        script_run 'crm_verify -LV';
    }
    else {
        $cmd->('crm_verify -LV');
    }
}
=head2 wait_until_resources_stopped
wait_until_resources_stopped( [ timeout => $timeout, minchecks => $tries ] );
Wait for resources to be stopped. Runs B<$crm_mon_cmd> until there are no resources
in B<stopping> state or up to B<$timeout> seconds. Timeout must be specified by the
named argument B<timeout> (defaults to 120 seconds). This timeout is scaled by the
factor specified in the B<TIMEOUT_SCALE> setting. The named argument B<minchecks>
(defaults to 3, can be disabled with 0) provides a minimum number of times to check
independently of the return status; this helps avoid race conditions where the method
checks before the HA stack starts to stop the resources. Croaks on timeout.
=cut
sub wait_until_resources_stopped {
    my %args = @_;
    my $timeout = bmwqemu::scale_timeout($args{timeout} // 120);
    my $ret = undef;
    my $starttime = time;
    # Minimum number of checks to run regardless of result, so we do not
    # declare success before the HA stack even begins stopping resources
    my $minchecks = $args{minchecks} // 3;
    do {
        # Exit code 0 when no resource is in 'stopping' state
        $ret = script_run "! ($crm_mon_cmd | grep -Eioq ':[[:blank:]]*stopping')", $default_timeout;
        # script_run need to be defined to ensure a correct exit code
        _test_var_defined $ret;
        my $timerun = time - $starttime;
        --$minchecks if ($minchecks);
        if ($timerun < $timeout) {
            sleep 5;
        }
        else {
            die "Cluster/resources did not stop within $timeout seconds";
        }
    } while ($minchecks || $ret);
}
=head2 wait_until_resources_started
wait_until_resources_started( [ timeout => $timeout ] );
Wait for resources to be started. Runs C<crm cluster wait_for_startup> in SUT as well
as other verifications on newer versions of SLES (12-SP3+), for up to B<$timeout> seconds
for each command. Timeout must be specified by the named argument B<timeout> (defaults
to 120 seconds). This timeout is scaled by the factor specified in the B<TIMEOUT_SCALE>
setting. Croaks on timeout.
=cut
# If changing this, remember to also change wait_until_resources_started in tests/publiccloud/sles4sap.pm
sub wait_until_resources_started {
    my %args = @_;
    my @cmds = ('crm cluster wait_for_startup');
    my $timeout = bmwqemu::scale_timeout($args{timeout} // 120);
    my $ret = undef;
    # Some CRM options can only been added on recent versions
    push @cmds, "grep -iq 'no inactive resources' <($crm_mon_cmd)" if is_sle '12-sp3+';
    # No resource may be left in 'failed' or 'starting' state
    push @cmds, "! (grep -Eioq ':[[:blank:]]*failed|:[[:blank:]]*starting' <($crm_mon_cmd))";
    # Execute each command to validate that the cluster is running
    # This can takes time, so a loop is a good idea here
    foreach my $cmd (@cmds) {
        # Each command execution has its own timeout, so we need to reset the counter
        my $starttime = time;
        # Check for cluster/resources status and exit loop when needed
        while ($ret = script_run("$cmd", $default_timeout)) {
            # Otherwise wait a while if timeout is not reached
            my $timerun = time - $starttime;
            if ($timerun < $timeout) {
                sleep 5;
            }
            else {
                # Dump status before dying, to ease debugging
                record_info('Cluster status', script_output("$crm_mon_cmd"));
                save_state();
                die "Cluster/resources did not start within $timeout seconds (cmd='$cmd')";
            }
        }
        # script_run need to be defined to ensure a correct exit code
        _test_var_defined $ret;
    }
}
=head2 wait_for_idle_cluster
wait_for_idle_cluster( [ timeout => $timeout ] );
Use C<cs_wait_for_idle> to wait until the cluster is idle before continuing the tests.
Supply a timeout with the named argument B<timeout> (defaults to 120 seconds). This
timeout is scaled by the factor specified in the B<TIMEOUT_SCALE> setting. Croaks on
timeout.
=cut
sub wait_for_idle_cluster {
    my %args = @_;
    my $timeout = bmwqemu::scale_timeout($args{timeout} // 120);
    my $outoftime = time() + $timeout;    # Current time plus timeout == time at which timeout will be reached
    return if script_run 'rpm -q ClusterTools2';    # cs_wait_for_idle only present if ClusterTools2 is installed
    while (1) {
        # cs_wait_for_idle blocks (polling every 5s) and reports the DC state
        my $out = script_output 'cs_wait_for_idle --sleep 5', $timeout;
        last if ($out =~ /Cluster state: S_IDLE/);
        sleep 5;
        die "Cluster was not idle for $timeout seconds" if (time() >= $outoftime);
    }
}
=head2 get_lun
get_lun( [ use_once => $bool ] );
Returns a LUN from the LUN list file stored in the support server or in the support
NFS share in scenarios without support server. If the named argument B<use_once>
is passed and set to true (defaults to true), the returned LUN will be removed from
the file, so it will not be selected again. Croaks on failure.
=cut
# This function returns the first available LUN
# This function returns the first available LUN
sub get_lun {
    my %args = @_;
    my $hostname = get_hostname;
    my $cluster_name = get_cluster_name;
    my $lun_list_file = '/tmp/' . $cluster_name . '-lun.list';
    # By default a returned LUN is consumed (removed from the list)
    my $use_once = $args{use_once} // 1;
    my $supportdir = get_var('NFS_SUPPORT_DIR', '/mnt');
    # Use mutex to be sure that only *one* node at a time can access the file
    mutex_lock 'iscsi';
    # Get the LUN file from the support server to have an up-to-date version
    if (get_var('USE_SUPPORT_SERVER')) {
        exec_and_insert_password "scp -o StrictHostKeyChecking=no root\@ns:$lun_list_file $lun_list_file";
    }
    else {
        assert_script_run "cp $supportdir/$cluster_name-lun.list $lun_list_file";
    }
    # Extract the first *free* line for this server
    # (lines already tagged with this hostname are skipped)
    my $lun = script_output "grep -Fv '$hostname' $lun_list_file | awk 'NR==1 { print \$1 }'";
    # Die if no LUN is found
    die "No LUN found in $lun_list_file" if (!length $lun);
    if ($use_once) {
        # Remove LUN if needed
        # (escape slashes so the path can be used in the sed address)
        my $tmp_lun = $lun;
        $tmp_lun =~ s/\//\\\//g;
        assert_script_run "sed -i '/$tmp_lun/d' $lun_list_file";
    }
    else {
        # Add the hostname as a tag in the LUN file
        # So in next call, get_lun will not return this LUN for this host
        assert_script_run "sed -i -E 's;^($lun([[:blank:]]|\$).*);\\1 $hostname;' $lun_list_file";
    }
    # Copy the modified file on the support server (for the other nodes)
    if (get_var('USE_SUPPORT_SERVER')) {
        exec_and_insert_password "scp -o StrictHostKeyChecking=no $lun_list_file root\@ns:$lun_list_file";
    }
    else {
        assert_script_run "cp $lun_list_file $supportdir/$cluster_name-lun.list";
    }
    mutex_unlock 'iscsi';
    # Return the real path of the block device
    return $lun;
}
=head2 check_device_available
check_device_available( $device, [ $timeout ] );
Checks for the presence of a device in the SUT for up to a defined timeout
(defaults to 20 seconds). Returns 0 on success, or croaks on failure.
=cut
# Check for the presence of device $dev in SUT, polling every 2 seconds for
# up to $tout seconds (defaults to 20). Returns 0 (the successful exit code
# of 'ls') when the device appears, croaks otherwise.
sub check_device_available {
    my ($dev, $tout) = @_;
    my $ret;
    # Number of 2-second polls; scale_timeout with a small timeout (or a
    # TIMEOUT_SCALE below 1) could yield 0 tries, which would leave $ret
    # undefined and die with a misleading message — always poll at least once
    my $tries = bmwqemu::scale_timeout($tout ? int($tout / 2) : 10);
    $tries = 1 if ($tries < 1);
    die "Must provide a device for check_device_available" unless (defined $dev);
    # Loop until the device is listed (exit code 0) or tries are exhausted
    while ($tries and $ret = script_run "ls -la $dev") {
        --$tries;
        sleep 2;
    }
    _test_var_defined $ret;
    die "Device $dev not found" unless ($tries > 0 or $ret == 0);
    return $ret;
}
=head2 set_lvm_config
set_lvm_config( $lvm_config_file, [ use_lvmetad => $val1, locking_type => $val2, use_lvmlockd => $val3, ... ] );
Configures the LVM parameters/values pairs passed as a HASH into the LVM configuration
file specified by the first argument B<$lvm_config_file>. These LVM parameters are
usually B<use_lvmetad>, B<locking_type> and B<use_lvmlockd> but any other existing
parameter from the LVM configuration file is also valid. Parameters that do not exist
in the LVM configuration file in SUT will be ignored. Returns 0 on success or croaks
on failure.
=cut
sub set_lvm_config {
    my ($lvm_conf, %args) = @_;
    my $cmd;
    # Rewrite each "param = value" line in the config file with the supplied
    # value; parameters absent from the file are silently left untouched
    foreach my $param (keys %args) {
        $cmd = sprintf("sed -ie 's/^\\([[:blank:]]*%s[[:blank:]]*=\\).*/\\1 %s/' %s", $param, $args{$param}, $lvm_conf);
        assert_script_run $cmd;
    }
    # Show the resulting values of the usual locking-related parameters
    script_run "grep -E '^[[:blank:]]*use_lvmetad|^[[:blank:]]*locking_type|^[[:blank:]]*use_lvmlockd' $lvm_conf";
}
=head2 add_lock_mgr
add_lock_mgr( $lock_manager, [ force => bool ] );
Configures a B<$lock_manager> resource in the cluster configuration on SUT.
B<$lock_mgr> usually is either B<clvmd> or B<lvmlockd>, but any other cluster
primitive could work as well.
Takes a second named argument B<force> which if set to true will add C<--force>
to the B<crmsh> command. Should be used with care. Defaults to false.
=cut
sub add_lock_mgr {
    my ($lock_mgr, %args) = @_;
    $args{force} //= 0;
    # Use 'crm --force' only when explicitly requested by the caller
    my $cmd = join(' ', 'crm', ($args{force} ? '--force' : ''), 'configure', 'edit');
    # Edit the CIB non-interactively by abusing EDITOR: first append the
    # primitive, then add it to the existing base-group
    assert_script_run "EDITOR=\"sed -ie '\$ a primitive $lock_mgr ocf:heartbeat:$lock_mgr'\" $cmd";
    assert_script_run "EDITOR=\"sed -ie 's/^\\(group base-group.*\\)/\\1 $lock_mgr/'\" $cmd";
    # Wait to get clvmd/lvmlockd running on all nodes
    sleep 5;
}
# Remember the console selected before this module ran, so that
# post_run_hook can restore it afterwards.
sub pre_run_hook {
    my ($self) = @_;
    # The variable holding the selected console moved between isotovideo versions
    if (isotovideo::get_version() == 12) {
        $prev_console = $autotest::selected_console;
    } else {
        # perl -c will give a "only used once" message
        # here and this makes the ci tests fail.
        1 if defined $testapi::selected_console;
        $prev_console = $testapi::selected_console;
    }
}
# Restore the console that was selected before the module ran (saved by
# pre_run_hook) and leave it in a usable state.
sub post_run_hook {
    my ($self) = @_;
    # Nothing to restore when pre_run_hook recorded no console
    return unless ($prev_console);
    select_console($prev_console, await_console => 0);
    if ($prev_console eq 'x11') {
        ensure_unlocked_desktop;
    }
    else {
        $self->clear_and_verify_console;
    }
}