Skip to content

Commit

Permalink
Merge pull request #1486 from mudler/worker_optimizations
Browse files Browse the repository at this point in the history
Worker worker_status timer optimizations
  • Loading branch information
foursixnine authored Nov 6, 2017
2 parents 4da589c + b71811f commit 9575846
Show file tree
Hide file tree
Showing 11 changed files with 259 additions and 91 deletions.
21 changes: 21 additions & 0 deletions lib/OpenQA/Utils.pm
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ $VERSION = sprintf "%d.%03d", q$Revision: 1.12 $ =~ /(\d+)/g;
read_test_modules
exists_worker
safe_call
feature_scaling
logistic_map_steps
logistic_map
rand_range
in_range
);

if ($0 =~ /\.t$/) {
Expand Down Expand Up @@ -839,5 +844,21 @@ sub safe_call {
return $ret;
}

# Args:
# First is i-th element, Second is maximum element number, Third and Fourth are the range limit (lower and upper)
# $i, $imax, MIN, MAX
sub feature_scaling { $_[2] + ((($_[0] - 1) * ($_[3] - $_[2])) / (($_[1] - 1) || 1)) }
# $r, $xn
sub logistic_map { $_[0] * $_[1] * (1 - $_[1]) }
# $steps, $r, $xn
sub logistic_map_steps {
$_[2] = 0.1 if $_[2] <= 0; # do not let population die. - with this change we get more "chaos"
$_[2] = logistic_map($_[1], $_[2]) for (1 .. $_[0]);
$_[2];
}
sub rand_range { $_[0] + rand($_[1] - $_[0]) }
sub in_range { $_[0] >= $_[1] && $_[0] <= $_[2] ? 1 : 0 }


1;
# vim: set sw=4 et:
51 changes: 27 additions & 24 deletions lib/OpenQA/WebSockets/Server.pm
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@ use OpenQA::Utils qw(log_debug log_warning log_error);
use OpenQA::Schema;
use OpenQA::ServerStartup;
use Data::Dumper;

use Data::Dump 'pp';
use db_profiler;

use constant WORKERS_CHECKER_THRESHOLD => 120;

require Exporter;
our (@ISA, @EXPORT, @EXPORT_OK);

Expand Down Expand Up @@ -200,19 +202,7 @@ sub _message {
}

$worker->{last_seen} = time();
if ($json->{type} eq 'ok') {
$ws->tx->send({json => {type => 'ok'}});
# NOTE: Update the worker state from keepalives.
# We could check if the worker is dead before updating seen state
# the downside of it will be that we will have more timewindows
# where the worker is seen as dead.
#
# if ($w and $w->dead()) # It's still one query, at this point let's just update the seen status
# log_debug("Keepalive from worker $worker->{id} received, and worker thought dead. updating the DB");
app->schema->txn_do(sub { my $w = app->schema->resultset("Workers")->find($worker->{id}); $w->seen; })
if $worker && exists $worker->{id};
}
elsif ($json->{type} eq 'accepted') {
if ($json->{type} eq 'accepted') {
my $jobid = $json->{jobid};
log_debug("Worker: $worker->{id} accepted job $jobid");
}
Expand Down Expand Up @@ -247,14 +237,18 @@ sub _message {
$worker_status->{$wid} = $json;
log_debug(sprintf('Received from worker "%u" worker_status message "%s"', $wid, Dumper($json)));

# XXX: This would make keepalive useless.
# app->schema->txn_do(
# sub {
# my $w = app->schema->resultset("Workers")->find($wid);
# return unless $w;
# log_debug("Updated worker seen from worker_status");
# $w->seen;
# });
try {
app->schema->txn_do(
sub {
my $w = app->schema->resultset("Workers")->find($wid);
return unless $w;
log_debug("Updated worker seen from worker_status");
$w->seen;
});
}
catch {
log_error("Failed updating worker seen status: $_");
};

my $registered_job_id;
my $registered_job_token;
Expand All @@ -266,6 +260,15 @@ sub _message {
if $worker_status->{$wid}->{job}->{id};
};

try {
my $workers_population = app->schema->resultset("Workers")->count();
my $msg = {type => 'info', population => $workers_population};
$ws->tx->send({json => $msg} => sub { log_debug("Sent population to worker: " . pp($msg)) });
}
catch {
log_debug("Could not be able to send population number to worker: $_");
};

try {
# We cover the case where id can be the same, but the token will differ.
die "Do not check" unless ($registered_job_id);
Expand Down Expand Up @@ -327,6 +330,7 @@ sub _message {
catch {
log_debug("Failed parsing status message : $_");
};

}
else {
log_error(sprintf('Received unknown message type "%s" from worker %u', $json->{type}, $worker->{id}));
Expand Down Expand Up @@ -384,8 +388,7 @@ sub _workers_checker {
try {
$schema->txn_do(
sub {
my $threshold = 40;
my $stale_jobs = _get_stale_worker_jobs($threshold);
my $stale_jobs = _get_stale_worker_jobs(WORKERS_CHECKER_THRESHOLD);
for my $job ($stale_jobs->all) {
next unless _is_job_considered_dead($job);

Expand Down
16 changes: 9 additions & 7 deletions lib/OpenQA/Worker/Commands.pm
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use OpenQA::Worker::Common;
use OpenQA::Worker::Jobs;
use POSIX ':sys_wait_h';
use OpenQA::Worker::Engines::isotovideo;
use Data::Dump 'pp';

## WEBSOCKET commands
sub websocket_commands {
Expand All @@ -35,12 +36,11 @@ sub websocket_commands {
}
}
else {
# requests
my $type = $json->{type};
if (!$type) {
log_warning('Received WS message without type!');
if (!$json->{type}) {
log_warning('Received WS message without type! ' . pp($json));
return;
}
my $type = $json->{type};
my $jobid = $json->{jobid} // '';
my $joburl;
my $host = $ws_to_host->{$tx};
Expand Down Expand Up @@ -70,6 +70,11 @@ sub websocket_commands {
log_debug("received command: $type");
stop_job($type);
}
elsif ($type eq 'info') {
$hosts->{$host}{population} = $json->{population} if $json->{population};
log_debug("Population for $host is " . $hosts->{$host}{population});
change_timer("workerstatus-$host", OpenQA::Worker::Common::calculate_status_timer($hosts, $host));
}
elsif ($type eq 'stop_waitforneedle') {
if (backend_running) {
$ua->post("$joburl/isotovideo/stop_waitforneedle");
Expand Down Expand Up @@ -129,9 +134,6 @@ sub websocket_commands {
}
}
}
elsif ($type eq 'ok') {
# ignore keepalives, but dont' report as unknown
}
elsif ($type eq 'grab_job') {
state $check_job_running;
state $job_in_progress;
Expand Down
83 changes: 74 additions & 9 deletions lib/OpenQA/Worker/Common.pm
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ use Carp;
use POSIX 'uname';
use Mojo::URL;
use OpenQA::Client;
use OpenQA::Utils qw(log_error log_debug log_warning log_info);
use OpenQA::Utils qw(log_error log_debug log_warning log_info), qw(feature_scaling rand_range logistic_map_steps);
use Scalar::Util 'looks_like_number';
use Config::IniFiles;
use List::Util 'max';

use base 'Exporter';
our @EXPORT = qw($job $instance $worker_settings $pooldir $nocleanup
Expand Down Expand Up @@ -56,6 +59,8 @@ my ($sysname, $hostname, $release, $version, $machine) = POSIX::uname();
use constant {
STATUS_UPDATES_SLOW => 10,
STATUS_UPDATES_FAST => 0.5,
MAX_TIMER => 100, # It should never be more than OpenQA::WebSockets::Server::_workers_checker threshold
MIN_TIMER => 20,
};

# the template noted what architecture are known
Expand Down Expand Up @@ -346,29 +351,56 @@ sub send_status {
$tx->send($status_message);
}

sub calculate_status_timer {
my ($hosts, $host) = @_;
my $i = $hosts->{$host}{workerid} ? $hosts->{$host}{workerid} : looks_like_number($instance) ? $instance : 1;
my $imax = $hosts->{$host}{population} ? $hosts->{$host}{population} : 1;
my $scale_factor = $imax;
my $steps = 215;
my $r = 3.81199961;

# my $scale_factor = 4;
# my $scale_factor = (MAX_TIMER - MIN_TIMER)/MIN_TIMER;
# log_debug("I: $i population: $imax scale_factor: $scale_factor");

# XXX: we are using now fixed values, to stick with a
# predictable behavior but random intervals
# seems to work as well.
# my $steps = int(rand_range(2, 120));
# my $r = rand_range(3.20, 3.88);

my $population = feature_scaling($i, $imax, 0, 1);
my $status_timer
= abs(feature_scaling(logistic_map_steps($steps, $r, $population) * $scale_factor, $imax, MIN_TIMER, MAX_TIMER));
$status_timer = $status_timer > MIN_TIMER
&& $status_timer < MAX_TIMER ? $status_timer : $status_timer > MAX_TIMER ? MAX_TIMER : MIN_TIMER;
return int($status_timer);
}

sub call_websocket {
my ($host, $ua_url) = @_;
my $ua = $hosts->{$host}{ua};
my $status_timer = calculate_status_timer($hosts, $host, $instance, $worker_settings);

log_debug("worker_status timer time window: $status_timer");
$ua->websocket(
$ua_url => {'Sec-WebSocket-Extensions' => 'permessage-deflate'} => sub {
my ($ua, $tx) = @_;
if ($tx->is_websocket) {
# keep websocket connection busy
$tx->send({json => {type => 'ok'}}); # Send keepalive immediately
$hosts->{$host}{timers}{keepalive}
= add_timer("keepalive-$host", 10, sub { $tx->send({json => {type => 'ok'}}); });

$hosts->{$host}{timers}{status} = add_timer("workerstatus-$host", 15,
sub { send_status($tx); log_debug("Sending worker status to $host (workerstatus timer)"); });
$hosts->{$host}{timers}{status} = add_timer(
"workerstatus-$host",
$status_timer,
sub {
send_status($tx);
log_debug("Sending worker status to $host (workerstatus timer)");
});

$tx->on(json => \&OpenQA::Worker::Commands::websocket_commands);
$tx->on(
finish => sub {
my (undef, $code, $reason) = @_;
log_debug("Connection turned off from $host - $code : "
. (defined $reason ? $reason : "Not specified"));
remove_timer("keepalive-$host");
remove_timer("workerstatus-$host");

$hosts->{$host}{timers}{setup_websocket}
Expand Down Expand Up @@ -520,4 +552,37 @@ sub verify_workerid {
return $hosts->{$host}{workerid};
}

sub read_worker_config {
my ($instance, $host) = @_;
my $worker_dir = $ENV{OPENQA_CONFIG} || '/etc/openqa';
my $cfg = Config::IniFiles->new(-file => $worker_dir . '/workers.ini');

my $sets = {};
for my $section ('global', $instance) {
if ($cfg && $cfg->SectionExists($section)) {
for my $set ($cfg->Parameters($section)) {
$sets->{uc $set} = $cfg->val($section, $set);
}
}
}
# use separate set as we may not want to advertise other host confiuration to the world in job settings
my $host_settings;
$host ||= $sets->{HOST} ||= 'localhost';
delete $sets->{HOST};
my @hosts = split / /, $host;
for my $section (@hosts) {
if ($cfg && $cfg->SectionExists($section)) {
for my $set ($cfg->Parameters($section)) {
$host_settings->{$section}{uc $set} = $cfg->val($section, $set);
}
}
else {
$host_settings->{$section} = {};
}
}
$host_settings->{HOSTS} = \@hosts;

return $sets, $host_settings;
}

1;
3 changes: 1 addition & 2 deletions script/openqa-websockets
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,7 @@ BEGIN {
$ENV{MOJO_LISTEN} ||= 'http://localhost:9527/';

# allow up to 20GB - hdd images
$ENV{MOJO_MAX_MESSAGE_SIZE} = 1024 * 1024 * 1024 * 20;
$ENV{MOJO_INACTIVITY_TIMEOUT} = 300;
$ENV{MOJO_MAX_MESSAGE_SIZE} = 1024 * 1024 * 1024 * 20;

use OpenQA::WebSockets;

Expand Down
36 changes: 1 addition & 35 deletions script/worker
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,6 @@ BEGIN {

use FindBin;
use lib "$FindBin::Bin/../lib";
use Config::IniFiles;
use Getopt::Long;
Getopt::Long::Configure("no_ignore_case");

Expand All @@ -124,43 +123,10 @@ GetOptions(

usage(0) if ($options{help});

sub read_worker_config {
my ($instance, $host) = @_;
my $worker_dir = $ENV{OPENQA_CONFIG} || '/etc/openqa';
my $cfg = Config::IniFiles->new(-file => $worker_dir . '/workers.ini');

my $sets = {};
for my $section ('global', $instance) {
if ($cfg && $cfg->SectionExists($section)) {
for my $set ($cfg->Parameters($section)) {
$sets->{uc $set} = $cfg->val($section, $set);
}
}
}
# use separate set as we may not want to advertise other host confiuration to the world in job settings
my $host_settings;
$host ||= $sets->{HOST} ||= 'localhost';
delete $sets->{HOST};
my @hosts = split / /, $host;
for my $section (@hosts) {
if ($cfg && $cfg->SectionExists($section)) {
for my $set ($cfg->Parameters($section)) {
$host_settings->{$section}{uc $set} = $cfg->val($section, $set);
}
}
else {
$host_settings->{$section} = {};
}
}
$host_settings->{HOSTS} = \@hosts;

return $sets, $host_settings;
}

# count workers from 1 if not set - if tap devices are used worker would try to use tap -1
$options{instance} ||= 1;

my ($worker_settings, $host_settings) = read_worker_config($options{instance}, $options{host});
my ($worker_settings, $host_settings) = OpenQA::Worker::Common::read_worker_config($options{instance}, $options{host});
$worker_settings->{LOG_LEVEL} = 'debug' if $options{verbose};
$OpenQA::Worker::Common::worker_settings = $worker_settings;
# XXX: this should be sent to the scheduler to be included in the worker's table
Expand Down
Loading

0 comments on commit 9575846

Please sign in to comment.