#!/usr/bin/python
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Copyright (C) 2011 Tieto Corporation
# Contact information: Ville Seppanen, [email protected]
# Updated: 2011-06-20
# Tested with: OBS 2.1.6, Boto 2.0b4, Python 2.6.5, openSUSE 11.3
#
#
# Elastic Build Service Manager
#
# This script controls the cloudbursting of an Elastic Build Service that is
# based on the openSUSE Build Service (OBS). The idea is to use virtual
# machines from IaaS providers as build hosts. The script fetches virtual
# machine metrics from Amazon Web Services (AWS) as well as build job
# metrics from the local OBS server, and based on these metrics decides
# whether to create or destroy virtual machines in AWS EC2. Running
# this script may cost money. It is designed to run as a cron job every
# minute or two.
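#
# Example crontab entry (illustrative only; adjust the path and user to your
# installation):
#   */2 * * * * root /usr/local/bin/elastic_build_service_manager.py --cron-mode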
#
#
# Prerequisites for successfully running this script:
# - AWS account
# - access to an Amazon Machine Image (AMI) that will on startup:
# -- create a VPN tunnel to your OBS server network
# -- start OBS worker(s)
# - private installation of OBS
# - Boto installed http://boto.cloudhackers.com
#
# Example .boto config:
# [Credentials]
# aws_access_key_id = AVTIOJERVAIOEJTIAOTVJ
# aws_secret_access_key = a369840983a6n03a986bah34098g
# [Boto]
# debug = 1
# num_retries = 3
# https_validate_certificates = True
#
# Example .oscrc config:
# [general]
# # URL to access API server
# apiurl = http://localhost:81
# # run osc to get these auto-generated for you
# [http://localhost:81]
# user=exampleuser
# passx=yzerMZYRNrzyNYZRNYRnxdryXNDRYDXNRY
#
# Refer to OBS and boto documentation for more help.
#
# TO-DO LIST:
# - fail nicely if AWS or OBS are unreachable or reject requests
# - allow more customization info to be passed through (e.g. IP addresses)
# - use libcloud to avoid AWS lock-in, use separate monitoring software
# - lock the script to prevent multiple copies running at the same time
import osc.conf # for OBS usage
import osc.core # for OBS usage
from xml.dom import minidom # for XML parsing from OBS API
import boto.ec2 # Python library for EC2 access
import boto.ec2.cloudwatch # for EC2 monitoring info
import boto.utils
from boto.exception import BotoServerError
import time
import datetime # for timestamps
from pprint import pprint # for debug prints
import sys # for arguments
### Configuration variables ###################################################
# Path to log file (if you run this script manually, make sure you have rights
# to write to the log as well)
log_path = "/home/user/build_service_manager.log"
# ID of the AWS Machine Image (AMI) that this script will boot. This image
# should contain obs-worker that autoconnects to your OBS server on boot.
aws_ami_image = "ami-47cefa33"
# AWS keypair to use with the machines. This is needed for SSH connections to
# the workers (which are needed for e.g. debugging).
aws_keypair_name = "MyKeyPair"
# Name of the AWS security group to use. No open ports is ok as your worker
# should create a VPN outwards.
aws_security_group = "MySecurityGroup"
# Type of AWS instance to use. For the first year, t1.micro is free and good
# for testing, but with only 600MB of RAM it fails bigger builds.
# Good choices: t1.micro (for testing), m1.small, c1.medium
aws_instance_type = "m1.small"
# AWS region to use and CloudWatch (AWS monitoring) address to use
aws_cloudwatch_url = "monitoring.eu-west-1.amazonaws.com"
aws_region = "eu-west-1"
# If you want more debugging info than what you get by adding debug=2 to the
# .boto config file, uncomment the following line.
# boto.set_stream_logger('debug')
### Function declarations #####################################################
# Starts a new VM instance in EC2. Parameter obs_worker_id is set as the
# hostname and as EC2 instance metatag "Name" to identify build hosts.
def run_instance(obs_worker_id):
global print_only
if not print_only:
# WARNING: Do NOT start more than 1 instance at a time here!
# Multilaunching is managed at higher level to create unique tags etc.
instances_min = 1
instances_max = 1
worker_customization = "#!/bin/sh\nhostname " + obs_worker_id +\
";echo " + obs_worker_id + " > /etc/HOSTNAME;"
global aws_ami_image
global aws_keypair_name
global aws_security_group
global aws_instance_type
reservation = ec2.run_instances(
image_id = aws_ami_image,
min_count = instances_min,
max_count = instances_max,
key_name = aws_keypair_name,
security_groups = [aws_security_group],
user_data = worker_customization,
instance_type = aws_instance_type)
#placement='eu-west-1b')
instance = reservation.instances[0]
ec2.create_tags([instance.id],{'Name':obs_worker_id})
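        # The "Name" tag is how fetch_metrics() later recognizes our elastic
        # build hosts among all instances in the account.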
global elastic_build_hosts
elastic_build_hosts.append({'instance_name':obs_worker_id,
'instance_id':instance.id,
'instance_type':instance.instance_type,
'cpu':"-", 'disk':"-",
'instance_state':"SPAWNING",
'launch_time':0,
'time_left':60, 'ip':"-",
'workers':[]})
return
# Fetches metrics from OBS and EC2 and saves the data to parameters
# elastic_build_hosts and job_status.
def fetch_metrics(elastic_build_hosts, job_status):
global cloud_buildhost_prefix
global local_buildhost_prefix
global aws_cloudwatch_url
global debug_status_string
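    # Connect to the regional CloudWatch endpoint configured above; CloudWatch
    # uses its own endpoint, separate from the EC2 API endpoint.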
cw = boto.connect_cloudwatch(host=aws_cloudwatch_url,port=443)
# Get EC2 metrics
# returns a list of boto.ec2.instance.Reservation
for machine in ec2.get_all_instances():
instance = machine.instances[0]
# tags are not required and thus may not exist
try:
name = instance.tags['Name']
except Exception:
name = "Unnamed"
        # instance.launch_time is a UTC timestamp, so compare against UTC
        date_now = datetime.datetime.utcnow()
launch_time = datetime.datetime.strptime(
instance.launch_time,
"%Y-%m-%dT%H:%M:%S.000Z")
time_left = (3600-(((date_now - launch_time).seconds)%3600))/60
cpu=-1
ip_address = "-"
        if instance.ip_address is not None:
ip_address = instance.ip_address
# Fetch instance CPU utilization metrics from CloudWatch
try:
end_time = date_now
start_time = end_time - datetime.timedelta(minutes=15)
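            # Ask CloudWatch for average CPU utilization over the last 15
            # minutes, aggregated into 300-second (5-minute) periods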
stats = cw.get_metric_statistics(
300,
start_time,
end_time,
'CPUUtilization',
'AWS/EC2',
'Average',
{"InstanceId":instance.id})
# Find latest value from the history list
latest_value = -1
latest_time = start_time
for value in stats:
if value['Timestamp'] > latest_time:
latest_time = value['Timestamp']
latest_value = value['Average']
# Let's make sure only absolute zero is shown as zero
if latest_value > 0 and latest_value < 1:
cpu = 1
else:
cpu = int(latest_value)
        except BotoServerError as serverErr:
            dbg("ERR - Could not retrieve CloudWatch metrics: " + str(serverErr))
if name.startswith(cloud_buildhost_prefix):
elastic_build_hosts.append({'instance_name':name,
'instance_id':instance.id,
'instance_type':instance.instance_type,
'cpu':cpu, 'disk':-1,
'instance_state':instance.state,
'launch_time':launch_time,
'time_left':time_left, 'ip':ip_address,
'workers':[]})
# Get OBS metrics
# Initialize osc configuration, API URL and credentials
osc.conf.get_config()
api = osc.conf.config['apiurl']
apitest = osc.core.http_GET(api + '/worker/_status')
dom = minidom.parseString(apitest.read())
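    # /worker/_status returns XML roughly of this shape (simplified example):
    #   <workerstatus>
    #     <idle workerid="cbh-12345678/1"/>
    #     <building workerid="cbh-12345678/2" package="foo" .../>
    #     <waiting arch="x86_64" jobs="3"/>
    #     <blocked arch="x86_64" jobs="1"/>
    #   </workerstatus>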
# store all idle workers
for node in dom.getElementsByTagName('idle'):
build_host_ok = False
# parse build host id from the worker id
parsed_hostname = node.getAttribute('workerid')\
[0:node.getAttribute('workerid').find("/")]
if parsed_hostname.startswith(cloud_buildhost_prefix):
# try to find it from the list
for build_host in elastic_build_hosts:
if build_host['instance_name'] == parsed_hostname:
build_host['workers'].append('IDLE')
build_host_ok = True
break
elif parsed_hostname.startswith(local_buildhost_prefix):
build_host_ok = True # ignore local build hosts
debug_status_string += "."
if not build_host_ok:
dbg("WARN - Strange host " + parsed_hostname)
# store all busy workers
for node in dom.getElementsByTagName('building'):
build_host_ok = False
# parse build host id from the worker id
parsed_hostname = node.getAttribute('workerid')\
[0:node.getAttribute('workerid').find("/")]
if parsed_hostname.startswith(cloud_buildhost_prefix):
for build_host in elastic_build_hosts:
if build_host['instance_name'] == parsed_hostname:
build_host['workers'].append(node.getAttribute('package'))
build_host_ok = True
break
elif parsed_hostname.startswith(local_buildhost_prefix):
build_host_ok = True # ignore local build hosts
debug_status_string += "o"
if not build_host_ok:
dbg("WARN - Strange host " + parsed_hostname + " or not building")
# count the total amount of waiting jobs
for node in dom.getElementsByTagName('waiting'):
        job_status['jobs_waiting_sum'] += int(node.getAttribute('jobs'))
# count the total amount of blocked jobs
for node in dom.getElementsByTagName('blocked'):
        job_status['jobs_blocked_sum'] += int(node.getAttribute('jobs'))
return
# Creates the initial connection to AWS EC2 to the region specified in
# parameter region. Region names can be obtained from AWS.
def connect_to_ec2(region):
connection = boto.ec2.connect_to_region(region)
dbg("INFO - Connected to: " + connection.host + ":" + str(connection.port))
dbg("INFO - Secure connection: " + str(connection.is_secure))
return connection
# Terminates the VM instance in EC2 specified by the instance_id parameter.
def terminate_instance(instance_id):
global print_only
if not print_only:
ec2.terminate_instances([instance_id])
return
# Prints a table of cloud workers and their status for debugging purposes.
def print_metrics(elastic_build_hosts, job_status):
print "LISTING ALL BUILD HOSTS\n",'EC2 ID'.ljust(10), 'HOSTNAME'.ljust(14),\
'TYPE'.ljust(10),'CPU'.ljust(3), 'STATE'.ljust(13), 'TTL'.ljust(3),\
'IP'.ljust(15), 'CURRENT JOB'
print "__________________________________________________" +\
"____________________________________________________"
for host in elastic_build_hosts:
workers = "Not connected to OBS"
if host['instance_state'] == 'stopped' or \
host['instance_state'] == 'terminated':
workers = "-"
if len(host['workers']) > 0:
workers = ""
for job in host['workers']:
workers = workers + job + " "
# Show time to next bill only if the machine is actually running
time_left = "-"
if host['instance_state'] == 'running':
time_left = str(host['time_left'])
cpu = "-"
if host['cpu'] >= 0:
cpu = str(host['cpu'])
print host['instance_id'].ljust(10), host['instance_name'].ljust(14),\
host['instance_type'].ljust(10), cpu.ljust(3), \
host['instance_state'].ljust(13), \
time_left.ljust(3), host['ip'].ljust(15), workers
return
# Prints a debug message if the script is run in verbose mode and not in
# cron mode.
def dbg(message):
global cron_mode
global verbose
if verbose and not cron_mode:
print datetime.datetime.now(), message
return
# Writes a single summary line per script run to the log file, recording the
# current situation and any changes made during this run. Appends "MAN" to
# the end of the line if the script was run manually and not from cron.
def log_write(message):
global cron_mode
manual_mode_notice = " MAN"
if cron_mode:
manual_mode_notice = ""
    # Log only actual events, not simulated runs
global print_only
global log_path
if not print_only:
f = open(log_path, 'a')
f.write(str(datetime.datetime.now()) + " " + message +
manual_mode_notice + "\n")
f.close()
return
# Starts a new build host and figures out a unique name for it.
def start_new():
global cloud_buildhost_prefix
time.sleep(1) # ugly hack to make sure everyone gets a unique name
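    # Unique-enough name: prefix plus the current Unix timestamp with its
    # first two digits dropped; the sleep above guarantees two consecutive
    # calls get different timestamps.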
obs_worker_id = cloud_buildhost_prefix + str(int(time.time()))[2:]
run_instance(obs_worker_id)
return
# This function will decide to either create or terminate instances or to do
# nothing. It will also try to detect various problematic situations.
def analyze_current_situation(elastic_build_hosts, jobs):
global debug_status_string
global cloud_buildhost_prefix
global cron_mode
    # These variables balance build efficiency against flapping (instances
    # being started and killed too eagerly).
    # How close (in minutes) to the next hourly fee an idle worker is allowed
    # to get before it is terminated
    ttl_threshold = 4
    # Max time (in minutes) a host may take to connect to the server; must be
    # less than 60 - ttl_threshold. 5 minutes is usually enough (a small
    # instance booted in about 2.5 minutes). This should also cover the time
    # it takes the server to hand out a job, so 3-5 is a good range.
    booting_time = 5
    max_kills_per_run = 5 # Max number of hosts to terminate in one run
    max_hosts = 15 # Max number of _running_ instances at any time
    max_instances_starting_up = 5 # Max number of hosts starting up at the same time
cpu_too_high = 20 # CPU percentage that is considered too high for idle
cpu_too_low = 5 # CPU percentage that is considered too low for building
instances_terminated_this_cycle = 0
instances_started_this_cycle = 0
instances_alive = 0 # running, pending, i.e. not shutdown or terminated
instances_starting_up = 0
#### KILL BAD AND UNNECESSARY BUILD HOSTS
# Check what every build host is doing currently and whether it is ok
for host in elastic_build_hosts:
# This is implemented in a reverse fashion so that every machine
# will be terminated unless there is a specific reason to let it live
should_be_terminated = True
name = host['instance_name']
# Check all not terminated workers
if ((host['instance_state'] == 'running') or \
(host['instance_state'] == 'pending')) and \
host['instance_name'].startswith(cloud_buildhost_prefix):
instances_alive += 1
workers_total = len(host['workers'])
# Check connection to OBS server
if workers_total <= 0:
dbg("WARN - " + name + " is not connected")
debug_status_string += "C" # not connected
else:
# Count idle workers
workers_idle = 0
for worker in host['workers']:
if worker == 'IDLE':
workers_idle += 1
# All workers are idle
if workers_total == workers_idle:
if host['cpu'] > cpu_too_high:
dbg("WARN - " + name + " has high cpu (" +\
str(host['cpu']) + ") for idle. Crashed?")
if jobs['jobs_waiting_sum'] > 0:
dbg("ERR - "+name+" has all idle but there's work")
debug_status_string += "L" # lazy worker
else:
if host['time_left'] < ttl_threshold:
dbg("OK - " + name + " idle and time to die.")
debug_status_string += "u" # unemployed
else:
dbg("OK - " + name + " is idle, wait " +\
str(host['time_left']-ttl_threshold) +\
" more mins")
debug_status_string += "i" # idle
should_be_terminated = False
# All workers are working
elif workers_idle == 0:
if host['cpu'] < cpu_too_low:
dbg("WARN - " + name + " has quite low cpu (" +\
str(host['cpu'])+ ") for building.")
debug_status_string += "W" # working + warning
should_be_terminated = False
else:
dbg("OK - " + name + " has all workers busy.")
debug_status_string += "w" # working
should_be_terminated = False
# Some are working and some idling
else:
if jobs['jobs_waiting_sum'] > 0:
dbg("WARN - " + name +\
" some workers idle but there's work")
debug_status_string += "M" # working + warning
should_be_terminated = False
else:
dbg("OK - " + name + " some workers idle, no jobs")
debug_status_string += "m" # working
should_be_terminated = False
# Terminate extra worker
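            # Minutes since launch; freshly started hosts get booting_time
            # minutes to connect to OBS before they may be terminated.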
            age = ((datetime.datetime.utcnow() - host['launch_time']).seconds)/60
if should_be_terminated:
if age < booting_time:
instances_starting_up += 1
dbg("OK - " + name + " is " + str(age) +\
" mins old and may still be starting up. Give it "\
+ str(booting_time-(60 - host['time_left'])) +\
" more minutes to boot.")
elif instances_terminated_this_cycle >= max_kills_per_run:
dbg("WARN - max amount of kills reached.")
else:
instances_terminated_this_cycle += 1
instances_alive -= 1
dbg("TERM - " + host['instance_id'])
terminate_instance(host['instance_id'])
host['instance_state'] = "TERMINATING"
debug_status_string += "-" # terminating instance
dbg("INFO - alive:"+str(instances_alive) + ", terminated_this_cycle:" +\
str(instances_terminated_this_cycle) +", jobs:" +\
str(jobs['jobs_waiting_sum']) + ", starting:" +\
str(instances_starting_up))
#### START NEW BUILD HOSTS IF NEEDED
if jobs['jobs_waiting_sum'] > 0:
dbg("OK - " + str(jobs['jobs_waiting_sum']) + " jobs ready, spawn!")
# Start more if limits have not been met
# TODO: This expects that 1 build host == 1 worker == 1 core!
while (jobs['jobs_waiting_sum']-instances_starting_up) > 0 and \
(instances_alive <= max_hosts) and \
(max_instances_starting_up > instances_starting_up):
start_new()
instances_alive += 1
instances_starting_up += 1
instances_started_this_cycle += 1
debug_status_string += "+" # starting instance
dbg("INFO - alive:"+str(instances_alive) + ", started_this_cycle:" +\
str(instances_started_this_cycle) +", jobs:" +\
str(jobs['jobs_waiting_sum']) + ", starting:" +\
str(instances_starting_up))
#### WRITE TO LOG
if instances_started_this_cycle > 0 or \
instances_terminated_this_cycle > 0 or not cron_mode:
log_write("srvman: job_queue=" + str(jobs['jobs_waiting_sum']) +"+"+\
str(jobs['jobs_blocked_sum']) + " [" +\
debug_status_string + "]")
return
### Main script ###############################################################
# Argument handling
print_only = False
cron_mode = False
verbose = False
for arg in sys.argv:
    if arg == "--print-only":
        print_only = True
    elif arg == "--cron-mode":
        cron_mode = True
    elif arg == "--verbose":
        verbose = True
    elif arg == "--help":
        print "Elastic Build Service Manager\nUsage: esm [OPTIONS]\n",\
              " --print-only: no changes are made to the cluster\n",\
              " --cron-mode: no printing, logs cluster changes\n",\
              " --verbose: print more debug messages in non-cron-mode"
        exit()
# These are used in hostnames to identify different types of build hosts
cloud_buildhost_prefix = "cbh-"
local_buildhost_prefix = "lbh-"
# create datastorages for all metrics
elastic_build_hosts = []
jobs_status = {'jobs_waiting_sum':0,
'jobs_blocked_sum':0}
debug_status_string = ""
if cron_mode:
# OBS updates its metrics with cron, let's not fetch metrics from OBS
# at the same time. Seemed to work ok without this sleep though.
time.sleep(20)
# connect to a specific EC2 region
ec2 = connect_to_ec2(aws_region)
# fetch all metrics from OBS and EC2
fetch_metrics(elastic_build_hosts, jobs_status)
# figure out current status and what should be done
analyze_current_situation(elastic_build_hosts, jobs_status)
# print metrics for debug usage
if not cron_mode:
print_metrics(elastic_build_hosts, jobs_status)