Skip to content

GPU status endpoints #16

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 19 commits into
base: gpu_cluster_grows_up
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions config.py.template
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@ config = {
"db": "/Users/naren/Developer/gpu-cluster/gpu_cluster_instances.db",
"gpuless": False,
"redis": "redis://localhost:6379/0",
"price_per_hour": -1
"price_per_hour": -1,
"domain_name": "vault.acm.illinois.edu"
}

#ACM Specific
acm_config = {
"groot_authorization": None
}
}
2 changes: 1 addition & 1 deletion gpu_cluster/controllers/container_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class ContainerController(abc.ABC):
'''
def __init__(self, config):
# Initialise the base controller from the application's config mapping.
# `config` must contain the key "price_per_hour"; a missing key raises KeyError.
super().__init__()
# Keep the whole mapping so subclasses can read other settings from it
# (the CPU and GPU controllers read config["domain_name"]).
self.config = config
# Cache the hourly price taken from the config.
self.hourly_rate = config["price_per_hour"]

'''
Expand All @@ -37,7 +38,6 @@ def launch_container(self, c_id):
'''
STATUS QUERIERS
'''

def verify_launch(self, c_id):
    """Report whether the container with id ``c_id`` is recorded as launched.

    Looks up the first ``Instance`` row whose ``cid`` equals ``c_id``.

    Returns:
        True only when such a row exists and its ``launched`` field equals
        True; False otherwise, including when no row matches ``c_id``.
    """
    instance = db_session.query(Instance).filter_by(cid=c_id).first()
    # .first() yields None when nothing matches; treat an unknown container
    # as "not launched" instead of failing on None.launched.
    if instance is None:
        return False
    # Keep the original `== True` comparison (rather than plain truthiness)
    # so that stored values other than True/1 still count as not launched.
    return bool(instance.launched == True)
Expand Down
51 changes: 35 additions & 16 deletions gpu_cluster/controllers/cpu_container_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,43 +5,62 @@
from .container_controller import ContainerController
import docker


class CPUContainerController(ContainerController):

def __init__(self, config):
# Forward the config mapping to ContainerController, then create the Docker
# client this controller uses to list, pull, run and stop containers.
super().__init__(config)
# The client is configured from the process environment; version='auto'
# asks the SDK to detect the daemon's API version instead of pinning one.
self.client = docker.from_env(version='auto')

def create_container(self, image, user="", token_required=False, budget=-1):
uport = self.get_port()
mport = self.get_port()
uport = super().get_port()
mport = super().get_port()
while uport == mport:
mport = self.get_port()
mport = super().get_port()

ports = {'8888/tcp':uport,
'6006/tcp':mport}
ports = {'8888/tcp': uport,
'6006/tcp': mport}

print(image)
c_id = self.client.containers.run(image, "", auto_remove=True, detach=True, ports=ports).id
container_list = self.client.containers.list(filters={'name': image})
if container_list:
c_id = self.client.containers.run(image, "", auto_remove=True, detach=True, ports=ports).id

else:
# Add a client.images.search to check if the path to the container exists on docker hub. If not, error out
has_result = self.client.images.search(image)
if not has_result:
print('No image in DockerHub')
return 'No image in DockerHub' , '', ''

image_tag = image.split(':')
docker_image = self.client.images.pull(image_tag[0], image_tag[1])

# If pull returns more than one image, get the first one in the list
if hasattr(docker_image, '__len__'):
docker_image = docker_image[0]

# Do you have to build the image after you pull it from Docker Hub?
c_id = self.client.containers.run(docker_image, '', auto_remove=True, detach=True, ports=ports).id
print(c_id)

uurl = ""
murl = ""
base_url = "http://{}".format(self.config["domain_name"])
if token_required:
c = self.client.containers.get(c_id)
token = c.exec_run('python3 /opt/cluster-container/jupyter_get.py')
uurl = "http://localhost:{}/?token={}".format(uport, token.decode("utf-8") )
murl = "http://localhost:" + str(mport)
uurl = "{}:{}/?token={}".format(base_url, uport, token.decode("utf-8") )
murl = base_url + str(mport)
else:
uurl = "http://localhost:" + str(uport)
murl = "http://localhost:" + str(mport)
uurl = base_url + str(uport)
murl = base_url + str(mport)
print(image)

#TODO insert budget
db_session.add(Instance(c_id, uport, mport, uurl, murl, user, budget, token))
# TODO insert budget
db_session.add(Instance(c_id, uport, mport, uurl, murl, user, budget, token))
db_session.commit()
return c_id, uurl, murl

def kill_container(self, c_id):
c = self.client.containers.get(c_id)
c.stop()

c.stop()
63 changes: 42 additions & 21 deletions gpu_cluster/controllers/gpu_container_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,31 @@
from .container_controller import ContainerController
from nvdocker import NVDockerClient


class GPUContainerController(ContainerController):

def __init__(self, config):
# Forward the config mapping to ContainerController, then create the
# NVDockerClient instance that create_container and kill_container use.
super().__init__(config)
self.docker_client = NVDockerClient()
def create_container(image, user="", token_required=False, budget=-1, num_gpus=1):

def create_container(self, image, user="", token_required=False, budget=-1, num_gpus=1):
# Get 2 open ports for UI and Monitor
uport = self.get_port()
mport = self.get_port()
uport = super().get_port()
mport = super().get_port()
while uport == mport:
mport = self.get_port()
mport = super().get_port()

# Get select a gpu(s) that are least in use
num_available_gpus = len(docker_client.list_gpus())
num_available_gpus = len(NVDockerClient.gpu_info())
if num_gpus > num_available_gpus:
num_gpus = num_available_gpus

gpus = []
memory_usage = docker_client.gpu_memory_usage()
for g in num_gpus:
for gpu, used in memory_usage.items():
if used < memory_usage[gpu[-1]]:
gpus.append(gpu)
for g in range(num_gpus):
if NVDockerClient.gpu_memory_usage(g)["free_mb"] > 0:
gpus.append(g)

# Assemble config for container
# Assemble config for container
container_config = {
"ports": {
'8888/tcp': uport,
Expand All @@ -40,20 +39,42 @@ def create_container(image, user="", token_required=False, budget=-1, num_gpus=1
"auto_remove": True
}

#create container
c_id = docker_client.create_container(image, **container_config).id
# create container
container_list = self.docker_client.docker_image_list(filters={'name': image})
print(image)
if container_list:
c_id = self.docker_client.create_container(image, **container_config).id

else:
# Add a client.images.search to check if the path to the container exists on docker hub. If not, error out
has_result = self.docker_client.docker_image_search(image)
if not has_result:
print('No image in DockerHub')
return 'No image in DockerHub' , '', ''

image_tag = image.split(':')
docker_image = self.docker_client.docker_image_pull(image_tag[0], image_tag[1])

# If pull returns more than one image, get the first one in the list
if hasattr(docker_image, '__len__'):
docker_image = docker_image[0]
print(docker_image)

# Do you have to build the image after you pull it from Docker Hub?
c_id = self.docker_client.create_container(docker_image, **container_config).id

#assemble endpoints for UI, monitor and get the access token if needed
# assemble endpoints for UI, monitor and get the access token if needed
uurl = ""
murl = ""
token = ""
base_url = "http://{}".format(self.config["domain_name"])
if token_required:
token = docker_client.exec_run(c_id, 'python3 /opt/cluster-container/jupyter_get.py')
uurl = "http://vault.acm.illinois.edu:{}/?token={}".format(uport, token.decode("utf-8") )
murl = "http://vault.acm.illinois.edu:" + str(mport)
token = self.docker_client.exec_run(c_id, 'python3 /opt/cluster-container/jupyter_get.py')
uurl = "{}:{}/?token={}".format(base_url, uport, token.decode("utf-8") )
murl = base_url + str(mport)
else:
uurl = "http://vault.acm.illinois.edu:" + str(uport)
murl = "http://vault.acm.illinois.edu:" + str(mport)
uurl = base_url + str(uport)
murl = base_url + str(mport)

#TODO insert budget
budget = -1
Expand All @@ -62,4 +83,4 @@ def create_container(image, user="", token_required=False, budget=-1, num_gpus=1
return c_id, uurl, murl

def kill_container(self, c_id):
self.docker_client.stop_container(c_id)
self.docker_client.stop_container(c_id)
20 changes: 17 additions & 3 deletions gpu_cluster/routes/cluster_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from ..database import db_session
from ..models import Instance
from flask import Flask, jsonify, request, abort
from nvdocker import NVDockerClient
import socket

class ClusterAPI():
def __init__(self, controller):
Expand All @@ -19,6 +21,9 @@ def create_container(self):
abort(400)

cid, ui_url, murl = self.controller.create_container(request.json['image'], token_required=request.json['token_required'])#, user=request.json['user'], budget=request.json['budget'] )
if ui_url == '' or murl == '':
abort(400)

return jsonify({'cid': cid, 'ui_url' : ui_url, 'monitor_url': murl})

def confirm_launch(self):
Expand All @@ -29,13 +34,22 @@ def confirm_launch(self):

if launched == False:
return jsonify({"error" : "non-existant container"})
return jsonify({"verified" : "confirmed"})

return jsonify({"verified" : "confirmed"})
def kill_container(self):
# Placeholder handler: it does nothing and returns None.
# register_routes exposes it as POST /kill_container.
# NOTE(review): Flask rejects a view that returns None, so calling this route
# will fail at request time. Presumably it should read a container id from
# the request and call self.controller.kill_container(...) — confirm the
# expected request schema before implementing.
pass

def status(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to add an API rule in `register_routes` for Flask to see it.

hostname = socket.gethostname()
available_gpu = NVDockerClient.least_used_gpu()
response = {
"hostname" : hostname,
"gpu" : available_gpu
}
return jsonify(response)

def register_routes(self, app):
# Attach this object's handlers to `app` as URL rules.
# Each call passes: URL path, endpoint name, bound handler, allowed methods.
# The three container-management routes accept POST only.
app.add_url_rule('/create_container', 'create_container', self.create_container, methods=['POST'])
app.add_url_rule('/confirm', 'confirm', self.confirm_launch, methods=['POST'])
app.add_url_rule('/kill_container', 'kill_container', self.kill_container, methods=['POST'])

# Status endpoint; the only route registered for GET.
app.add_url_rule('/status', 'status', self.status, methods=['GET'])
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ docker==2.5.1
flask-cors==3.0.3
PyYaml==3.12
celery==4.1.0
nvdocker==0.0.2a3
nvdocker==0.0.2a5