From 650f246fa4a163d48266a1046de18ee3764878f5 Mon Sep 17 00:00:00 2001 From: Antoine Falisse Date: Fri, 5 Jul 2024 12:01:28 -0700 Subject: [PATCH 1/4] remove time out kill + improve message --- app.py | 12 ++++++------ utils.py | 12 +++++++----- utilsAPI.py | 1 + utilsServer.py | 3 +-- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/app.py b/app.py index bef1b48d..234a1f6c 100644 --- a/app.py +++ b/app.py @@ -149,12 +149,12 @@ r = requests.patch(trial_url, data={"status": "error"}, headers = {"Authorization": "Token {}".format(API_TOKEN)}) traceback.print_exc() - args_as_strings = [str(arg) for arg in e.args] - if len(args_as_strings) > 1 and 'pose detection timed out' in args_as_strings[1].lower(): - logging.info("Worker failed. Stopping machine.") - message = "A backend OpenCap machine timed out during pose detection. It has been stopped." - sendStatusEmail(message=message) - raise Exception('Worker failed. Stopped.') + # args_as_strings = [str(arg) for arg in e.args] + # if len(args_as_strings) > 1 and 'pose detection timed out' in args_as_strings[1].lower(): + # logging.info("Worker failed. Stopping machine.") + # message = "A backend OpenCap machine timed out during pose detection. It has been stopped." + # sendStatusEmail(message=message) + # raise Exception('Worker failed. Stopped.') justProcessed = True # Clean data directory diff --git a/utils.py b/utils.py index e45a1bba..57fb718f 100644 --- a/utils.py +++ b/utils.py @@ -1494,9 +1494,13 @@ def sendStatusEmail(message=None,subject=None): emailInfo = getStatusEmails() if emailInfo is None: return('No email info or wrong email info in env file.') + + if 'ip' in emailInfo: + ip = emailInfo['ip'] + message = message + ' IP: ' + ip if message is None: - message = "A backend server is down and has been stopped." + message = "A backend server is down and has been stopped.".format(ip) if subject is None: subject = "OpenCap backend server down" @@ -1533,8 +1537,7 @@ def checkResourceUsage(stop_machine_and_email=True): if stop_machine_and_email and resourceUsage['disk_perc'] > 95: - message = "Disc is full on an OpenCap machine backend machine: " \ - + socket.gethostname() + ". It has been stopped. Data: " \ + message = "Disc is full on an OpenCap machine backend machine. It has been stopped. Data: " \ + json.dumps(resourceUsage) sendStatusEmail(message=message) @@ -1551,8 +1554,7 @@ def checkCudaTF(): for gpu in gpus: print(f"GPU: {gpu.name}") else: - message = "Cuda check failed on an OpenCap machine backend machine: " \ - + socket.gethostname() + ". It has been stopped." + message = "Cuda check failed on an OpenCap machine backend machine. It has been stopped." sendStatusEmail(message=message) raise Exception("No GPU detected. Exiting.") diff --git a/utilsAPI.py b/utilsAPI.py index cf0e837c..3f83c514 100644 --- a/utilsAPI.py +++ b/utilsAPI.py @@ -39,6 +39,7 @@ def getStatusEmails(): emailInfo['fromEmail'] = config("STATUS_EMAIL_FROM") emailInfo['password'] = config("STATUS_EMAIL_FROM_PW") emailInfo['toEmails'] = json.loads(config("STATUS_EMAIL_TO")) + emailInfo['ip'] = json.loads(config("STATUS_EMAIL_IP")) except: emailInfo = None diff --git a/utilsServer.py b/utilsServer.py index 352a8419..f97877c5 100644 --- a/utilsServer.py +++ b/utilsServer.py @@ -4,7 +4,6 @@ import requests import json import logging -import socket from main import main from utils import getDataDirectory @@ -469,7 +468,7 @@ def runTestSession(pose='all',isDocker=True): except: logging.info("test trial failed. stopping machine.") # send email - message = "A backend OpenCap machine failed the status check: " + socket.gethostname() + ". It has been stopped." + message = "A backend OpenCap machine failed the status check. It has been stopped." sendStatusEmail(message=message) raise Exception('Failed status check. Stopped.') From d20f7fa064bcdbdacc7c83ae56083f6796ce91c7 Mon Sep 17 00:00:00 2001 From: Antoine Falisse Date: Fri, 5 Jul 2024 12:02:58 -0700 Subject: [PATCH 2/4] minor --- utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils.py b/utils.py index 57fb718f..eb3cb589 100644 --- a/utils.py +++ b/utils.py @@ -1500,7 +1500,7 @@ def sendStatusEmail(message=None,subject=None): message = message + ' IP: ' + ip if message is None: - message = "A backend server is down and has been stopped.".format(ip) + message = "A backend server is down and has been stopped." if subject is None: subject = "OpenCap backend server down" From 356195bc0beb2271c51677b6a30c4306462e83ea Mon Sep 17 00:00:00 2001 From: Antoine Falisse Date: Fri, 5 Jul 2024 12:05:34 -0700 Subject: [PATCH 3/4] minor --- app.py | 5 +++++ utils.py | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/app.py b/app.py index 234a1f6c..9f5c131e 100644 --- a/app.py +++ b/app.py @@ -149,6 +149,11 @@ r = requests.patch(trial_url, data={"status": "error"}, headers = {"Authorization": "Token {}".format(API_TOKEN)}) traceback.print_exc() + + # Antoine: Removing this, it is too often causing the machines to stop. Not because + # the machines are failing, but because for instance the video is very long with a lot + # of people in it. We should not stop the machine for that. Originally the check was + # to catch a bug where the machine would hang, I have not seen this bug in a long time. # args_as_strings = [str(arg) for arg in e.args] # if len(args_as_strings) > 1 and 'pose detection timed out' in args_as_strings[1].lower(): # logging.info("Worker failed. Stopping machine.") diff --git a/utils.py b/utils.py index eb3cb589..f763f4ca 100644 --- a/utils.py +++ b/utils.py @@ -1537,7 +1537,7 @@ def checkResourceUsage(stop_machine_and_email=True): if stop_machine_and_email and resourceUsage['disk_perc'] > 95: - message = "Disc is full on an OpenCap machine backend machine. It has been stopped. Data: " \ + message = "Disc is full on an OpenCap backend machine. It has been stopped. Data: " \ + json.dumps(resourceUsage) sendStatusEmail(message=message) @@ -1554,7 +1554,7 @@ def checkCudaTF(): for gpu in gpus: print(f"GPU: {gpu.name}") else: - message = "Cuda check failed on an OpenCap machine backend machine. It has been stopped." + message = "Cuda check failed on an OpenCap backend machine. It has been stopped." sendStatusEmail(message=message) raise Exception("No GPU detected. Exiting.") From fd46947e6ede1887c3f34fe7cdf38cad9c7adc6d Mon Sep 17 00:00:00 2001 From: Antoine Falisse Date: Fri, 5 Jul 2024 12:36:36 -0700 Subject: [PATCH 4/4] fixing bug --- utilsAPI.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/utilsAPI.py b/utilsAPI.py index 3f83c514..982bf4f7 100644 --- a/utilsAPI.py +++ b/utilsAPI.py @@ -39,9 +39,12 @@ def getStatusEmails(): emailInfo['fromEmail'] = config("STATUS_EMAIL_FROM") emailInfo['password'] = config("STATUS_EMAIL_FROM_PW") emailInfo['toEmails'] = json.loads(config("STATUS_EMAIL_TO")) - emailInfo['ip'] = json.loads(config("STATUS_EMAIL_IP")) except: emailInfo = None + try: + emailInfo['ip'] = config("STATUS_EMAIL_IP") + except: + pass return emailInfo