diff --git a/app.py b/app.py index bef1b48d..9f5c131e 100644 --- a/app.py +++ b/app.py @@ -149,12 +149,17 @@ r = requests.patch(trial_url, data={"status": "error"}, headers = {"Authorization": "Token {}".format(API_TOKEN)}) traceback.print_exc() - args_as_strings = [str(arg) for arg in e.args] - if len(args_as_strings) > 1 and 'pose detection timed out' in args_as_strings[1].lower(): - logging.info("Worker failed. Stopping machine.") - message = "A backend OpenCap machine timed out during pose detection. It has been stopped." - sendStatusEmail(message=message) - raise Exception('Worker failed. Stopped.') + + # Antoine: Removing this, it is too often causing the machines to stop. Not because + # the machines are failing, but because for instance the video is very long with a lot + # of people in it. We should not stop the machine for that. Originally the check was + # to catch a bug where the machine would hang, I have not seen this bug in a long time. + # args_as_strings = [str(arg) for arg in e.args] + # if len(args_as_strings) > 1 and 'pose detection timed out' in args_as_strings[1].lower(): + # logging.info("Worker failed. Stopping machine.") + # message = "A backend OpenCap machine timed out during pose detection. It has been stopped." + # sendStatusEmail(message=message) + # raise Exception('Worker failed. Stopped.') justProcessed = True # Clean data directory diff --git a/utils.py b/utils.py index e45a1bba..f763f4ca 100644 --- a/utils.py +++ b/utils.py @@ -1494,6 +1494,10 @@ def sendStatusEmail(message=None,subject=None): emailInfo = getStatusEmails() if emailInfo is None: return('No email info or wrong email info in env file.') + + if 'ip' in emailInfo: + ip = emailInfo['ip'] + message = ("A backend server is down and has been stopped." if message is None else message) + ' IP: ' + ip if message is None: message = "A backend server is down and has been stopped." 
@@ -1533,8 +1537,7 @@ def checkResourceUsage(stop_machine_and_email=True): if stop_machine_and_email and resourceUsage['disk_perc'] > 95: - message = "Disc is full on an OpenCap machine backend machine: " \ - + socket.gethostname() + ". It has been stopped. Data: " \ + message = "Disc is full on an OpenCap backend machine. It has been stopped. Data: " \ + json.dumps(resourceUsage) sendStatusEmail(message=message) @@ -1551,8 +1554,7 @@ def checkCudaTF(): for gpu in gpus: print(f"GPU: {gpu.name}") else: - message = "Cuda check failed on an OpenCap machine backend machine: " \ - + socket.gethostname() + ". It has been stopped." + message = "Cuda check failed on an OpenCap backend machine. It has been stopped." sendStatusEmail(message=message) raise Exception("No GPU detected. Exiting.") diff --git a/utilsAPI.py b/utilsAPI.py index cf0e837c..982bf4f7 100644 --- a/utilsAPI.py +++ b/utilsAPI.py @@ -41,6 +41,10 @@ def getStatusEmails(): emailInfo['toEmails'] = json.loads(config("STATUS_EMAIL_TO")) except: emailInfo = None + try: + emailInfo['ip'] = config("STATUS_EMAIL_IP") + except Exception: + pass return emailInfo diff --git a/utilsServer.py b/utilsServer.py index 352a8419..f97877c5 100644 --- a/utilsServer.py +++ b/utilsServer.py @@ -4,7 +4,6 @@ import requests import json import logging -import socket from main import main from utils import getDataDirectory @@ -469,7 +468,7 @@ def runTestSession(pose='all',isDocker=True): except: logging.info("test trial failed. stopping machine.") # send email - message = "A backend OpenCap machine failed the status check: " + socket.gethostname() + ". It has been stopped." + message = "A backend OpenCap machine failed the status check. It has been stopped." sendStatusEmail(message=message) raise Exception('Failed status check. Stopped.')