Skip to content

Commit c37c1f7

Browse files
authored
relaunch failed pod (#1563)
1 parent f8a8dbb commit c37c1f7

File tree

1 file changed

+18
-2
lines changed

1 file changed

+18
-2
lines changed

elasticdl/python/master/k8s_instance_manager.py

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -73,6 +73,8 @@ def __init__(
7373
self._ps_pod_name_to_id = {}
7474
self._relaunch_deleted_live_ps = True
7575

76+
self._failed_pods = []
77+
7678
self._k8s_client = k8s.Client(event_callback=self._event_cb, **kwargs)
7779
self._ps_addrs = self._get_addrs(
7880
self._num_ps, self._k8s_client.get_ps_service_address
@@ -218,10 +220,18 @@ def _event_cb(self, event):
218220
worker_id = -1
219221
ps_id = -1
220222
with self._lock:
223+
if pod_name in self._failed_pods:
224+
return
221225
if pod_name in self._worker_pod_name_to_id:
222226
worker_id = self._worker_pod_name_to_id.get(pod_name)
223227
self._worker_pods_phase[worker_id] = (pod_name, phase)
224-
if evt_type == "DELETED":
228+
# Workaround for memory leak issues in tf eager mode.
229+
# A pod may fail due to OOM from tf eager mode memory leak.
230+
failed_pod = False
231+
if evt_type == "MODIFIED" and phase == "Failed":
232+
self._failed_pods.append(pod_name)
233+
failed_pod = True
234+
if evt_type == "DELETED" or failed_pod:
225235
del self._worker_pods_phase[worker_id]
226236
del self._worker_pod_name_to_id[pod_name]
227237
self._task_d.recover_tasks(worker_id)
@@ -235,7 +245,13 @@ def _event_cb(self, event):
235245
elif pod_name in self._ps_pod_name_to_id:
236246
ps_id = self._ps_pod_name_to_id.get(pod_name)
237247
self._ps_pods_phase[ps_id] = (pod_name, phase)
238-
if evt_type == "DELETED":
248+
# Workaround for memory leak issues in tf eager mode.
249+
# A pod may fail due to OOM from tf eager mode memory leak.
250+
failed_pod = False
251+
if evt_type == "MODIFIED" and phase == "Failed":
252+
self._failed_pods.append(pod_name)
253+
failed_pod = True
254+
if evt_type == "DELETED" or failed_pod:
239255
del self._ps_pods_phase[ps_id]
240256
del self._ps_pod_name_to_id[pod_name]
241257
relaunch_ps = self._relaunch_deleted_live_ps

0 commit comments

Comments
 (0)