Open
Description
During initialization, the worker may fail because of a network issue.
The pod status is error.
No new pod will be re-launched by the master pod.
The trace is as follows:
Traceback (most recent call last):
File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/elasticdl/python/worker/main.py", line 44, in <module>
main()
File "/elasticdl/python/worker/main.py", line 38, in main
set_parallelism=True,
File "/elasticdl/python/worker/worker.py", line 116, in __init__
self._init_from_args(args)
File "/elasticdl/python/worker/worker.py", line 158, in _init_from_args
self.set_model(model_inst)
File "/elasticdl/python/worker/worker.py", line 206, in set_model
self._init_embeddings()
File "/elasticdl/python/worker/worker.py", line 269, in _init_embeddings
self.report_embedding_info()
File "/elasticdl/python/worker/worker.py", line 429, in report_embedding_info
self._ps_stubs[ps_id].push_embedding_info(model)
File "/usr/local/lib/python3.6/dist-packages/grpc/_channel.py", line 826, in __call__
return _end_unary_response_blocking(state, call, False, None)
File "/usr/local/lib/python3.6/dist-packages/grpc/_channel.py", line 729, in _end_unary_response_blocking
raise _InactiveRpcError(state)
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
status = StatusCode.UNAVAILABLE
details = "failed to connect to all addresses"
debug_error_string = "{"created":"@1583216331.257369082","description":"Failed to pick subchannel","file":"src/core/ext/filters/client_channel/client_channel.cc","file_line":3941,"referenced_errors":[{"created":"@1583216331.257364550","description":"failed to connect to all addresses","file":"src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc","file_line":393,"grpc_status":14}]}"