Skip to content

Commit faf06ee

Browse files
authored
fix: use al2 base image from ecr (#38)
* fix: use al2 from ecr
* extend timeout in job
1 parent a6e444c commit faf06ee

File tree

3 files changed

+14
-3
lines changed

3 files changed

+14
-3
lines changed

scripts/build.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,14 @@ source scripts/shared.sh
2222

2323
parse_std_args "$@"
2424

25+
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 137112412989.dkr.ecr.us-west-2.amazonaws.com
26+
2527
echo "building image ${version} ... "
2628
docker build \
2729
-f ${build_context}/docker/Dockerfile.${processor} \
2830
-t ${repository}:${version} \
2931
--build-arg REGION=${REGION} \
3032
-t sagemaker-spark:latest \
3133
${build_context}
34+
35+
docker logout https://137112412989.dkr.ecr.us-west-2.amazonaws.com

spark/processing/3.0/py3/docker/Dockerfile.cpu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM amazonlinux:2
1+
FROM 137112412989.dkr.ecr.us-west-2.amazonaws.com/amazonlinux:2
22
ARG REGION
33
ENV AWS_REGION ${REGION}
44
RUN yum clean all

src/smspark/job.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@
3232
class ProcessingJobManager(object):
3333
"""Manages the lifecycle of a Spark job."""
3434

35+
_bootstrapping_timeout = 600.0 # all hosts should report as ready within this timeout.
36+
_wait_for_primary_timeout = 600.0 # then, all workers ask the primary if it's up within this timeout.
37+
3538
def __init__(
3639
self,
3740
resource_config: Dict[str, Any] = None, # type: ignore
@@ -136,7 +139,11 @@ def all_hosts_have_bootstrapped() -> bool:
136139
has_bootstrapped = [message.status == Status.WAITING for message in host_statuses.values()]
137140
return all(has_bootstrapped)
138141

139-
self.waiter.wait_for(predicate_fn=all_hosts_have_bootstrapped, timeout=180.0, period=5.0)
142+
self.waiter.wait_for(
143+
predicate_fn=all_hosts_have_bootstrapped,
144+
timeout=ProcessingJobManager._bootstrapping_timeout,
145+
period=5.0,
146+
)
140147

141148
try:
142149
subprocess.run(spark_submit_cmd, check=True, shell=True)
@@ -172,7 +179,7 @@ def primary_is_down() -> bool:
172179
return not primary_is_up()
173180

174181
self.logger.info("waiting for the primary to come up")
175-
self.waiter.wait_for(primary_is_up, timeout=60.0, period=1.0)
182+
self.waiter.wait_for(primary_is_up, timeout=ProcessingJobManager._wait_for_primary_timeout, period=1.0)
176183
self.logger.info("waiting for the primary to go down")
177184
self.waiter.wait_for(primary_is_down, timeout=float("inf"), period=5.0)
178185
self.logger.info("primary is down, worker now exiting")

0 commit comments

Comments (0)