Replies: 2 comments
-
|
Hi @w601sxs, the stacktrace you provided shows that the error is rooted in S3 permissions. Could you make sure that the role you provide (usually "SageMakerRole") has permissions on the bucket that your entry point file is uploaded to? Also, is your bucket encrypted? |
Beta Was this translation helpful? Give feedback.
0 replies
-
|
I am facing a similar issue. Has anyone resolved it? |
Beta Was this translation helpful? Give feedback.
0 replies
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.
-
Describe the bug
A clear and concise description of what the bug is.
When defining a custom estimator, remote training works but local training does not.
To reproduce
A clear, step-by-step set of instructions to reproduce the bug.
where
`myEstimator` is from:
Expected behavior
A clear and concise description of what you expected to happen.
Local mode training should work if remote training works.
Screenshots or logs
If applicable, add screenshots or logs to help explain your problem.
Creating tmpptzb0bfs_algo-1-g4d94_1 ... Attaching to tmpptzb0bfs_algo-1-g4d94_12mdone algo-1-g4d94_1 | 2020-08-25 20:00:05,420 sagemaker-training-toolkit ERROR Reporting training FAILURE algo-1-g4d94_1 | 2020-08-25 20:00:05,420 sagemaker-training-toolkit ERROR framework error: algo-1-g4d94_1 | Traceback (most recent call last): algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/sagemaker_training/trainer.py", line 92, in train algo-1-g4d94_1 | entry_point.run( algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/sagemaker_training/entry_point.py", line 92, in run algo-1-g4d94_1 | files.download_and_extract(uri=uri, path=environment.code_dir) algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/sagemaker_training/files.py", line 131, in download_and_extract algo-1-g4d94_1 | s3_download(uri, dst) algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/sagemaker_training/files.py", line 167, in s3_download algo-1-g4d94_1 | s3.Bucket(bucket).download_file(key, dst) algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/boto3/s3/inject.py", line 244, in bucket_download_file algo-1-g4d94_1 | return self.meta.client.download_file( algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/boto3/s3/inject.py", line 170, in download_file algo-1-g4d94_1 | return transfer.download_file( algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/boto3/s3/transfer.py", line 307, in download_file algo-1-g4d94_1 | future.result() algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/s3transfer/futures.py", line 106, in result algo-1-g4d94_1 | return self._coordinator.result() algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/s3transfer/futures.py", line 265, in result algo-1-g4d94_1 | raise self._exception algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/s3transfer/tasks.py", line 255, in _main algo-1-g4d94_1 | self._submit(transfer_future=transfer_future, **kwargs) algo-1-g4d94_1 | File 
"/miniconda3/lib/python3.8/site-packages/s3transfer/download.py", line 340, in _submit algo-1-g4d94_1 | response = client.head_object( algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/botocore/client.py", line 316, in _api_call algo-1-g4d94_1 | return self._make_api_call(operation_name, kwargs) algo-1-g4d94_1 | File "/miniconda3/lib/python3.8/site-packages/botocore/client.py", line 635, in _make_api_call algo-1-g4d94_1 | raise error_class(parsed_response, operation_name) algo-1-g4d94_1 | botocore.exceptions.ClientError: An error occurred (403) when calling the HeadObject operation: Forbidden algo-1-g4d94_1 | algo-1-g4d94_1 | An error occurred (403) when calling the HeadObject operation: Forbidden tmpptzb0bfs_algo-1-g4d94_1 exited with code 1 Aborting on container exit... --------------------------------------------------------------------------- RuntimeError Traceback (most recent call last) ~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name) 160 try: --> 161 _stream_output(process) 162 except RuntimeError as e: ~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/image.py in _stream_output(process) 676 if exit_code != 0: --> 677 raise RuntimeError("Process exited with code: %s" % exit_code) 678 RuntimeError: Process exited with code: 1 During handling of the above exception, another exception occurred: RuntimeError Traceback (most recent call last) <ipython-input-22-059e808d1544> in <module>() 10 train_config = sagemaker.session.s3_input(input_data, content_type='application/x-parquet') 11 ---> 12 local_framework.fit({'train':train_config}, logs=True) ~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config) 491 self._prepare_for_training(job_name=job_name) 492 --> 493 self.latest_training_job = _TrainingJob.start_new(self, inputs, experiment_config) 
494 self.jobs.append(self.latest_training_job) 495 if wait: ~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/estimator.py in start_new(cls, estimator, inputs, experiment_config) 1058 train_args["enable_sagemaker_metrics"] = estimator.enable_sagemaker_metrics 1059 -> 1060 estimator.sagemaker_session.train(**train_args) 1061 1062 return cls(estimator.sagemaker_session, estimator._current_job_name) ~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/session.py in train(self, input_mode, input_config, role, job_name, output_config, resource_config, vpc_config, hyperparameters, stop_condition, tags, metric_definitions, enable_network_isolation, image, algorithm_arn, encrypt_inter_container_traffic, train_use_spot_instances, checkpoint_s3_uri, checkpoint_local_path, experiment_config, debugger_rule_configs, debugger_hook_config, tensorboard_output_config, enable_sagemaker_metrics) 588 LOGGER.info("Creating training-job with name: %s", job_name) 589 LOGGER.debug("train request: %s", json.dumps(train_request, indent=4)) --> 590 self.sagemaker_client.create_training_job(**train_request) 591 592 def process( ~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/local_session.py in create_training_job(self, TrainingJobName, AlgorithmSpecification, OutputDataConfig, ResourceConfig, InputDataConfig, **kwargs) 100 hyperparameters = kwargs["HyperParameters"] if "HyperParameters" in kwargs else {} 101 logger.info("Starting training job") --> 102 training_job.start(InputDataConfig, OutputDataConfig, hyperparameters, TrainingJobName) 103 104 LocalSagemakerClient._training_jobs[TrainingJobName] = training_job ~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/entities.py in start(self, input_data_config, output_data_config, hyperparameters, job_name) 94 95 self.model_artifacts = self.container.train( ---> 96 input_data_config, output_data_config, hyperparameters, job_name 97 ) 98 self.end_time = datetime.datetime.now() 
~/anaconda3/envs/python3/lib/python3.6/site-packages/sagemaker/local/image.py in train(self, input_data_config, output_data_config, hyperparameters, job_name) 164 # which contains the exit code and append the command line to it. 165 msg = "Failed to run: %s, %s" % (compose_command, str(e)) --> 166 raise RuntimeError(msg) 167 finally: 168 artifacts = self.retrieve_artifacts(compose_data, output_data_config, job_name) RuntimeError: Failed to run: ['docker-compose', '-f', '/tmp/tmpptzb0bfs/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1
Also fails when I
System information
A description of your system. Please provide:
Additional context
Add any other context about the problem here.
Beta Was this translation helpful? Give feedback.
All reactions