
Commit 1e09ef1

Replace GetValueOrThrow with status propagation in ReleaseGilAndTransferData
Modify the `ReleaseGilAndTransferData` function to use proper status propagation instead of `GetValueOrThrow` with `GetComputationClientOrDie`. This improves error handling by allowing status types to be propagated up the call stack rather than immediately throwing exceptions.

Changes:

- Update the function signature to return `absl::StatusOr<std::vector<xla::Literal>>`
- Replace `GetComputationClientOrDie()` with `GetComputationClient()`
- Use `XLA_ASSIGN_OR_RETURN` macros for both client acquisition and `TransferFromDevice`
- Update callers in tensor_util.cpp and xla_graph_executor.cpp to handle `StatusOr<T>`

This follows the status propagation patterns used elsewhere in the codebase and aligns with the examples in pjrt_registry.cpp.
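For readers unfamiliar with the idiom: status propagation with `absl::StatusOr<T>` means fallible layers return a status rather than throwing, and each caller either forwards the error or unwraps the value at the boundary. A minimal sketch of the shape of the change follows; the names `LoadValues` and `SumValues` are hypothetical illustrations, not torch_xla APIs, and the real code uses the `XLA_ASSIGN_OR_RETURN` macro rather than explicit `ok()` checks.

#include <string>
#include <vector>

#include "absl/status/status.h"
#include "absl/status/statusor.h"

// Hypothetical leaf operation that can fail.
absl::StatusOr<std::vector<int>> LoadValues(const std::string& path) {
  if (path.empty()) {
    return absl::InvalidArgumentError("empty path");
  }
  return std::vector<int>{1, 2, 3};
}

// Intermediate layer: forward the error instead of throwing, mirroring the
// new ReleaseGilAndTransferData signature.
absl::StatusOr<int> SumValues(const std::string& path) {
  absl::StatusOr<std::vector<int>> values = LoadValues(path);
  if (!values.ok()) {
    return values.status();  // propagate the failure up the call stack
  }
  int sum = 0;
  for (int v : *values) sum += v;
  return sum;
}

Top-level callers that must produce a plain value (as `XlaDataToTensors` and `XLAGraphExecutor::GetTensors` do here) unwrap the result once, at the boundary.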
Parent: 048459c

3 files changed, 12 insertions(+), 7 deletions(-)


torch_xla/csrc/tensor_util.cpp

Lines changed: 9 additions & 5 deletions
@@ -896,7 +896,7 @@ xla::Literal GetTensorLiteral(const at::Tensor& tensor, const xla::Shape* shape,
   return literal;
 }
 
-std::vector<xla::Literal> ReleaseGilAndTransferData(
+absl::StatusOr<std::vector<xla::Literal>> ReleaseGilAndTransferData(
     absl::Span<const torch::lazy::BackendDataPtr> xla_data) {
   // HACK: This method may be called outside of python (mainly in C++ tests) or
   // when the GIL is already released, so we must check both cases here. If
@@ -909,9 +909,12 @@ std::vector<xla::Literal> ReleaseGilAndTransferData(
   if (release_gil && Py_IsInitialized() && PyGILState_Check()) {
     save = PyEval_SaveThread();
   }
-  std::vector<xla::Literal> literals =
-      GetValueOrThrow(runtime::GetComputationClientOrDie()->TransferFromDevice(
-          UnwrapXlaData(xla_data)));
+
+  XLA_ASSIGN_OR_RETURN(runtime::ComputationClient * client,
+                       runtime::GetComputationClient());
+  XLA_ASSIGN_OR_RETURN(std::vector<xla::Literal> literals,
+                       client->TransferFromDevice(UnwrapXlaData(xla_data)));
+
   if (save) {
     PyEval_RestoreThread(save);
   }
@@ -922,7 +925,8 @@ std::vector<xla::Literal> ReleaseGilAndTransferData(
 std::vector<at::Tensor> XlaDataToTensors(
     absl::Span<const torch::lazy::BackendDataPtr> xla_data,
     absl::Span<const at::ScalarType> dest_element_type) {
-  std::vector<xla::Literal> literals = ReleaseGilAndTransferData(xla_data);
+  std::vector<xla::Literal> literals =
+      GetValueOrThrow(ReleaseGilAndTransferData(xla_data));
   std::vector<at::Tensor> tensors(literals.size());
   absl::BlockingCounter counter(literals.size());
   for (size_t i = 0; i < tensors.size(); ++i) {
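Note that the GIL save/restore around the transfer is unchanged by this commit and follows CPython's standard idiom. A standalone sketch of that idiom, with a hypothetical `DoBlockingWork` standing in for the device transfer:

#include <Python.h>

// Hypothetical blocking call (e.g. a device-to-host transfer).
void DoBlockingWork();

void RunWithoutGil() {
  PyThreadState* save = nullptr;
  // Release the GIL only if Python is initialized and this thread holds it,
  // matching the release_gil check in ReleaseGilAndTransferData.
  if (Py_IsInitialized() && PyGILState_Check()) {
    save = PyEval_SaveThread();
  }
  DoBlockingWork();
  if (save != nullptr) {
    PyEval_RestoreThread(save);  // reacquire the GIL
  }
}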

torch_xla/csrc/tensor_util.h

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ at::Tensor MakeTensorFromXlaLiteral(const xla::Literal& literal,
 // Execution and data transfer are async in PJRT, so TransferFromDevice may
 // block until `DataPtr`s are ready. Release the GIL so other threads can
 // proceed and unblock any transfers or collective computations.
-std::vector<xla::Literal> ReleaseGilAndTransferData(
+absl::StatusOr<std::vector<xla::Literal>> ReleaseGilAndTransferData(
     absl::Span<const torch::lazy::BackendDataPtr> xla_data);
 
 // TODO LTC @wonjoo - Migrate to upstream after Device -> BackendDevice

torch_xla/csrc/xla_graph_executor.cpp

Lines changed: 2 additions & 1 deletion
@@ -496,7 +496,8 @@ std::vector<at::Tensor> XLAGraphExecutor::GetTensors(
       async != nullptr ? async->tensors_data
                        : absl::Span<const torch::lazy::BackendDataPtr>());
 
-  std::vector<xla::Literal> literals = ReleaseGilAndTransferData(tensors_data);
+  std::vector<xla::Literal> literals =
+      GetValueOrThrow(ReleaseGilAndTransferData(tensors_data));
 
   return FetchTensors(tensors, literals,
                       async != nullptr ? &async->indices : nullptr);
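At these outer callers the status is unwrapped with `GetValueOrThrow`, so exceptions stay confined to the API boundary while the inner layers propagate `absl::Status`. Its implementation is not part of this diff; a plausible shape, purely for illustration:

#include <stdexcept>
#include <string>
#include <utility>

#include "absl/status/statusor.h"

// Illustrative only: the real torch_xla GetValueOrThrow may differ, e.g. by
// surfacing the error through the Python bindings instead of a C++ exception.
template <typename T>
T GetValueOrThrowSketch(absl::StatusOr<T> value_or) {
  if (!value_or.ok()) {
    throw std::runtime_error(std::string(value_or.status().message()));
  }
  return std::move(value_or).value();
}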
