
Commit 2d4b3b9

Throw a Python exception if compilation fails.

1 parent d1eb12c, commit 2d4b3b9
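
For context, the commit relies on pybind11's default exception translation: a C++ std::invalid_argument that escapes a bound function surfaces in Python as a ValueError. The snippet below is a standalone sketch of that mechanism, not code from this repository; the module and function names are made up for illustration.

// Standalone sketch of pybind11 exception translation (hypothetical names,
// not part of this commit). A std::invalid_argument thrown in C++ reaches
// Python as a ValueError carrying the same message.
#include <stdexcept>

#include <pybind11/pybind11.h>

// Stand-in for a compile step that fails, as PjRtComputationClient::Compile
// now does when PJRT reports an error status.
void fail_to_compile() {
  throw std::invalid_argument("compilation failed: shape is too large");
}

PYBIND11_MODULE(example_module, m) {
  // From Python, example_module.fail_to_compile() raises
  // ValueError: compilation failed: shape is too large
  m.def("fail_to_compile", &fail_to_compile);
}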

File tree: 2 files changed, +35 -7 lines changed

torch_xla/csrc/runtime/pjrt_computation_client.cc

Lines changed: 12 additions & 6 deletions

@@ -2,6 +2,7 @@

 #include <algorithm>
 #include <future>
+#include <stdexcept>
 #include <unordered_set>
 #include <vector>

@@ -625,21 +626,26 @@ std::vector<ComputationClient::ComputationPtr> PjRtComputationClient::Compile(
           device_assignment);
     }

-    std::unique_ptr<xla::PjRtLoadedExecutable> executable;
+    absl::StatusOr<std::unique_ptr<xla::PjRtLoadedExecutable>> maybe_executable;
     if (runtime::sys_util::GetEnvBool("XLA_STABLEHLO_COMPILE", false)) {
       // Convert HLO to StableHLO for PjRt client compilation.
       mlir::MLIRContext context;
       mlir::ModuleOp mlir_module =
           mlir::ModuleOp::create(mlir::UnknownLoc::get(&context));
       ConvertHloToStableHlo(instance.computation.mutable_proto(), &mlir_module);
-      executable =
-          client_->CompileAndLoad(mlir_module, compile_options).value();
+      maybe_executable = client_->CompileAndLoad(mlir_module, compile_options);
       StableHloCompileCounter()->AddValue(1);
     } else {
-      executable =
-          client_->CompileAndLoad(instance.computation, compile_options)
-              .value();
+      maybe_executable =
+          client_->CompileAndLoad(instance.computation, compile_options);
     }
+    if (!maybe_executable.ok()) {
+      // This will automatically raise a Python ValueError exception.
+      // See https://pybind11.readthedocs.io/en/stable/advanced/exceptions.html.
+      throw std::invalid_argument(
+          std::string(maybe_executable.status().message()));
+    }
+    auto executable = std::move(maybe_executable).value();

     auto memory_stats_status_or = executable->GetCompiledMemoryStats();
     if (memory_stats_status_or.ok()) {
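
The second hunk replaces an unchecked .value() call with an absl::StatusOr that is checked before use. Below is a minimal, self-contained illustration of that pattern with made-up names and an int payload standing in for the loaded executable; it is not code from the change itself.

// Minimal sketch of the StatusOr check-then-throw pattern applied above
// (hypothetical names; int stands in for xla::PjRtLoadedExecutable).
#include <stdexcept>
#include <string>
#include <utility>

#include "absl/status/status.h"
#include "absl/status/statusor.h"

absl::StatusOr<int> MaybeCompile(bool fail) {
  if (fail) return absl::InvalidArgumentError("requested shape is too large");
  return 42;
}

int CompileOrThrow(bool fail) {
  absl::StatusOr<int> maybe_result = MaybeCompile(fail);
  if (!maybe_result.ok()) {
    // Throwing std::invalid_argument replaces unconditionally calling
    // .value() on an error status, which previously crashed the process.
    throw std::invalid_argument(std::string(maybe_result.status().message()));
  }
  // Safe to unwrap only after the ok() check; move to avoid a copy.
  return std::move(maybe_result).value();
}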

torch_xla/csrc/runtime/pjrt_computation_client_test.cc

Lines changed: 23 additions & 1 deletion

@@ -3,6 +3,7 @@
 #include <gtest/gtest.h>

 #include <memory>
+#include <stdexcept>
 #include <string>
 #include <vector>

@@ -34,6 +35,27 @@ absl::StatusOr<xla::XlaComputation> MakeComputation() {
   return builder.Build();
 }

+TEST(PjRtComputationClient, ThrowsExpectedExceptionWhenCompileFails) {
+  // Get a CPU client.
+  tsl::setenv("PJRT_DEVICE", "CPU", true);
+  const auto client = std::make_unique<PjRtComputationClient>();
+  const std::string device = client->GetDefaultDevice();
+
+  // Compose a computation with an enormous shape.
+  const auto shape =
+      xla::ShapeUtil::MakeShape(xla::F32, {8000000000, 1000000000});
+  std::vector<ComputationClient::CompileInstance> instances;
+  instances.push_back(ComputationClient::CompileInstance(
+      std::move(MakeComputation().value()), device,
+      client->GetCompilationDevices(device, client->GetLocalDevices()),
+      &shape));
+
+  // Compiling the graph should fail, which should throw instead of crashing.
+  // TODO(https://github.com/pytorch/xla/issues/9096): ensure that
+  // the exception has type std::invalid_argument.
+  EXPECT_ANY_THROW(client->Compile(std::move(instances)));
+}
+
 TEST(PjRtComputationClientTest, Init) {
   // Get a CPU client.
   tsl::setenv("PJRT_DEVICE", "CPU", true);

@@ -69,7 +91,7 @@ TEST(PjRtComputationClientTest, Init) {
       *computations[0], client->TransferToDevice(absl::MakeConstSpan(args)),
       device, options);

-  // Copy the output from device back to host and assert correctness..
+  // Copy the output from device back to host and assert correctness.
   ASSERT_EQ(results.size(), 1);
   auto result_literals = client->TransferFromDevice(results);
   ASSERT_THAT(result_literals, ::testing::SizeIs(1));
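
As the TODO in the new test notes, the assertion currently accepts any exception type. Once https://github.com/pytorch/xla/issues/9096 guarantees the type across code paths, the check could plausibly be tightened as sketched below; this is a possible follow-up, not part of this commit.

// Possible follow-up (not in this commit): assert the exception type once it
// is guaranteed to be std::invalid_argument.
EXPECT_THROW(client->Compile(std::move(instances)), std::invalid_argument);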
