Skip to content

Commit 9f9105a

Browse files
malfet authored and pytorchmergebot committed
[MPS] Write/Invoke Metal shaders from C++ (pytorch#141547)
By introducing `DynamicMetalShaderLibrary` and `MetalShaderFunction`. Add unittests that also serve as an example of how the API works. Using this primitive, one can compile and dispatch any 1D or 2D shader over an MPS tensor using the following pattern:

```cpp
auto x = torch::empty({8, 16}, at::device(at::kMPS));
DynamicMetalShaderLibrary lib(R"MTL(
  kernel void full(device float* t, constant ulong2& strides, uint2 idx [[thread_position_in_grid]]) {
    t[idx.x*strides.x + idx.y*strides.y] = idx.x + 33.0 * idx.y;
  }
)MTL");
auto func = lib.getKernelFunction("full");
func->runCommandBlock([&] {
  func->startEncoding();
  func->setArg(0, x);
  func->setArg(1, x.strides());
  func->dispatch({8, 16});
});
```

Pull Request resolved: pytorch#141547
Approved by: https://github.com/Skylion007
1 parent 5c2584a commit 9f9105a

File tree

5 files changed

+223
-18
lines changed

5 files changed

+223
-18
lines changed

aten/src/ATen/native/mps/MetalShaderLibrary.h

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,90 @@
44
typedef id<MTLLibrary> MTLLibrary_t;
55
typedef id<MTLFunction> MTLFunction_t;
66
typedef id<MTLComputePipelineState> MTLComputePipelineState_t;
7+
typedef id<MTLComputeCommandEncoder> MTLComputeCommandEncoder_t;
78
#else
89
typedef void MTLCompileOptions;
910
typedef void* MTLLibrary_t;
1011
typedef void* MTLFunction_t;
1112
typedef void* MTLComputePipelineState_t;
13+
typedef void* MTLComputeCommandEncoder_t;
1214
#endif
1315

16+
#include <functional>
17+
#include <optional>
18+
#include <type_traits>
1419
#include <unordered_map>
1520
#include <vector>
1621

22+
// Forward declaration of TensorBase
23+
namespace at {
24+
class TensorBase;
25+
}
26+
1727
namespace at::native::mps {
28+
29+
namespace detail {
30+
template <typename T>
31+
class has_size_type {
32+
template <typename U>
33+
static constexpr std::true_type check(typename U::size_type*);
34+
template <typename>
35+
static constexpr std::false_type check(...);
36+
37+
public:
38+
static constexpr bool value = decltype(check<T>(nullptr))::value;
39+
};
40+
41+
template <typename T>
42+
constexpr bool has_size_type_v = has_size_type<T>::value;
43+
44+
} // namespace detail
45+
46+
class MetalKernelFunction {
47+
public:
48+
MetalKernelFunction(MTLComputePipelineState_t cps_);
49+
~MetalKernelFunction();
50+
MetalKernelFunction(MetalKernelFunction&) = delete;
51+
// Shader properties
52+
uint64_t getMaxThreadsPerThreadgroup() const;
53+
uint64_t getThreadExecutionWidth() const;
54+
uint64_t getStaticThreadGroupMemoryLength() const;
55+
void runCommandBlock(std::function<void(void)> f);
56+
// Methods below should be called from runCommandBlock functionT
57+
void startEncoding();
58+
void setArg(unsigned idx, const at::TensorBase& t);
59+
void setArg(unsigned idx, const void* ptr, uint64_t size);
60+
template <
61+
typename T,
62+
typename = std::enable_if_t<
63+
std::is_integral_v<T> || std::is_same_v<T, float> ||
64+
(std::is_class_v<T> && std::is_trivially_copyable_v<T> &&
65+
!detail::has_size_type_v<T>)>>
66+
inline void setArg(unsigned idx, const T val) {
67+
setArg(idx, &val, sizeof(T));
68+
}
69+
70+
template <
71+
typename Container,
72+
typename = std::enable_if_t<detail::has_size_type_v<Container>>>
73+
inline void setArg(unsigned idx, const Container& values) {
74+
setArg(
75+
idx,
76+
values.data(),
77+
values.size() * sizeof(typename Container::value_type));
78+
}
79+
void dispatch(
80+
uint64_t length,
81+
std::optional<uint64_t> groupSize = std::nullopt);
82+
void dispatch(
83+
std::array<uint64_t, 2> length,
84+
std::optional<std::array<uint64_t, 2>> groupSize = std::nullopt);
85+
86+
private:
87+
MTLComputePipelineState_t cps;
88+
MTLComputeCommandEncoder_t encoder = nullptr;
89+
};
90+
1891
class MetalShaderLibrary {
1992
public:
2093
MetalShaderLibrary(const std::string& src)
@@ -31,6 +104,8 @@ class MetalShaderLibrary {
31104
MetalShaderLibrary(const MetalShaderLibrary&) = delete;
32105
virtual ~MetalShaderLibrary() = default;
33106
std::vector<std::string> getFunctionNames();
107+
std::shared_ptr<MetalKernelFunction> getKernelFunction(
108+
const std::string& name);
34109
inline MTLComputePipelineState_t getPipelineStateForFunc(
35110
const std::string& fname) {
36111
return getLibraryPipelineState(getLibrary(), fname).first;
@@ -71,4 +146,13 @@ class MetalShaderLibrary {
71146
cplMap;
72147
};
73148

149+
class DynamicMetalShaderLibrary : public MetalShaderLibrary {
150+
public:
151+
DynamicMetalShaderLibrary(const std::string& src) : MetalShaderLibrary(src) {
152+
// Compile right away
153+
getLibrary();
154+
}
155+
~DynamicMetalShaderLibrary();
156+
};
157+
74158
} // namespace at::native::mps

aten/src/ATen/native/mps/OperationUtils.h

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -343,23 +343,6 @@ inline bool is_dense_in_storage(const TensorBase& t) {
343343
return compute_storage_numel_distance(t) == static_cast<size_t>(t.numel());
344344
}
345345

346-
namespace detail {
347-
template <typename T>
348-
class has_size_type {
349-
template <typename U>
350-
static constexpr std::true_type check(typename U::size_type*);
351-
template <typename>
352-
static constexpr std::false_type check(...);
353-
354-
public:
355-
static constexpr bool value = decltype(check<T>(nullptr))::value;
356-
};
357-
358-
template <typename T>
359-
constexpr bool has_size_type_v = has_size_type<T>::value;
360-
361-
} // namespace detail
362-
363346
template <typename encoder_t,
364347
typename = std::enable_if_t<std::is_same_v<id<MTLComputeCommandEncoder>, encoder_t> ||
365348
std::is_same_v<id<MTLArgumentEncoder>, encoder_t>>>

aten/src/ATen/native/mps/OperationUtils.mm

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
// Copyright © 2022 Apple Inc.
2+
#include <ATen/core/TensorBase.h>
3+
#include <ATen/native/mps/MetalShaderLibrary.h>
4+
#include <functional>
25
#include <stdexcept>
36
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
47
#include <ATen/TensorIterator.h>
@@ -868,6 +871,10 @@ void executeMPSAllocatorCallback(void* ptr, EventType event) override {}
868871
return rc;
869872
}
870873

874+
std::shared_ptr<MetalKernelFunction> MetalShaderLibrary::getKernelFunction(const std::string& name) {
875+
return std::make_shared<MetalKernelFunction>(getPipelineStateForFunc(name));
876+
}
877+
871878
class BundledShaderLibary : public MetalShaderLibrary {
872879
public:
873880
BundledShaderLibary() : MetalShaderLibrary("") {}
@@ -916,4 +923,63 @@ static dispatch_data_t getSectionData(const std::string& name) {
916923
return l;
917924
}
918925

926+
// DynamicMetalShaderLibrary implementation
927+
DynamicMetalShaderLibrary::~DynamicMetalShaderLibrary() {
928+
[library release];
929+
}
930+
931+
// MetalKernelFunction implementation
932+
MetalKernelFunction::MetalKernelFunction(MTLComputePipelineState_t cps_) : cps([cps_ retain]) {}
933+
934+
MetalKernelFunction::~MetalKernelFunction() {
935+
[cps release];
936+
}
937+
938+
void MetalKernelFunction::runCommandBlock(std::function<void(void)> run) {
939+
dispatch_sync_with_rethrow(getCurrentMPSStream()->queue(), ^() {
940+
@autoreleasepool {
941+
run();
942+
}
943+
});
944+
}
945+
946+
void MetalKernelFunction::startEncoding() {
947+
encoder = getCurrentMPSStream()->commandEncoder();
948+
[encoder setComputePipelineState:cps];
949+
}
950+
951+
void MetalKernelFunction::dispatch(uint64_t length, std::optional<uint64_t> group_size) {
952+
auto group_size_val = group_size.value_or(std::min(length, getMaxThreadsPerThreadgroup()));
953+
[encoder dispatchThreads:MTLSizeMake(length, 1, 1) threadsPerThreadgroup:MTLSizeMake(group_size_val, 1, 1)];
954+
}
955+
956+
void MetalKernelFunction::dispatch(std::array<uint64_t, 2> length, std::optional<std::array<uint64_t, 2>> group_size) {
957+
auto group_size_val =
958+
group_size.value_or(std::array<uint64_t, 2>{std::min(length[0], getMaxThreadsPerThreadgroup()), 1});
959+
[encoder dispatchThreads:MTLSizeMake(length[0], length[1], 1)
960+
threadsPerThreadgroup:MTLSizeMake(group_size_val[0], group_size_val[1], 1)];
961+
}
962+
963+
void MetalKernelFunction::setArg(unsigned idx, const at::TensorBase& t) {
964+
TORCH_CHECK(t.device().type() == kMPS, "Tensor must be on GPU");
965+
mtl_setBuffer(encoder, t, idx);
966+
}
967+
968+
void MetalKernelFunction::setArg(unsigned idx, const void* ptr, uint64_t size) {
969+
TORCH_CHECK(size > 0);
970+
[encoder setBytes:ptr length:size atIndex:idx];
971+
}
972+
973+
uint64_t MetalKernelFunction::getMaxThreadsPerThreadgroup() const {
974+
return [cps maxTotalThreadsPerThreadgroup];
975+
}
976+
977+
uint64_t MetalKernelFunction::getThreadExecutionWidth() const {
978+
return [cps threadExecutionWidth];
979+
}
980+
981+
uint64_t MetalKernelFunction::getStaticThreadGroupMemoryLength() const {
982+
return [cps staticThreadgroupMemoryLength];
983+
}
984+
919985
} // namespace at::native::mps

aten/src/ATen/test/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,8 @@ list(APPEND ATen_VEC_TEST_SRCS
110110

111111
list(APPEND ATen_MPS_TEST_SRCS
112112
${CMAKE_CURRENT_SOURCE_DIR}/mps_test_print.cpp
113-
${CMAKE_CURRENT_SOURCE_DIR}/mps_test_allocator.cpp)
113+
${CMAKE_CURRENT_SOURCE_DIR}/mps_test_allocator.cpp
114+
${CMAKE_CURRENT_SOURCE_DIR}/mps_test_metal_library.cpp)
114115
if(APPLE AND USE_MPS)
115116
list(APPEND ATen_MPS_TEST_SRCS
116117
${CMAKE_CURRENT_SOURCE_DIR}/mps_test_objc_interface.mm)
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#include <gtest/gtest.h>
2+
#include <stdexcept>
3+
#include <torch/torch.h>
4+
#include <ATen/native/mps/MetalShaderLibrary.h>
5+
6+
using namespace at::native::mps;
7+
TEST(MPSTestMetalLibrary, ShaderCreation) {
8+
MetalShaderLibrary lib("// Empty library");
9+
ASSERT_EQ(lib.getFunctionNames().size(), 0);
10+
}
11+
12+
TEST(MPSTestMetalLibrary, SyntaxErrorThrows) {
13+
ASSERT_THROW(new DynamicMetalShaderLibrary("printf(x);"), c10::Error);
14+
}
15+
16+
TEST(MPSTestMetalLibrary, ArangeShader) {
17+
auto y = torch::arange(10.0, at::device(at::kMPS));
18+
auto x = torch::empty(10, at::device(at::kMPS));
19+
DynamicMetalShaderLibrary lib(R"MTL(
20+
kernel void foo(device float* t, uint idx [[thread_position_in_grid]]) {
21+
t[idx] = idx;
22+
}
23+
)MTL");
24+
auto func = lib.getKernelFunction("foo");
25+
func->runCommandBlock([&] {
26+
func->startEncoding();
27+
func->setArg(0, x);
28+
func->dispatch(x.numel());
29+
});
30+
ASSERT_TRUE((x==y).all().item().toBool());
31+
}
32+
33+
TEST(MPSTestMetalLibrary, ArangeWithArgsShader) {
34+
const auto size = 10;
35+
const float start = .25;
36+
const float step = .4;
37+
auto x = torch::empty(size, at::device(at::kMPS));
38+
auto y = torch::arange(start, start + size * step, step, at::device(at::kMPS));
39+
ASSERT_EQ(x.numel(), y.numel());
40+
DynamicMetalShaderLibrary lib(R"MTL(
41+
kernel void foo(device float* t, constant float& start, constant float& step, uint idx [[thread_position_in_grid]]) {
42+
t[idx] = start + idx * step;
43+
}
44+
)MTL");
45+
auto func = lib.getKernelFunction("foo");
46+
func->runCommandBlock([&] {
47+
func->startEncoding();
48+
func->setArg(0, x);
49+
func->setArg(1, start);
50+
func->setArg(2, step);
51+
func->dispatch(x.numel());
52+
});
53+
ASSERT_TRUE((x==y).all().item().toBool());
54+
}
55+
TEST(MPSTestMetalLibrary, Arange2DShader) {
56+
const auto size = 16;
57+
auto x = torch::empty({size, size}, at::device(at::kMPS));
58+
DynamicMetalShaderLibrary lib(R"MTL(
59+
kernel void full(device float* t, constant ulong2& strides, uint2 idx [[thread_position_in_grid]]) {
60+
t[idx.x*strides.x + idx.y*strides.y] = idx.x + 33.0 * idx.y;
61+
}
62+
)MTL");
63+
auto func = lib.getKernelFunction("full");
64+
func->runCommandBlock([&] {
65+
func->startEncoding();
66+
func->setArg(0, x);
67+
func->setArg(1, x.strides());
68+
func->dispatch({static_cast<uint64_t>(x.size(0)), static_cast<uint64_t>(x.size(1))});
69+
});
70+
ASSERT_EQ(x.sum().item().to<int>(), 65280);
71+
}

0 commit comments

Comments (0)