From de8dfe761d28177bcc42e3b26488438779721ac9 Mon Sep 17 00:00:00 2001 From: Lingjie Date: Wed, 26 Jun 2024 20:46:38 +0800 Subject: [PATCH] perf(torch): fast but unsafe buildATen & eliminating dispatches (#1271) * change at::xx_out to at:;cuda::xx_out * perf: superfast but unsafe buildaten * modify func_ext buildaten * build(torch): add option to switch unsafe buildATen * style: format cpp * docs(torch): refine naming and docs --------- Co-authored-by: CoolKbh --- .clang-tidy | 2 + impl/torch/CMakeLists.txt | 7 +- impl/torch/build_aten.cpp | 154 +++ impl/torch/build_aten.hpp | 99 ++ impl/torch/functions/functions.cpp | 1588 ++++++++++++------------ impl/torch/functions/functions_ext.cpp | 10 +- impl/torch/helper.cpp | 98 +- impl/torch/helper.hpp | 15 +- 8 files changed, 1061 insertions(+), 912 deletions(-) create mode 100644 impl/torch/build_aten.cpp create mode 100644 impl/torch/build_aten.hpp diff --git a/.clang-tidy b/.clang-tidy index 3fb2391ede..08e1675705 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -50,6 +50,8 @@ CheckOptions: value: "_" - key: readability-identifier-naming.ParameterCase value: "camelBack" + - key: readability-identifier-naming.ParameterIgnoredRegexp + value: "^([a-z]+_)*[a-z]+$" - key: readability-identifier-naming.UnionCase value: "camelBack" - key: readability-identifier-naming.VariableCase diff --git a/impl/torch/CMakeLists.txt b/impl/torch/CMakeLists.txt index c97c79fb95..5419defb11 100644 --- a/impl/torch/CMakeLists.txt +++ b/impl/torch/CMakeLists.txt @@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.14) project(torch_impl) option(HIP "Whether to use HIP when available" OFF) +option(DIOPI_TORCH_UNSAFE_BUILDATEN "Whether to use fast but unsafe buildATen (caution: only use this with DIPU)" OFF) include(cmake/TorchBaseFunc.cmake) InitFindTorch() @@ -32,7 +33,7 @@ if (DYLOAD) set(IMPL_SRC wrap_func.cpp) endif() -file(GLOB REAL_IMPL_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} functions/functions_mmcv/*.cu functions/functions_ext/*.cu functions/*.cpp helper.cpp) +file(GLOB REAL_IMPL_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} functions/functions_mmcv/*.cu functions/functions_ext/*.cu functions/*.cpp helper.cpp build_aten.cpp) # adaptor set(USE_ADAPTOR ON) @@ -104,6 +105,10 @@ if(USE_ADAPTOR) add_dependencies(${DEVICEIMPL} adaptor_code_gen) endif() +if(DIOPI_TORCH_UNSAFE_BUILDATEN) + target_compile_definitions(${DEVICEIMPL} PRIVATE DIOPI_TORCH_UNSAFE_BUILDATEN) +endif() + if (TEST) add_subdirectory(test) endif() diff --git a/impl/torch/build_aten.cpp b/impl/torch/build_aten.cpp new file mode 100644 index 0000000000..10fa51423d --- /dev/null +++ b/impl/torch/build_aten.cpp @@ -0,0 +1,154 @@ +#include "build_aten.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "helper.hpp" + +namespace impl::aten { + +UnsafelyDeviceChangedTensorWrapper::UnsafelyDeviceChangedTensorWrapper(const at::Tensor& tensor) : at::Tensor(tensor) { + if (!defined() || is_cpu()) { + return; + } + saveForRevert_.emplace(unsafeGetTensorImpl(), device()); + // NOTE: CUDA allocators may have not been initialized if we were using DIPU allocators. + // We have to do this explicitly for potential allocations in op workspaces. + at::globalContext().lazyInitCUDA(); + at::Device newDevice{at::DeviceType::CUDA, device().index()}; + setTensorImplDeviceUnsafe({unsafeGetTensorImpl(), newDevice}); +} + +UnsafelyDeviceChangedTensorWrapper::~UnsafelyDeviceChangedTensorWrapper() { + if (saveForRevert_.has_value()) { + setTensorImplDeviceUnsafe(*saveForRevert_); + } +} + +UnsafelyDeviceChangedTensorWrapper buildATenUnsafe(diopiConstTensorHandle_t tensor) { + if (tensor == nullptr) { + return {}; + } + auto& atTensor = *reinterpret_cast(const_cast(tensor)); + return UnsafelyDeviceChangedTensorWrapper::createFromTensor(atTensor); +} + +void UnsafelyDeviceChangedTensorWrapper::setTensorImplDeviceUnsafe(const TensorImplAndDevice& tensorAndDevice) { + const auto& [tensorImpl, device] = tensorAndDevice; + auto& storage = const_cast(tensorImpl->unsafe_storage()); + auto& dataPtr = const_cast(storage.data_ptr()); + dataPtr.unsafe_set_device(device); + tensorImpl->set_storage_keep_dtype(std::move(storage)); + tensorImpl->_change_backend_component_keys(device); +} + +namespace { + +template +class BuildATenDeviceApi {}; + +template <> +class BuildATenDeviceApi { +public: + static void lazyInitDevice() {} + static at::Device device(diopiConstTensorHandle_t /*unused*/) { return {at::DeviceType::CPU}; } + static at::Tensor empty(at::IntArrayRef size, at::ScalarType dtype, at::Device /*unused*/) { + return at::detail::empty_cpu(size, dtype, /*pin_memory=*/false, /*memory_format_opt=*/c10::nullopt); + } +}; + +template <> +class BuildATenDeviceApi { +public: + static void lazyInitDevice() { at::globalContext().lazyInitCUDA(); } + static at::Device device(diopiConstTensorHandle_t tensor) { + diopiDeviceIndex_t deviceIndex; + diopiGetTensorDeviceIndex(tensor, &deviceIndex); + return {at::DeviceType::CUDA, deviceIndex}; + } + static at::Tensor empty(at::IntArrayRef size, at::ScalarType dtype, at::Device device) { + return at::detail::empty_cuda(size, dtype, device, /*memory_format_opt=*/c10::nullopt); + } +}; + +template +at::Tensor buildATenSafeImpl(diopiConstTensorHandle_t tensor) { + diopiSize_t shape; + diopiGetTensorShape(tensor, &shape); + at::IntArrayRef atSizes(shape.data, shape.len); + + diopiDtype_t dtype; + diopiGetTensorDtype(tensor, &dtype); + auto atTypeMeta = getATenType(dtype); + auto atDtype = atTypeMeta.toScalarType(); + + auto atDevice = DeviceImpl::device(tensor); + + // NOTE: storage offset has been handled in `diopiGetTensorData` + void* data = nullptr; + diopiGetTensorData(const_cast(tensor), &data); + + if (data == nullptr) { + return DeviceImpl::empty(atSizes, atDtype, atDevice); + } + + // NOTE: CUDA allocators may have not been initialized if we were using DIPU allocators. + // We have to do this explicitly for potential allocations in op workspaces. + DeviceImpl::lazyInitDevice(); + + // PERF: It would be faster if we can obtain and reuse the storage from tensor. + // However we cannot assume diopiTensorHandle_t to be a wrapper of at::Tensor. + // So we have to create a new storage (offset = 0) whose data_ptr points to + // the same address but with an empty dtor (to avoid double-free). + + diopiSize_t stride; + diopiGetTensorStride(tensor, &stride); + at::IntArrayRef atStrides(stride.data, stride.len); + + auto storageNBytes = at::detail::computeStorageNbytes(atSizes, atStrides, atTypeMeta.itemsize()); + + // NOTE: in this way, data_ptr will have an empty destructor + at::Storage storage{at::Storage::use_byte_size_t{}, storageNBytes, /*data_ptr=*/{data, atDevice}}; + + auto dk = at::computeDispatchKey(atDtype, /*layout=*/c10::nullopt, atDevice); + at::Tensor atTensor = at::detail::make_tensor(std::move(storage), dk, atTypeMeta); + atTensor.unsafeGetTensorImpl()->set_sizes_and_strides(atSizes, atStrides); + + return atTensor; +} + +} // namespace + +at::Tensor buildATenSafe(diopiConstTensorHandle_t tensor) { + if (tensor == nullptr) { + return at::Tensor(); + } + + diopiDevice_t device; + diopiGetTensorDevice(tensor, &device); + switch (device) { + case diopi_host: + return buildATenSafeImpl>(tensor); + case diopi_device: + return buildATenSafeImpl>(tensor); + default: + TORCH_CHECK(false, "Invalid device type encountered in buildATen: ", device); + return {}; + } +} + +} // namespace impl::aten diff --git a/impl/torch/build_aten.hpp b/impl/torch/build_aten.hpp new file mode 100644 index 0000000000..4a4b099b41 --- /dev/null +++ b/impl/torch/build_aten.hpp @@ -0,0 +1,99 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "diopi/diopirt.h" + +namespace impl::aten { + +// This class is a wrapper around an at::Tensor, which changes the device and dispatch key of the binded at::TensorImpl from any device (e.g. XPU) to CUDA, and +// revert it back when the wrapper is destroyed. +// The wrapper is designed to be implicitly converted to an at::Tensor (object slicing), so that it can be used in place of an at::Tensor. +class UnsafelyDeviceChangedTensorWrapper : public at::Tensor { +public: + static UnsafelyDeviceChangedTensorWrapper createFromTensor(const at::Tensor& tensor) { return UnsafelyDeviceChangedTensorWrapper(tensor); } + UnsafelyDeviceChangedTensorWrapper() = default; + ~UnsafelyDeviceChangedTensorWrapper(); + UnsafelyDeviceChangedTensorWrapper(const UnsafelyDeviceChangedTensorWrapper& other) : at::Tensor(other) {} + UnsafelyDeviceChangedTensorWrapper(UnsafelyDeviceChangedTensorWrapper&& other) : at::Tensor(std::move(other)) { saveForRevert_.swap(other.saveForRevert_); } + UnsafelyDeviceChangedTensorWrapper& operator=(const UnsafelyDeviceChangedTensorWrapper& other) = delete; + UnsafelyDeviceChangedTensorWrapper& operator=(UnsafelyDeviceChangedTensorWrapper&& other) { + at::Tensor::operator=(std::move(other)); + saveForRevert_.swap(other.saveForRevert_); + return *this; + } + UnsafelyDeviceChangedTensorWrapper& operator=(const at::Tensor& other) { + at::Tensor::operator=(other); + return *this; + } + UnsafelyDeviceChangedTensorWrapper& operator=(at::Tensor&& other) { + at::Tensor::operator=(std::move(other)); + return *this; + } + +private: + explicit UnsafelyDeviceChangedTensorWrapper(const at::Tensor& tensor); + using TensorImplAndDevice = std::pair; + static void setTensorImplDeviceUnsafe(const TensorImplAndDevice& tensorAndDevice); + c10::optional saveForRevert_ = c10::nullopt; +}; + +// WARNING: This function is UNSAFE. It is the caller's responsibility to ensure that: +// 1. The returned wrapper is not destroyed when its sliced at::Tensor is still in use in DIOPI. +// 2. The input diopiConstTensorHandle_t is actually a reinterpret_cast of an at::Tensor*. +// 3. The input tensor and its storage are not used in another thread during the lifetime of the returned wrapper. +[[nodiscard]] UnsafelyDeviceChangedTensorWrapper buildATenUnsafe(diopiConstTensorHandle_t tensor); + +[[nodiscard]] at::Tensor buildATenSafe(diopiConstTensorHandle_t tensor); + +[[nodiscard]] inline auto buildATen(diopiConstTensorHandle_t tensor) { +#if DIOPI_TORCH_UNSAFE_BUILDATEN + return buildATenUnsafe(tensor); +#else + return buildATenSafe(tensor); +#endif +} + +template +[[nodiscard]] auto buildATenList(T* tensors, int64_t numTensors) { + using TensorType = decltype(buildATen(std::declval())); + c10::SmallVector vecAtTensor; + vecAtTensor.reserve(numTensors); + std::transform(tensors, tensors + numTensors, std::back_inserter(vecAtTensor), [](auto tensor) { return buildATen(tensor); }); + return vecAtTensor; +} + +// These macros is designed to avoid early destruction of the wrapper when build optional at::Tensor. +#define DIOPI_IMPL_BUILD_ATEN_LIST(atTensors, diopiTensors, numTensors) \ + auto atTensors##__MAYBE_WRAPPER = ::impl::aten::buildATenList(diopiTensors, numTensors); \ + c10::SmallVector atTensors; \ + atTensors.reserve(numTensors); \ + std::transform(atTensors##__MAYBE_WRAPPER.begin(), atTensors##__MAYBE_WRAPPER.end(), std::back_inserter(atTensors), [](auto& tensor) { \ + return static_cast(tensor); \ + }); +#define DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atTensor, diopiTensor) \ + auto atTensor##__MAYBE_WRAPPER = ::impl::aten::buildATen(diopiTensor); \ + c10::optional atTensor; \ + if (atTensor##__MAYBE_WRAPPER.defined()) { \ + atTensor = atTensor##__MAYBE_WRAPPER; \ + } +#define DIOPI_IMPL_BUILD_ATEN_OPTIONAL_LIST(atTensors, diopiTensors, numTensors) \ + auto atTensors##__MAYBE_WRAPPER = ::impl::aten::buildATenList(diopiTensors, numTensors); \ + c10::List> atTensors; \ + atTensors.reserve(numTensors); \ + std::transform(atTensors##__MAYBE_WRAPPER.begin(), atTensors##__MAYBE_WRAPPER.end(), std::back_inserter(atTensors), [](auto& tensor) { \ + return tensor.defined() ? c10::optional(tensor) : c10::nullopt; \ + }); + +} // namespace impl::aten diff --git a/impl/torch/functions/functions.cpp b/impl/torch/functions/functions.cpp index beac2dc9da..2a17e36424 100644 --- a/impl/torch/functions/functions.cpp +++ b/impl/torch/functions/functions.cpp @@ -6,11 +6,16 @@ #include #include #include -#include #include #include #include +// clang-format off +// NOTE: this header does not include all its dependencies, so we need to keep the order of the includes +#include +// clang-format on + +#include #include #ifdef USE_HIP @@ -19,6 +24,10 @@ #define FLT_MIN __FLT_MIN__ +#define CALL_ATEN_FUNC(func, ...) at::func(__VA_ARGS__) + +#define CALL_ATEN_CUDA_FUNC(func, ...) at::cuda::func(__VA_ARGS__) + #include "../helper.hpp" #include "../vision_kernel.h" @@ -58,36 +67,37 @@ const char* diopiGetImplVersion() { diopiError_t diopiRelu(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atInput = impl::aten::buildATen(input); - at::relu_out(atOut, atInput); + auto atOut = impl::aten::buildATen(out); + auto atInput = impl::aten::buildATen(input); + at::native::copy_(atOut, atInput, true); + CALL_ATEN_CUDA_FUNC(relu_, atOut); return diopiSuccess; } diopiError_t diopiReluInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::relu_(atInput); + auto atInput = impl::aten::buildATen(input); + CALL_ATEN_CUDA_FUNC(relu_, atInput); return diopiSuccess; } diopiError_t diopiLeakyRelu(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* negative_slope) { impl::aten::setCurStream(ctx); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atSlope = impl::aten::buildAtScalar(negative_slope); - at::leaky_relu_out(atOut, atInput, atSlope); + auto atOut = impl::aten::buildATen(out); + auto atInput = impl::aten::buildATen(input); + auto atSlope = impl::aten::buildAtScalar(negative_slope); + CALL_ATEN_CUDA_FUNC(leaky_relu_out, atOut, atInput, atSlope); return diopiSuccess; } diopiError_t diopiLeakyReluInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* negative_slope) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atSlope = impl::aten::buildAtScalar(negative_slope); - at::leaky_relu_(atInput, atSlope); + auto atInput = impl::aten::buildATen(input); + auto atSlope = impl::aten::buildAtScalar(negative_slope); + CALL_ATEN_CUDA_FUNC(leaky_relu_, atInput, atSlope); return diopiSuccess; } @@ -95,13 +105,14 @@ diopiError_t diopiLeakyReluInp(diopiContextHandle_t ctx, diopiTensorHandle_t inp diopiError_t diopiMaxPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, bool ceil_mode) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); at::IntArrayRef atDilation = impl::aten::buildAtIntArray(dilation); bool atCeilMode = ceil_mode; - impl::aten::invokeATenFuncRet(ctx, at::max_pool2d, out, atInput, atKernelSize, atStride, atPadding, atDilation, atCeilMode); + auto atOut = CALL_ATEN_FUNC(max_pool2d, atInput, atKernelSize, atStride, atPadding, atDilation, atCeilMode); + impl::aten::updateATen2Tensor(ctx, atOut, out); return diopiSuccess; } @@ -109,15 +120,15 @@ diopiError_t diopiMaxPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, d diopiError_t diopiMaxPool2dWithIndices(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t indices, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, bool ceil_mode) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); at::IntArrayRef atDilation = impl::aten::buildAtIntArray(dilation); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atIndices = impl::aten::buildATen(indices); + auto atOut = impl::aten::buildATen(out); + auto atIndices = impl::aten::buildATen(indices); bool atCeilMode = ceil_mode; - at::max_pool2d_with_indices_out(atOut, atIndices, atInput, atKernelSize, atStride, atPadding, atDilation, atCeilMode); + CALL_ATEN_CUDA_FUNC(max_pool2d_with_indices_out, atOut, atIndices, atInput, atKernelSize, atStride, atPadding, atDilation, atCeilMode); return diopiSuccess; } @@ -129,11 +140,11 @@ diopiError_t diopiMaxPool2dWithIndices(diopiContextHandle_t ctx, diopiTensorHand diopiError_t diopiDiv(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other, diopiRoundMode_t rounding_mode) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - at::Tensor atOut = impl::aten::buildATen(out); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + auto atOut = impl::aten::buildATen(out); auto roundingMode = impl::aten::getRoundingMode(rounding_mode); - at::div_out(atOut, atInput, atOther, roundingMode); + CALL_ATEN_CUDA_FUNC(div_out, atOut, atInput, atOther, roundingMode); return diopiSuccess; } @@ -144,10 +155,10 @@ diopiError_t diopiDiv(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiCo */ diopiError_t diopiDivInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other, diopiRoundMode_t rounding_mode) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); auto roundingMode = impl::aten::getRoundingMode(rounding_mode); - atInput.div_(atOther, roundingMode); + CALL_ATEN_CUDA_FUNC(div_, atInput, atOther, roundingMode); return diopiSuccess; } @@ -163,7 +174,7 @@ diopiError_t diopiDivScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, d auto atOther = impl::aten::buildAtScalar(other); auto roundingMode = impl::aten::getRoundingMode(rounding_mode); auto atOut = impl::aten::buildATen(out); - at::div_out(atOut, atInput, c10::scalar_to_tensor(atOther), roundingMode); + CALL_ATEN_CUDA_FUNC(div_out, atOut, atInput, c10::scalar_to_tensor(atOther), roundingMode); return diopiSuccess; } @@ -177,7 +188,7 @@ diopiError_t diopiDivInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t inp auto atInput = impl::aten::buildATen(input); auto atOther = impl::aten::buildAtScalar(other); auto roundingMode = impl::aten::getRoundingMode(rounding_mode); - atInput.div_(atOther, roundingMode); + CALL_ATEN_CUDA_FUNC(div_, atInput, c10::scalar_to_tensor(atOther), roundingMode); return diopiSuccess; } @@ -195,14 +206,16 @@ diopiError_t diopiConvolution2d(diopiContextHandle_t ctx, diopiTensorHandle_t ou if (torch::cuda::cudnn_is_available()) { DIOPI_CHECK(atInput.options().type_equal(atWeight.options()), "Input type and weight type should be the same"); DIOPI_CHECK(!atBias.defined() || (atInput.options().type_equal(atBias.options())), "Input type and bias type should be the same"); - at::cudnn_convolution_out(atOut, atInput, atWeight, atPadding, atStride, atDilation, groups, false, false, true); + auto tempOut = CALL_ATEN_CUDA_FUNC(cudnn_convolution, atInput, atWeight, atPadding, atStride, atDilation, groups, false, false, true); + at::native::copy_(atOut, tempOut, true); if (atBias.defined()) { std::vector shape(atInput.dim(), 1); shape[1] = -1; - atOut.add_(atBias.reshape(shape)); + CALL_ATEN_CUDA_FUNC(add_, atOut, atBias.reshape(shape)); } } else { - at::convolution_out(atOut, atInput, atWeight, atBias, atStride, atPadding, atDilation, false, at::IntArrayRef(0), groups); + // not supported cuda dispatch yet, will supported in subsequent release. + CALL_ATEN_FUNC(convolution_out, atOut, atInput, atWeight, atBias, atStride, atPadding, atDilation, false, at::IntArrayRef(0), groups); } return diopiSuccess; @@ -239,7 +252,7 @@ diopiError_t diopiBmm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiCo auto atInput = impl::aten::buildATen(input); auto atMat2 = impl::aten::buildATen(mat2); auto atOut = impl::aten::buildATen(out); - at::bmm_out(atOut, atInput, atMat2); + CALL_ATEN_CUDA_FUNC(bmm_out, atOut, atInput, atMat2); return diopiSuccess; } @@ -251,7 +264,7 @@ diopiError_t diopiBaddbmm(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio auto atOut = impl::aten::buildATen(out); auto atBatch1 = impl::aten::buildATen(batch1); auto atBatch2 = impl::aten::buildATen(batch2); - at::baddbmm_out(atOut, atInput, atBatch1, atBatch2, beta, alpha); + CALL_ATEN_CUDA_FUNC(baddbmm_out, atOut, atInput, atBatch1, atBatch2, beta, alpha); return diopiSuccess; } @@ -262,7 +275,7 @@ diopiError_t diopiBaddbmmInp(diopiContextHandle_t ctx, diopiTensorHandle_t input auto atInput = impl::aten::buildATen(input); auto atBatch1 = impl::aten::buildATen(batch1); auto atBatch2 = impl::aten::buildATen(batch2); - atInput.baddbmm_(atBatch1, atBatch2, beta, alpha); + CALL_ATEN_CUDA_FUNC(baddbmm_, atInput, atBatch1, atBatch2, beta, alpha); return diopiSuccess; } @@ -270,12 +283,12 @@ diopiError_t diopiBaddbmmInp(diopiContextHandle_t ctx, diopiTensorHandle_t input diopiError_t diopiAddcmul(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t tensor1, diopiConstTensorHandle_t tensor2, const diopiScalar_t* value) { impl::aten::setCurStream(ctx); - at::Tensor atOut = impl::aten::buildATen(out); + auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); auto atTensor1 = impl::aten::buildATen(tensor1); auto atTensor2 = impl::aten::buildATen(tensor2); auto atValue = impl::aten::buildAtScalar(value); - at::addcmul_out(atOut, atInput, atTensor1, atTensor2, atValue); + CALL_ATEN_CUDA_FUNC(addcmul_out, atOut, atInput, atTensor1, atTensor2, atValue); return diopiSuccess; } @@ -287,19 +300,20 @@ diopiError_t diopiAddcmulInp(diopiContextHandle_t ctx, diopiTensorHandle_t input auto atTensor1 = impl::aten::buildATen(tensor1); auto atTensor2 = impl::aten::buildATen(tensor2); auto atValue = impl::aten::buildAtScalar(value); - atInput.addcmul_(atTensor1, atTensor2, atValue); + CALL_ATEN_CUDA_FUNC(addcmul_, atInput, atTensor1, atTensor2, atValue); return diopiSuccess; } diopiError_t diopiMatmul(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atOut = impl::aten::buildATen(out); + auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); auto atOther = impl::aten::buildATen(other); // Note(huqingqing): pytorch optimize the bmm case by folding the batch into the first dimension. // It changes the shape of output and causes warnning when using matmul_out. - at::matmul_out(atOut, atInput, atOther); + // not supported cuda dispatch yet, will supported in subsequent release. + CALL_ATEN_FUNC(matmul_out, atOut, atInput, atOther); return diopiSuccess; } @@ -312,7 +326,7 @@ diopiError_t diopiAddcdiv(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio auto atTensor2 = impl::aten::buildATen(tensor2); auto atValue = impl::aten::buildAtScalar(value); auto atOut = impl::aten::buildATen(out); - at::addcdiv_out(atOut, atInput, atTensor1, atTensor2, atValue); + CALL_ATEN_CUDA_FUNC(addcdiv_out, atOut, atInput, atTensor1, atTensor2, atValue); return diopiSuccess; } @@ -324,7 +338,7 @@ diopiError_t diopiAddcdivInp(diopiContextHandle_t ctx, diopiTensorHandle_t input auto atTensor1 = impl::aten::buildATen(tensor1); auto atTensor2 = impl::aten::buildATen(tensor2); auto atValue = impl::aten::buildAtScalar(value); - atInput.addcdiv_(atTensor1, atTensor2, atValue); + CALL_ATEN_CUDA_FUNC(addcdiv_, atInput, atTensor1, atTensor2, atValue); return diopiSuccess; } @@ -339,7 +353,7 @@ diopiError_t diopiAddmm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopi auto atBeta = impl::aten::buildAtScalar(beta); auto atAlpha = impl::aten::buildAtScalar(alpha); auto atOut = impl::aten::buildATen(out); - at::addmm_out(atOut, atInput, atMax1, atMax2, atBeta, atAlpha); + CALL_ATEN_CUDA_FUNC(addmm_out, atOut, atInput, atMax1, atMax2, atBeta, atAlpha); return diopiSuccess; } @@ -354,7 +368,7 @@ diopiError_t diopiMean(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC if (atInput.dim() == atOut.dim()) { keepdim = true; } - at::mean_out(atOut, atInput, atDim, keepdim); // TODO(fengsibo): use default type instead + CALL_ATEN_CUDA_FUNC(mean_out, atOut, atInput, atDim, keepdim); // TODO(fengsibo): use default type instead return diopiSuccess; } @@ -370,7 +384,7 @@ diopiError_t diopiSum(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiCo if (atInput.dim() == atOut.dim()) { keepdim = true; } - at::sum_out(atOut, atInput, atDim, keepdim); + CALL_ATEN_CUDA_FUNC(sum_out, atOut, atInput, atDim, keepdim); return diopiSuccess; } @@ -384,7 +398,7 @@ diopiError_t diopiStd(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiCo if (atInput.dim() == atOut.dim()) { keepdim = true; } - at::std_out(atOut, atInput, atDim, unbiased, keepdim); + CALL_ATEN_CUDA_FUNC(std_out, atOut, atInput, atDim, unbiased, keepdim); return diopiSuccess; } @@ -398,7 +412,7 @@ diopiError_t diopiMin(diopiContextHandle_t ctx, diopiTensorHandle_t min, diopiTe if (atInput.dim() == atOut.dim()) { keepdim = true; } - at::min_out(atOut, atIndices, atInput, dim, keepdim); + CALL_ATEN_CUDA_FUNC(min_out, atOut, atIndices, atInput, dim, keepdim); return diopiSuccess; } @@ -406,8 +420,8 @@ diopiError_t diopiMin(diopiContextHandle_t ctx, diopiTensorHandle_t min, diopiTe diopiError_t diopiMinAll(diopiContextHandle_t ctx, diopiTensorHandle_t min, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); - auto atOut = impl::aten::buildATen(min); - impl::aten::invokeATenFuncRet(ctx, at::min, min, atInput); + auto atOut = CALL_ATEN_CUDA_FUNC(min, atInput); + impl::aten::updateATen2Tensor(ctx, atOut, min); return diopiSuccess; } @@ -421,7 +435,7 @@ diopiError_t diopiMax(diopiContextHandle_t ctx, diopiTensorHandle_t max, diopiTe if (atInput.dim() == atOut.dim()) { keepdim = true; } - at::max_out(atOut, atIndices, atInput, dim, keepdim); + CALL_ATEN_CUDA_FUNC(max_out, atOut, atIndices, atInput, dim, keepdim); return diopiSuccess; } @@ -429,7 +443,8 @@ diopiError_t diopiMax(diopiContextHandle_t ctx, diopiTensorHandle_t max, diopiTe diopiError_t diopiMaxAll(diopiContextHandle_t ctx, diopiTensorHandle_t max, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); - impl::aten::invokeATenFuncRet(ctx, at::max, max, atInput); + auto atOut = CALL_ATEN_CUDA_FUNC(max, atInput); + impl::aten::updateATen2Tensor(ctx, atOut, max); return diopiSuccess; } @@ -443,9 +458,9 @@ diopiError_t diopiAny(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiCo keepdim = true; } if (dim == nullptr) { - at::any_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(any_out, atOut, atInput); } else { - at::any_out(atOut, atInput, *dim, keepdim); + CALL_ATEN_CUDA_FUNC(any_out, atOut, atInput, *dim, keepdim); } return diopiSuccess; @@ -460,9 +475,9 @@ diopiError_t diopiAll(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiCo keepdim = true; } if (dim == nullptr) { - at::all_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(all_out, atOut, atInput); } else { - at::all_out(atOut, atInput, *dim, keepdim); + CALL_ATEN_CUDA_FUNC(all_out, atOut, atInput, *dim, keepdim); } return diopiSuccess; @@ -472,7 +487,7 @@ diopiError_t diopiSoftmax(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); - at::softmax_out(atOut, atInput, dim); + CALL_ATEN_CUDA_FUNC(_softmax_out, atOut, atInput, dim, false); return diopiSuccess; } @@ -481,7 +496,7 @@ diopiError_t diopiLogSoftmax(diopiContextHandle_t ctx, diopiTensorHandle_t out, impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); - at::log_softmax_out(atOut, atInput, dim); + CALL_ATEN_CUDA_FUNC(_log_softmax_out, atOut, atInput, dim, false); return diopiSuccess; } @@ -491,7 +506,7 @@ diopiError_t diopiIndexSelect(diopiContextHandle_t ctx, diopiTensorHandle_t out, auto atInput = impl::aten::buildATen(input); auto atIndex = impl::aten::buildATen(index); auto atOut = impl::aten::buildATen(out); - at::index_select_out(atOut, atInput, dim, atIndex); + CALL_ATEN_CUDA_FUNC(index_select_out, atOut, atInput, dim, atIndex); return diopiSuccess; } @@ -499,7 +514,7 @@ diopiError_t diopiIndexSelect(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiError_t diopiSelect(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t dim, int64_t index) { impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); - at::Tensor atOut = at::select(atInput, dim, index).contiguous(); + auto atOut = at::select(atInput, dim, index).contiguous(); impl::aten::updateATen2Tensor(ctx, atOut, out); return diopiSuccess; @@ -512,7 +527,8 @@ diopiError_t diopiMaskedScatter(diopiContextHandle_t ctx, diopiTensorHandle_t ou auto atMask = impl::aten::buildATen(mask); auto atSource = impl::aten::buildATen(source); auto atOut = impl::aten::buildATen(out); - at::masked_scatter_out(atOut, atInput, atMask, atSource); + at::native::copy_(atOut, atInput, true); + CALL_ATEN_CUDA_FUNC(masked_scatter_, atOut, atMask, atSource); return diopiSuccess; } @@ -545,7 +561,8 @@ diopiError_t diopiLinear(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop auto atInput = impl::aten::buildATen(input); auto atWeight = impl::aten::buildATen(weight); auto atBias = impl::aten::buildATen(bias); - at::linear_out(atOut, atInput, atWeight, atBias); + // not supported cuda dispatch yet, will supported in subsequent release. + CALL_ATEN_FUNC(linear_out, atOut, atInput, atWeight, atBias); return diopiSuccess; } @@ -647,7 +664,7 @@ diopiError_t diopiEmbeddingRenorm_(diopiContextHandle_t ctx, diopiTensorHandle_t impl::aten::setCurStream(ctx); auto atSelf = impl::aten::buildATen(inout); auto atIndices = impl::aten::buildATen(indices); - at::embedding_renorm_(atSelf, atIndices, max_norm, norm_type); + CALL_ATEN_CUDA_FUNC(embedding_renorm_, atSelf, atIndices, max_norm, norm_type); return diopiSuccess; } @@ -658,7 +675,8 @@ diopiError_t diopiEmbedding(diopiContextHandle_t ctx, diopiTensorHandle_t out, d auto atWeight = impl::aten::buildATen(weight); auto atIndices = impl::aten::buildATen(indices); auto atOut = impl::aten::buildATen(out); - at::embedding_out(atOut, atWeight, atIndices, paddingIdx, scaleGradByFreq, sparse); + // not supported cuda dispatch yet, will supported in subsequent release. + CALL_ATEN_FUNC(embedding_out, atOut, atWeight, atIndices, paddingIdx, scaleGradByFreq, sparse); return diopiSuccess; } @@ -667,7 +685,7 @@ diopiError_t diopiTril(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); - at::tril_out(atOut, atInput, diagonal); + CALL_ATEN_CUDA_FUNC(tril_out, atOut, atInput, diagonal); return diopiSuccess; } @@ -675,9 +693,9 @@ diopiError_t diopiTril(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC diopiError_t diopiCat(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t* tensors, int64_t insNum, int64_t dim) { impl::aten::setCurStream(ctx); DIOPI_CHECK_PTR(tensors); - auto tensorList = impl::aten::buildATenList(tensors, insNum); + DIOPI_IMPL_BUILD_ATEN_LIST(tensorList, tensors, insNum); auto atOut = impl::aten::buildATen(out); - at::cat_out(atOut, tensorList, dim); + CALL_ATEN_CUDA_FUNC(cat_out, atOut, tensorList, dim); return diopiSuccess; } @@ -699,10 +717,11 @@ diopiError_t diopiSplitWithSizes(diopiContextHandle_t ctx, diopiTensorHandle_t* diopiError_t diopiStack(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t* tensors, int64_t numTensors, int64_t dim) { impl::aten::setCurStream(ctx); DIOPI_CHECK_PTR(tensors); - auto tensorList = impl::aten::buildATenList(tensors, numTensors); + DIOPI_IMPL_BUILD_ATEN_LIST(tensorList, tensors, numTensors); auto atOut = impl::aten::buildATen(out); - at::stack_out(atOut, tensorList, dim); + // not supported cuda dispatch yet, will supported in subsequent release. + CALL_ATEN_FUNC(stack_out, atOut, tensorList, dim); return diopiSuccess; } @@ -714,10 +733,10 @@ diopiError_t diopiSort(diopiContextHandle_t ctx, diopiTensorHandle_t values, dio auto atValues = impl::aten::buildATen(values); auto atIndices = impl::aten::buildATen(indices); #if TORCH_MM_VERSION <= TORCH_1_8_MM_VERSION - at::sort_out(atValues, atIndices, atInput, dim, descending); + CALL_ATEN_CUDA_FUNC(sort_out, atValues, atIndices, atInput, dim, descending); #else c10::optional atStable = stable ? c10::optional(*stable) : c10::optional(false); - at::sort_out(atValues, atIndices, atInput, atStable, dim, descending); + CALL_ATEN_CUDA_FUNC(sort_out, atValues, atIndices, atInput, atStable, dim, descending); #endif return diopiSuccess; @@ -729,16 +748,16 @@ diopiError_t diopiTopk(diopiContextHandle_t ctx, diopiTensorHandle_t values, dio auto atInput = impl::aten::buildATen(input); auto atValues = impl::aten::buildATen(values); auto atIndices = impl::aten::buildATen(indices); - at::topk_out(atValues, atIndices, atInput, k, dim, largest, sorted); + CALL_ATEN_CUDA_FUNC(topk_out, atValues, atIndices, atInput, k, dim, largest, sorted); return diopiSuccess; } diopiError_t diopiTranspose(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t dim0, int64_t dim1) { impl::aten::setCurStream(ctx); - at::Tensor atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); - impl::aten::invokeATenFuncRet(ctx, at::transpose, out, atInput, dim0, dim1); + auto atOut = CALL_ATEN_FUNC(transpose, atInput, dim0, dim1); + impl::aten::updateATen2Tensor(ctx, atOut, out); return diopiSuccess; } @@ -746,7 +765,8 @@ diopiError_t diopiTranspose(diopiContextHandle_t ctx, diopiTensorHandle_t out, d diopiError_t diopiOneHot(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t numClasses) { impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); - impl::aten::invokeATenFuncRet(ctx, at::one_hot, out, atInput, numClasses); + auto atOut = CALL_ATEN_FUNC(one_hot, atInput, numClasses); + impl::aten::updateATen2Tensor(ctx, atOut, out); return diopiSuccess; } @@ -754,27 +774,27 @@ diopiError_t diopiOneHot(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop diopiError_t diopiWhere(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t condition, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atOut = impl::aten::buildATen(out); + auto atOut = impl::aten::buildATen(out); auto atCondition = impl::aten::buildATen(condition); auto atInput = impl::aten::buildATen(input); auto atOther = impl::aten::buildATen(other); - at::where_out(atOut, atCondition, atInput, atOther); + CALL_ATEN_CUDA_FUNC(where_out, atOut, atCondition, atInput, atOther); return diopiSuccess; } diopiError_t diopiSin(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::sin_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(sin_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiSinInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::sin_(atInput); return diopiSuccess; @@ -782,16 +802,16 @@ diopiError_t diopiSinInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { diopiError_t diopiCos(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::cos_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(cos_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiCosInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::cos_(atInput); return diopiSuccess; @@ -799,16 +819,16 @@ diopiError_t diopiCosInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { diopiError_t diopiAbs(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::abs_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(abs_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiAbsInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::abs_(atInput); return diopiSuccess; @@ -816,16 +836,16 @@ diopiError_t diopiAbsInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { diopiError_t diopiSqrt(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::sqrt_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(sqrt_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiSqrtInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::sqrt_(atInput); return diopiSuccess; @@ -833,16 +853,16 @@ diopiError_t diopiSqrtInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { diopiError_t diopiRsqrt(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::rsqrt_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(rsqrt_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiRsqrtInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::rsqrt_(atInput); return diopiSuccess; @@ -850,16 +870,16 @@ diopiError_t diopiRsqrtInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) diopiError_t diopiFloor(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::floor_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(floor_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiFloorInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::floor_(atInput); return diopiSuccess; @@ -867,16 +887,16 @@ diopiError_t diopiFloorInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) diopiError_t diopiNeg(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::neg_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(neg_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiNegInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::neg_(atInput); return diopiSuccess; @@ -884,25 +904,25 @@ diopiError_t diopiNegInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { diopiError_t diopiSign(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::sign_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(sign_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiTanh(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::tanh_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(tanh_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiTanhInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::tanh_(atInput); return diopiSuccess; @@ -910,16 +930,16 @@ diopiError_t diopiTanhInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { diopiError_t diopiAtan(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::atan_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(atan_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiAtanInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::atan_(atInput); return diopiSuccess; @@ -927,16 +947,16 @@ diopiError_t diopiAtanInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { diopiError_t diopiSigmoid(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::sigmoid_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(sigmoid_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiSigmoidInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::sigmoid_(atInput); return diopiSuccess; @@ -944,7 +964,7 @@ diopiError_t diopiSigmoidInp(diopiContextHandle_t ctx, diopiTensorHandle_t input diopiError_t diopiSiluInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::silu_(atInput); return diopiSuccess; @@ -952,9 +972,9 @@ diopiError_t diopiSiluInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { diopiError_t diopiSilu(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::silu_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(silu_out, atOut, atInput); return diopiSuccess; } @@ -963,23 +983,23 @@ diopiError_t diopiSiluBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gra auto atGradInput = impl::aten::buildATen(grad_input); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); - at::silu_backward_out(atGradInput, atGradOutput, atInput); + CALL_ATEN_CUDA_FUNC(silu_backward_out, atGradInput, atGradOutput, atInput); return diopiSuccess; } diopiError_t diopiExp(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::exp_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(exp_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiExpInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::exp_(atInput); return diopiSuccess; @@ -987,16 +1007,16 @@ diopiError_t diopiExpInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { diopiError_t diopiLog(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::log_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(log_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiLogInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::log_(atInput); return diopiSuccess; @@ -1004,16 +1024,16 @@ diopiError_t diopiLogInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { diopiError_t diopiLog2(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::log2_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(log2_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiLog2Inp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::log2_(atInput); return diopiSuccess; @@ -1021,16 +1041,16 @@ diopiError_t diopiLog2Inp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { diopiError_t diopiLog10(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::log10_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(log10_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiLog10Inp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::log10_(atInput); return diopiSuccess; @@ -1038,16 +1058,16 @@ diopiError_t diopiLog10Inp(diopiContextHandle_t ctx, diopiTensorHandle_t input) diopiError_t diopiErf(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::erf_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(erf_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiErfInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::erf_(atInput); return diopiSuccess; @@ -1055,48 +1075,48 @@ diopiError_t diopiErfInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { diopiError_t diopiPowScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, const diopiScalar_t* input, diopiConstTensorHandle_t exponent) { impl::aten::setCurStream(ctx); - at::Tensor atExponent = impl::aten::buildATen(exponent); - at::Scalar atInput = impl::aten::buildAtScalar(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::pow_out(atOut, atInput, atExponent); + auto atExponent = impl::aten::buildATen(exponent); + auto atInput = impl::aten::buildAtScalar(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(pow_out, atOut, atInput, atExponent); return diopiSuccess; } diopiError_t diopiPow(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* exponent) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atExponent = impl::aten::buildAtScalar(exponent); - at::Tensor atOut = impl::aten::buildATen(out); - at::pow_out(atOut, atInput, atExponent); + auto atInput = impl::aten::buildATen(input); + auto atExponent = impl::aten::buildAtScalar(exponent); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(pow_out, atOut, atInput, atExponent); return diopiSuccess; } diopiError_t diopiPowInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* exponent) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atExponent = impl::aten::buildAtScalar(exponent); - atInput.pow_(atExponent); + auto atInput = impl::aten::buildATen(input); + auto atExponent = impl::aten::buildAtScalar(exponent); + CALL_ATEN_CUDA_FUNC(pow_, atInput, atExponent); return diopiSuccess; } diopiError_t diopiPowTensor(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t exponent) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atExponent = impl::aten::buildATen(exponent); - at::Tensor atOut = impl::aten::buildATen(out); - at::pow_out(atOut, atInput, atExponent); + auto atInput = impl::aten::buildATen(input); + auto atExponent = impl::aten::buildATen(exponent); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(pow_out, atOut, atInput, atExponent); return diopiSuccess; } diopiError_t diopiPowInpTensor(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t exponent) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atExponent = impl::aten::buildATen(exponent); - atInput.pow_(atExponent); + auto atInput = impl::aten::buildATen(input); + auto atExponent = impl::aten::buildATen(exponent); + CALL_ATEN_CUDA_FUNC(pow_, atInput, atExponent); return diopiSuccess; } @@ -1104,21 +1124,21 @@ diopiError_t diopiPowInpTensor(diopiContextHandle_t ctx, diopiTensorHandle_t inp diopiError_t diopiAdd(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other, const diopiScalar_t* alpha) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - at::Scalar atAlpha = impl::aten::buildAtScalar(alpha); - at::Tensor atOut = impl::aten::buildATen(out); - at::add_out(atOut, atInput, atOther, atAlpha); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + auto atAlpha = impl::aten::buildAtScalar(alpha); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(add_out, atOut, atInput, atOther, atAlpha); return diopiSuccess; } diopiError_t diopiAddInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other, const diopiScalar_t* alpha) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - at::Scalar atAlpha = impl::aten::buildAtScalar(alpha); - atInput.add_(atOther, atAlpha); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + auto atAlpha = impl::aten::buildAtScalar(alpha); + CALL_ATEN_CUDA_FUNC(add_, atInput, atOther, atAlpha); return diopiSuccess; } @@ -1126,21 +1146,21 @@ diopiError_t diopiAddInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, di diopiError_t diopiAddScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other, const diopiScalar_t* alpha) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - at::Scalar atAlpha = impl::aten::buildAtScalar(alpha); - at::Tensor atOut = impl::aten::buildATen(out); - at::add_out(atOut, atInput, c10::scalar_to_tensor(atOther), atAlpha); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + auto atAlpha = impl::aten::buildAtScalar(alpha); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(add_out, atOut, atInput, c10::scalar_to_tensor(atOther), atAlpha); return diopiSuccess; } diopiError_t diopiAddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other, const diopiScalar_t* alpha) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - at::Scalar atAlpha = impl::aten::buildAtScalar(alpha); - atInput.add_(atOther, atAlpha); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + auto atAlpha = impl::aten::buildAtScalar(alpha); + CALL_ATEN_CUDA_FUNC(add_, atInput, c10::scalar_to_tensor(atOther), atAlpha); return diopiSuccess; } @@ -1148,21 +1168,21 @@ diopiError_t diopiAddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t inp diopiError_t diopiSub(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other, const diopiScalar_t* alpha) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - at::Scalar atAlpha = impl::aten::buildAtScalar(alpha); - at::Tensor atOut = impl::aten::buildATen(out); - at::sub_out(atOut, atInput, atOther, atAlpha); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + auto atAlpha = impl::aten::buildAtScalar(alpha); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(sub_out, atOut, atInput, atOther, atAlpha); return diopiSuccess; } diopiError_t diopiSubInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other, const diopiScalar_t* alpha) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - at::Scalar atAlpha = impl::aten::buildAtScalar(alpha); - atInput.sub_(atOther, atAlpha); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + auto atAlpha = impl::aten::buildAtScalar(alpha); + CALL_ATEN_CUDA_FUNC(sub_, atInput, atOther, atAlpha); return diopiSuccess; } @@ -1170,408 +1190,408 @@ diopiError_t diopiSubInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, di diopiError_t diopiSubScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other, const diopiScalar_t* alpha) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - at::Scalar atAlpha = impl::aten::buildAtScalar(alpha); - at::Tensor atOut = impl::aten::buildATen(out); - at::sub_out(atOut, atInput, c10::scalar_to_tensor(atOther), atAlpha); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + auto atAlpha = impl::aten::buildAtScalar(alpha); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(sub_out, atOut, atInput, c10::scalar_to_tensor(atOther), atAlpha); return diopiSuccess; } diopiError_t diopiSubInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other, const diopiScalar_t* alpha) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - at::Scalar atAlpha = impl::aten::buildAtScalar(alpha); - atInput.sub_(atOther, atAlpha); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + auto atAlpha = impl::aten::buildAtScalar(alpha); + CALL_ATEN_CUDA_FUNC(sub_, atInput, c10::scalar_to_tensor(atOther), atAlpha); return diopiSuccess; } diopiError_t diopiMul(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::mul_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(mul_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiMulInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - atInput.mul_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + CALL_ATEN_CUDA_FUNC(mul_, atInput, atOther); return diopiSuccess; } diopiError_t diopiMulScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::mul_out(atOut, atInput, c10::scalar_to_tensor(atOther)); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(mul_out, atOut, atInput, c10::scalar_to_tensor(atOther)); return diopiSuccess; } diopiError_t diopiMulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.mul_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + CALL_ATEN_CUDA_FUNC(mul_, atInput, c10::scalar_to_tensor(atOther)); return diopiSuccess; } diopiError_t diopiGe(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::ge_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(ge_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiGeInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - atInput.ge_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + CALL_ATEN_CUDA_FUNC(ge_, atInput, atOther); return diopiSuccess; } diopiError_t diopiGeScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::ge_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(ge_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiGeInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.ge_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + CALL_ATEN_CUDA_FUNC(ge_, atInput, atOther); return diopiSuccess; } diopiError_t diopiGt(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::gt_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(gt_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiGtInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - atInput.gt_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + CALL_ATEN_CUDA_FUNC(gt_, atInput, atOther); return diopiSuccess; } diopiError_t diopiGtScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::gt_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(gt_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiGtInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.gt_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + CALL_ATEN_CUDA_FUNC(gt_, atInput, atOther); return diopiSuccess; } diopiError_t diopiLe(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::le_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(le_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiLeInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - atInput.le_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + CALL_ATEN_CUDA_FUNC(le_, atInput, atOther); return diopiSuccess; } diopiError_t diopiLeScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::le_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(le_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiLeInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.le_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + CALL_ATEN_CUDA_FUNC(le_, atInput, atOther); return diopiSuccess; } diopiError_t diopiLt(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::lt_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(lt_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiLtInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - atInput.lt_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + CALL_ATEN_CUDA_FUNC(lt_, atInput, atOther); return diopiSuccess; } diopiError_t diopiLtScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::lt_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(lt_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiLtInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.lt_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + CALL_ATEN_CUDA_FUNC(lt_, atInput, atOther); return diopiSuccess; } diopiError_t diopiEq(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::eq_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(eq_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiEqInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - atInput.eq_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + CALL_ATEN_CUDA_FUNC(eq_, atInput, atOther); return diopiSuccess; } diopiError_t diopiEqScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::eq_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(eq_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiEqInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.eq_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + CALL_ATEN_CUDA_FUNC(eq_, atInput, atOther); return diopiSuccess; } diopiError_t diopiNe(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::ne_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(ne_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiNeInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - atInput.ne_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + CALL_ATEN_CUDA_FUNC(ne_, atInput, atOther); return diopiSuccess; } diopiError_t diopiNeScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::ne_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(ne_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiNeInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.ne_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + CALL_ATEN_CUDA_FUNC(ne_, atInput, atOther); return diopiSuccess; } diopiError_t diopiBitwiseAnd(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::bitwise_and_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(bitwise_and_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiBitwiseAndInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - atInput.bitwise_and_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + CALL_ATEN_CUDA_FUNC(bitwise_and_, atInput, atOther); return diopiSuccess; } diopiError_t diopiBitwiseAndScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::bitwise_and_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(bitwise_and_out, atOut, atInput, c10::scalar_to_tensor(atOther)); return diopiSuccess; } diopiError_t diopiBitwiseAndInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.bitwise_and_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + CALL_ATEN_CUDA_FUNC(bitwise_and_, atInput, c10::scalar_to_tensor(atOther)); return diopiSuccess; } diopiError_t diopiBitwiseOr(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::bitwise_or_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(bitwise_or_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiBitwiseOrInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - atInput.bitwise_or_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + CALL_ATEN_CUDA_FUNC(bitwise_or_, atInput, atOther); return diopiSuccess; } diopiError_t diopiBitwiseOrScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::bitwise_or_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(bitwise_or_out, atOut, atInput, c10::scalar_to_tensor(atOther)); return diopiSuccess; } diopiError_t diopiBitwiseOrInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atOther = impl::aten::buildAtScalar(other); - atInput.bitwise_or_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildAtScalar(other); + CALL_ATEN_CUDA_FUNC(bitwise_or_, atInput, c10::scalar_to_tensor(atOther)); return diopiSuccess; } diopiError_t diopiLogicalAnd(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::logical_and_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(logical_and_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiLogicalAndInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - atInput.logical_and_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + CALL_ATEN_CUDA_FUNC(logical_and_out, atInput, atInput, atOther); return diopiSuccess; } diopiError_t diopiLogicalOr(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - at::Tensor atOut = impl::aten::buildATen(out); - at::logical_or_out(atOut, atInput, atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(logical_or_out, atOut, atInput, atOther); return diopiSuccess; } diopiError_t diopiLogicalOrInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOther = impl::aten::buildATen(other); - atInput.logical_or_(atOther); + auto atInput = impl::aten::buildATen(input); + auto atOther = impl::aten::buildATen(other); + CALL_ATEN_CUDA_FUNC(logical_or_out, atInput, atInput, atOther); return diopiSuccess; } diopiError_t diopiClampInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* min, const diopiScalar_t* max) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); c10::optional atMin = c10::optional(); if (min != nullptr) { atMin = impl::aten::buildAtScalar(min); @@ -1588,7 +1608,7 @@ diopiError_t diopiClampInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t i diopiError_t diopiClampScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* min, const diopiScalar_t* max) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); c10::optional atMin = c10::optional(); if (min != nullptr) { atMin = impl::aten::buildAtScalar(min); @@ -1597,16 +1617,16 @@ diopiError_t diopiClampScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, if (max != nullptr) { atMax = impl::aten::buildAtScalar(max); } - at::Tensor atOut = impl::aten::buildATen(out); - at::clamp_out(atOut, atInput, atMin, atMax); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(clamp_out, atOut, atInput, atMin, atMax); return diopiSuccess; } diopiError_t diopiClampMaxInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* max) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atMax = impl::aten::buildAtScalar(max); + auto atInput = impl::aten::buildATen(input); + auto atMax = impl::aten::buildAtScalar(max); at::clamp_max_(atInput, atMax); return diopiSuccess; @@ -1614,10 +1634,10 @@ diopiError_t diopiClampMaxInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_ diopiError_t diopiClampMaxScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* max) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atMax = impl::aten::buildAtScalar(max); - at::Tensor atOut = impl::aten::buildATen(out); - at::clamp_max_out(atOut, atInput, atMax); + auto atInput = impl::aten::buildATen(input); + auto atMax = impl::aten::buildAtScalar(max); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(clamp_max_out, atOut, atInput, atMax); return diopiSuccess; } @@ -1625,15 +1645,9 @@ diopiError_t diopiClampMaxScalar(diopiContextHandle_t ctx, diopiTensorHandle_t o #if TORCH_MM_VERSION > TORCH_1_9_MM_VERSION diopiError_t diopiClampInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t min, diopiConstTensorHandle_t max) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - c10::optional atMin = c10::optional(); - if (min != nullptr) { - atMin = impl::aten::buildATen(min); - } - c10::optional atMax = c10::optional(); - if (max != nullptr) { - atMax = impl::aten::buildATen(max); - } + auto atInput = impl::aten::buildATen(input); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atMin, min); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atMax, max); at::clamp_(atInput, atMin, atMax); return diopiSuccess; @@ -1642,25 +1656,19 @@ diopiError_t diopiClampInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiError_t diopiClamp(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t min, diopiConstTensorHandle_t max) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - c10::optional atMin = c10::optional(); - if (min != nullptr) { - atMin = impl::aten::buildATen(min); - } - c10::optional atMax = c10::optional(); - if (max != nullptr) { - atMax = impl::aten::buildATen(max); - } - at::Tensor atOut = impl::aten::buildATen(out); - at::clamp_out(atOut, atInput, atMin, atMax); + auto atInput = impl::aten::buildATen(input); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atMin, min); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atMax, max); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(clamp_out, atOut, atInput, atMin, atMax); return diopiSuccess; } diopiError_t diopiClampMaxInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t max) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atMax = impl::aten::buildATen(max); + auto atInput = impl::aten::buildATen(input); + auto atMax = impl::aten::buildATen(max); at::clamp_max_(atInput, atMax); return diopiSuccess; @@ -1668,18 +1676,18 @@ diopiError_t diopiClampMaxInp(diopiContextHandle_t ctx, diopiTensorHandle_t inpu diopiError_t diopiClampMax(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t max) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atMax = impl::aten::buildATen(max); - at::Tensor atOut = impl::aten::buildATen(out); - at::clamp_max_out(atOut, atInput, atMax); + auto atInput = impl::aten::buildATen(input); + auto atMax = impl::aten::buildATen(max); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(clamp_max_out, atOut, atInput, atMax); return diopiSuccess; } diopiError_t diopiClampMinInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t min) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atMin = impl::aten::buildATen(min); + auto atInput = impl::aten::buildATen(input); + auto atMin = impl::aten::buildATen(min); at::clamp_(atInput, atMin); return diopiSuccess; @@ -1687,10 +1695,10 @@ diopiError_t diopiClampMinInp(diopiContextHandle_t ctx, diopiTensorHandle_t inpu diopiError_t diopiClampMin(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t min) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atMin = impl::aten::buildATen(min); - at::Tensor atOut = impl::aten::buildATen(out); - at::clamp_out(atOut, atInput, atMin); + auto atInput = impl::aten::buildATen(input); + auto atMin = impl::aten::buildATen(min); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(clamp_out, atOut, atInput, atMin); return diopiSuccess; } @@ -1698,8 +1706,8 @@ diopiError_t diopiClampMin(diopiContextHandle_t ctx, diopiTensorHandle_t out, di diopiError_t diopiClampMinInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* min) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atMin = impl::aten::buildAtScalar(min); + auto atInput = impl::aten::buildATen(input); + auto atMin = impl::aten::buildAtScalar(min); at::clamp_(atInput, atMin); return diopiSuccess; @@ -1707,18 +1715,18 @@ diopiError_t diopiClampMinInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_ diopiError_t diopiClampMinScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* min) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atMin = impl::aten::buildAtScalar(min); - at::Tensor atOut = impl::aten::buildATen(out); - at::clamp_out(atOut, atInput, atMin); + auto atInput = impl::aten::buildATen(input); + auto atMin = impl::aten::buildAtScalar(min); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(clamp_out, atOut, atInput, atMin); return diopiSuccess; } diopiError_t diopiFill(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* value) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Scalar atValue = impl::aten::buildAtScalar(value); + auto atInput = impl::aten::buildATen(input); + auto atValue = impl::aten::buildAtScalar(value); at::fill_(atInput, atValue); return diopiSuccess; @@ -1726,17 +1734,17 @@ diopiError_t diopiFill(diopiContextHandle_t ctx, diopiTensorHandle_t input, cons diopiError_t diopiAdaptiveAvgPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t output_size) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); auto atOutSize = impl::aten::buildAtIntArray(output_size); - at::Tensor atOut = impl::aten::buildATen(out); - at::adaptive_avg_pool2d_out(atOut, atInput, atOutSize); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(adaptive_avg_pool2d_out, atOut, atInput, atOutSize); return diopiSuccess; } diopiError_t diopiAdaptiveMaxPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t output_size) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); auto atOutSize = impl::aten::buildAtIntArray(output_size); auto atOuts = at::adaptive_max_pool2d(atInput, atOutSize); impl::aten::updateATen2Tensor(ctx, std::get<0>(atOuts), out); @@ -1747,11 +1755,11 @@ diopiError_t diopiAdaptiveMaxPool2d(diopiContextHandle_t ctx, diopiTensorHandle_ diopiError_t diopiAdaptiveMaxPool2dWithIndices(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t indices, diopiConstTensorHandle_t input, diopiSize_t output_size) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); auto atOutSize = impl::aten::buildAtIntArray(output_size); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atIndices = impl::aten::buildATen(indices); - at::adaptive_max_pool2d_out(atOut, atIndices, atInput, atOutSize); + auto atOut = impl::aten::buildATen(out); + auto atIndices = impl::aten::buildATen(indices); + CALL_ATEN_CUDA_FUNC(adaptive_max_pool2d_out, atOut, atIndices, atInput, atOutSize); return diopiSuccess; } @@ -1759,11 +1767,11 @@ diopiError_t diopiAdaptiveMaxPool2dWithIndices(diopiContextHandle_t ctx, diopiTe diopiError_t diopiAdaptiveMaxPool2dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiConstTensorHandle_t indices) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atGradOutput = impl::aten::buildATen(grad_output); - at::Tensor atIndices = impl::aten::buildATen(indices); - at::Tensor atGradInput = impl::aten::buildATen(grad_input); - at::adaptive_max_pool2d_backward_out(atGradInput, atGradOutput, atInput, atIndices); + auto atInput = impl::aten::buildATen(input); + auto atGradOutput = impl::aten::buildATen(grad_output); + auto atIndices = impl::aten::buildATen(indices); + auto atGradInput = impl::aten::buildATen(grad_input); + CALL_ATEN_CUDA_FUNC(adaptive_max_pool2d_backward_out, atGradInput, atGradOutput, atInput, atIndices); return diopiSuccess; } @@ -1771,13 +1779,13 @@ diopiError_t diopiAdaptiveMaxPool2dBackward(diopiContextHandle_t ctx, diopiTenso diopiError_t diopiAvgPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, bool ceil_mode, bool count_include_pad, const int64_t* divisor_override) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); c10::optional atDivisorOverride = divisor_override ? c10::optional(*divisor_override) : c10::nullopt; - at::Tensor atOut = impl::aten::buildATen(out); - at::avg_pool2d_out(atOut, atInput, atKernelSize, atStride, atPadding, ceil_mode, count_include_pad, atDivisorOverride); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(avg_pool2d_out, atOut, atInput, atKernelSize, atStride, atPadding, ceil_mode, count_include_pad, atDivisorOverride); return diopiSuccess; } @@ -1785,16 +1793,18 @@ diopiError_t diopiAvgPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, d diopiError_t diopiDropout(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t mask, diopiConstTensorHandle_t input, double p, bool train, diopiGeneratorHandle_t generator) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); if (train) { at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atMask = impl::aten::buildATen(mask); + auto atOut = impl::aten::buildATen(out); + auto atMask = impl::aten::buildATen(mask); if (atInput.numel() == atMask.numel()) { - at::_fused_dropout_out(atOut, atMask, atInput, 1 - p, gen); + auto tempOut = CALL_ATEN_CUDA_FUNC(_fused_dropout, atInput, 1 - p, gen); + at::native::copy_(atOut, std::get<0>(tempOut), true); + at::native::copy_(atMask, std::get<1>(tempOut), true); } else { - atMask.bernoulli_(1 - p, gen); - at::mul_out(atOut, atInput, atMask); + CALL_ATEN_CUDA_FUNC(bernoulli_, atMask, 1 - p, gen); + CALL_ATEN_CUDA_FUNC(mul_out, atOut, atInput, atMask); atOut.div_(1 - p); } impl::aten::updateGeneratorHandleState(ctx, gen, generator); @@ -1810,12 +1820,14 @@ diopiError_t diopiDropoutInp(diopiContextHandle_t ctx, diopiTensorHandle_t input impl::aten::setCurStream(ctx); if (train) { at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atMask = impl::aten::buildATen(mask); + auto atInput = impl::aten::buildATen(input); + auto atMask = impl::aten::buildATen(mask); if (atInput.numel() == atMask.numel()) { - at::_fused_dropout_out(atInput, atMask, atInput, 1 - p, gen); + auto tempOut = CALL_ATEN_CUDA_FUNC(_fused_dropout, atInput, 1 - p, gen); + at::native::copy_(atInput, std::get<0>(tempOut), true); + at::native::copy_(atMask, std::get<1>(tempOut), true); } else { - atMask.bernoulli_(1 - p, gen); + CALL_ATEN_CUDA_FUNC(bernoulli_, atMask, 1 - p, gen); atInput.mul_(atMask).div_(1 - p); } impl::aten::updateGeneratorHandleState(ctx, gen, generator); @@ -1827,16 +1839,17 @@ diopiError_t diopiDropoutInp(diopiContextHandle_t ctx, diopiTensorHandle_t input diopiError_t diopiMSELoss(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t target, diopiReduction_t reduction) { impl::aten::setCurStream(ctx); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atTarget = impl::aten::buildATen(target); + auto atOut = impl::aten::buildATen(out); + auto atInput = impl::aten::buildATen(input); + auto atTarget = impl::aten::buildATen(target); // Note(huqingqing): at::mse_loss_out reduce in the 0 dimension, which is different from at::mse_loss. // at::mse_loss reduce over all the dimensions. if (reduction == 0) { - at::Tensor atOut = impl::aten::buildATen(out); - at::mse_loss_out(atOut, atInput, atTarget, reduction); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(mse_loss_out, atOut, atInput, atTarget, reduction); } else { - impl::aten::invokeATenFuncRet(ctx, at::mse_loss, out, atInput, atTarget, reduction); + auto atOut = CALL_ATEN_FUNC(mse_loss, atInput, atTarget, reduction); + impl::aten::updateATen2Tensor(ctx, atOut, out); } return diopiSuccess; @@ -1845,19 +1858,19 @@ diopiError_t diopiMSELoss(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio diopiError_t diopiSigmoidFocalLoss(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t inputs, diopiConstTensorHandle_t targets, float alpha, float gamma, diopiReduction_t reduction) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(inputs); - at::Tensor atTarget = impl::aten::buildATen(targets); - at::Tensor atP = at::sigmoid(atInput); - at::Tensor atTerm1 = at::pow(1 - atP, gamma) * at::log(atP); - at::Tensor atTerm2 = at::pow(atP, gamma) * at::log(1 - atP); - at::Tensor atRes = -atTarget * atTerm1 * alpha - (1 - atTarget) * atTerm2 * (1 - alpha); - at::Tensor atOut = impl::aten::buildATen(out); + auto atInput = impl::aten::buildATen(inputs); + auto atTarget = impl::aten::buildATen(targets); + auto atP = at::sigmoid(atInput); + auto atTerm1 = at::pow(1 - atP, gamma) * at::log(atP); + auto atTerm2 = at::pow(atP, gamma) * at::log(1 - atP); + auto atRes = -atTarget * atTerm1 * alpha - (1 - atTarget) * atTerm2 * (1 - alpha); + auto atOut = impl::aten::buildATen(out); if (reduction == 0) { impl::aten::updateATen2Tensor(ctx, atRes, out); } else if (reduction == 1) { - at::mean_out(atOut, atRes, impl::aten::getSequence(atRes.dim())); + CALL_ATEN_CUDA_FUNC(mean_out, atOut, atRes, impl::aten::getSequence(atRes.dim())); } else if (reduction == 2) { - at::sum_out(atOut, atRes, impl::aten::getSequence(atRes.dim())); + CALL_ATEN_CUDA_FUNC(sum_out, atOut, atRes, impl::aten::getSequence(atRes.dim())); } else { NOT_SUPPORTED("sigmoid reduction type"); return diopiErrorOccurred; @@ -1870,15 +1883,16 @@ diopiError_t diopiBatchNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, d diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, diopiTensorHandle_t running_mean, diopiTensorHandle_t running_var, bool training, double momentum, double eps) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atWeight = impl::aten::buildATen(weight); - at::Tensor atBias = impl::aten::buildATen(bias); - at::Tensor atRunningMean = impl::aten::buildATen(running_mean); - at::Tensor atRunningVar = impl::aten::buildATen(running_var); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atSaveMean = impl::aten::buildATen(save_mean); - at::Tensor atSaveInvstd = impl::aten::buildATen(save_invstd); - at::native_batch_norm_out(atOut, atSaveMean, atSaveInvstd, atInput, atWeight, atBias, atRunningMean, atRunningVar, training, momentum, eps); + auto atInput = impl::aten::buildATen(input); + auto atWeight = impl::aten::buildATen(weight); + auto atBias = impl::aten::buildATen(bias); + auto atRunningMean = impl::aten::buildATen(running_mean); + auto atRunningVar = impl::aten::buildATen(running_var); + auto atOut = impl::aten::buildATen(out); + auto atSaveMean = impl::aten::buildATen(save_mean); + auto atSaveInvstd = impl::aten::buildATen(save_invstd); + CALL_ATEN_CUDA_FUNC( + native_batch_norm_out, atOut, atSaveMean, atSaveInvstd, atInput, atWeight, atBias, atRunningMean, atRunningVar, training, momentum, eps); return diopiSuccess; } @@ -1886,8 +1900,8 @@ diopiError_t diopiBatchNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, d diopiError_t diopiSlice(diopiContextHandle_t ctx, diopiTensorHandle_t null_out, diopiConstTensorHandle_t input, int64_t dim, int64_t start, int64_t end, int64_t step) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = at::slice(atInput, dim, start, end, step).contiguous(); + auto atInput = impl::aten::buildATen(input); + auto atOut = at::slice(atInput, dim, start, end, step).contiguous(); impl::aten::updateATen2Tensor(ctx, atOut, null_out); return diopiSuccess; @@ -1896,17 +1910,9 @@ diopiError_t diopiSlice(diopiContextHandle_t ctx, diopiTensorHandle_t null_out, diopiError_t diopiIndex(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t* indices, int64_t nums) { impl::aten::setCurStream(ctx); DIOPI_CHECK(out != nullptr && indices != nullptr, "Not supported: out or indices is nullptr"); - at::Tensor atInput = impl::aten::buildATen(input); - c10::List> vecIdx; - vecIdx.reserve(nums); - for (size_t i = 0; i < nums; ++i) { - if (indices[i] == nullptr) { - vecIdx.emplace_back(c10::nullopt); - } else { - vecIdx.emplace_back(impl::aten::buildATen(indices[i])); - } - } - at::Tensor atOut = at::index(atInput, vecIdx).contiguous(); + auto atInput = impl::aten::buildATen(input); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL_LIST(vecIdx, indices, nums); + auto atOut = at::index(atInput, vecIdx).contiguous(); impl::aten::buildDiopiTensor(ctx, atOut, out); return diopiSuccess; @@ -1915,13 +1921,14 @@ diopiError_t diopiIndex(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diop diopiError_t diopiBCEWithLogits(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t target, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t pos_weight, diopiReduction_t reduction) { impl::aten::setCurStream(ctx); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atTarget = impl::aten::buildATen(target); - c10::optional atWeight = weight ? c10::optional(impl::aten::buildATen(weight)) : c10::nullopt; - c10::optional atPosWeight = pos_weight ? c10::optional(impl::aten::buildATen(pos_weight)) : c10::nullopt; + auto atOut = impl::aten::buildATen(out); + auto atInput = impl::aten::buildATen(input); + auto atTarget = impl::aten::buildATen(target); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atWeight, weight); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atPosWeight, pos_weight); - at::binary_cross_entropy_with_logits_out(atOut, atInput, atTarget, atWeight, atPosWeight, reduction); + // not supported cuda dispatch yet, will supported in subsequent release. + CALL_ATEN_FUNC(binary_cross_entropy_with_logits_out, atOut, atInput, atTarget, atWeight, atPosWeight, reduction); return diopiSuccess; } @@ -1933,7 +1940,7 @@ diopiError_t diopiHardtanh(diopiContextHandle_t ctx, diopiTensorHandle_t out, di auto atMin = impl::aten::buildAtScalar(min_val); auto atMax = impl::aten::buildAtScalar(max_val); auto atOut = impl::aten::buildATen(out); - at::hardtanh_out(atOut, atInput, atMin, atMax); + CALL_ATEN_CUDA_FUNC(hardtanh_out, atOut, atInput, atMin, atMax); return diopiSuccess; } @@ -1950,15 +1957,15 @@ diopiError_t diopiHardtanhInp(diopiContextHandle_t ctx, diopiTensorHandle_t inpu diopiError_t diopiHardswish(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::hardswish_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(hardswish_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiHardswishInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::hardswish_(atInput); return diopiSuccess; } @@ -1969,7 +1976,8 @@ diopiError_t diopiHardswishBackward(diopiContextHandle_t ctx, diopiTensorHandle_ auto atGradInput = impl::aten::buildATen(grad_input); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); - at::hardswish_backward_out(atGradInput, atGradOutput, atInput); + auto tempOut = CALL_ATEN_CUDA_FUNC(hardswish_backward, atGradOutput, atInput); + at::native::copy_(atGradInput, tempOut, true); return diopiSuccess; } @@ -1981,7 +1989,7 @@ diopiError_t diopiThreshold(diopiContextHandle_t ctx, diopiTensorHandle_t out, d auto atThreshold = impl::aten::buildAtScalar(threshold); auto atValue = impl::aten::buildAtScalar(value); auto atOut = impl::aten::buildATen(out); - at::threshold_out(atOut, atInput, atThreshold, atValue); + CALL_ATEN_CUDA_FUNC(threshold_out, atOut, atInput, atThreshold, atValue); return diopiSuccess; } @@ -2001,7 +2009,7 @@ diopiError_t diopiGelu(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); c10::string_view atApproximate(approximate, strlen(approximate)); - at::gelu_out(atOut, atInput, atApproximate); + CALL_ATEN_CUDA_FUNC(gelu_out, atOut, atInput, atApproximate); return diopiSuccess; } @@ -2032,9 +2040,11 @@ diopiError_t diopiNLLLoss(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio } if (dim >= 3) { - at::nll_loss2d_out(atOut, atInput, atTarget, atWeight, reduction, ignore_index); + at::Tensor total_weight = at::empty({0}, atInput.options()); + CALL_ATEN_CUDA_FUNC(nll_loss2d_forward_out, atOut, total_weight, atInput, atTarget, atWeight, reduction, ignore_index); } else { - at::nll_loss_out(atOut, atInput, atTarget, atWeight, reduction, ignore_index); + at::Tensor total_weight = at::empty({0}, atInput.options()); + CALL_ATEN_CUDA_FUNC(nll_loss_forward_out, atOut, total_weight, atInput, atTarget, atWeight, reduction, ignore_index); } return diopiSuccess; @@ -2044,9 +2054,10 @@ diopiError_t diopiSliceBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gr int64_t dim, int64_t start, int64_t end, int64_t step) { impl::aten::setCurStream(ctx); at::IntArrayRef atInputSizes = impl::aten::buildAtIntArray(input_sizes); - at::Tensor atGradOutput = impl::aten::buildATen(grad_output); - at::Tensor atGradInput = impl::aten::buildATen(grad_input); - at::slice_backward_out(atGradInput, atGradOutput, atInputSizes, dim, start, end, step); + auto atGradOutput = impl::aten::buildATen(grad_output); + auto atGradInput = impl::aten::buildATen(grad_input); + // not supported cuda dispatch yet, will supported in subsequent release. + CALL_ATEN_FUNC(slice_backward_out, atGradInput, atGradOutput, atInputSizes, dim, start, end, step); return diopiSuccess; } @@ -2055,20 +2066,12 @@ diopiError_t diopiIndexBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gr diopiConstTensorHandle_t* indices, int64_t nums, diopiConstTensorHandle_t grad) { impl::aten::setCurStream(ctx); DIOPI_CHECK_PTR(indices); - at::Tensor atZerosInput = impl::aten::buildATen(zeros_like_input); - at::Tensor atGrad = impl::aten::buildATen(grad); - at::Tensor atGradInput = impl::aten::buildATen(grad_input); - c10::List> vecIdx; - vecIdx.reserve(nums); - for (size_t i = 0; i < nums; ++i) { - if (indices[i] == nullptr) { - vecIdx.emplace_back(c10::nullopt); - } else { - at::Tensor atIndex = impl::aten::buildATen(indices[i]); - vecIdx.emplace_back(atIndex); - } - } - at::_index_put_impl_out(atGradInput, atZerosInput, vecIdx, atGrad, true, true); + auto atZerosInput = impl::aten::buildATen(zeros_like_input); + auto atGrad = impl::aten::buildATen(grad); + auto atGradInput = impl::aten::buildATen(grad_input); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL_LIST(vecIdx, indices, nums); + at::native::copy_(atGradInput, atZerosInput, true); + CALL_ATEN_CUDA_FUNC(_index_put_impl_, atGradInput, vecIdx, atGrad, true, true); return diopiSuccess; } @@ -2077,10 +2080,10 @@ diopiError_t diopiSigmoidFocalLossBackward(diopiContextHandle_t ctx, diopiTensor diopiConstTensorHandle_t target, diopiTensorHandle_t grad_input, float gamma, float alpha, diopiReduction_t reduction) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atTarget = impl::aten::buildATen(target); - at::Tensor atGrad = impl::aten::buildATen(grad_output); - at::Tensor atGradOutput = at::empty_like(atInput); + auto atInput = impl::aten::buildATen(input); + auto atTarget = impl::aten::buildATen(target); + auto atGrad = impl::aten::buildATen(grad_output); + auto atGradOutput = at::empty_like(atInput); if (reduction == 1) { atGradOutput.copy_(atGrad.expand_as(atInput) / atInput.numel()); } else if (reduction == 2) { @@ -2092,12 +2095,12 @@ diopiError_t diopiSigmoidFocalLossBackward(diopiContextHandle_t ctx, diopiTensor return diopiErrorOccurred; } - at::Tensor atP = at::sigmoid(atInput); + auto atP = at::sigmoid(atInput); // (1-p)**g * (1 - p - g*p*log(p)) - at::Tensor atTerm1 = at::pow(1 - atP, gamma) * (1 - atP - gamma * atP * at::log(at::clamp_min(atP, FLT_MIN))); + auto atTerm1 = at::pow(1 - atP, gamma) * (1 - atP - gamma * atP * at::log(at::clamp_min(atP, FLT_MIN))); // (p**g) * (g*(1-p)*log(1-p) - p) - at::Tensor atTerm2 = at::pow(atP, gamma) * (gamma * (1 - atP) * at::log(at::clamp_min(1 - atP, FLT_MIN)) - atP); - at::Tensor atRes = -atTarget * atTerm1 * alpha - (1 - atTarget) * atTerm2 * (1 - alpha); + auto atTerm2 = at::pow(atP, gamma) * (gamma * (1 - atP) * at::log(at::clamp_min(1 - atP, FLT_MIN)) - atP); + auto atRes = -atTarget * atTerm1 * alpha - (1 - atTarget) * atTerm2 * (1 - alpha); atGradOutput *= atRes; impl::aten::updateATen2Tensor(ctx, atGradOutput, grad_input); @@ -2131,8 +2134,8 @@ diopiError_t diopiConvolution2dBackward(diopiContextHandle_t ctx, diopiTensorHan #ifdef USE_HIP diopi_tensor_list vecOut = {grad_input, grad_weight}; auto grad_input_mask = std::array{true, true, false}; - impl::aten::invokeATenFuncRet( - ctx, at::miopen_convolution_backward, vecOut, atInput, atGrad, atWeight, atPadding, atStride, atDilation, groups, false, false, grad_input_mask); + auto atOut = CALL_ATEN_FUNC(miopen_convolution_backward, atInput, atGrad, atWeight, atPadding, atStride, atDilation, groups, false, false, grad_input_mask); + updateATen2Tensor(ctx, atOut, vecOut); if (bias_sizes && grad_bias) { auto atGradBias = impl::aten::buildATen(grad_bias); at::Tensor atTmp = atGrad; @@ -2141,7 +2144,7 @@ diopiError_t diopiConvolution2dBackward(diopiContextHandle_t ctx, diopiTensorHan atTmp = at::sum(atTmp, -1, false); size -= 1; } - at::sum_out(atGradBias, atTmp, 0, false); + CALL_ATEN_CUDA_FUNC(sum_out, atGradBias, atTmp, 0, false); } #else std::vector outputPadding(padding.len, 0); @@ -2152,20 +2155,11 @@ diopiError_t diopiConvolution2dBackward(diopiContextHandle_t ctx, diopiTensorHan auto atGradInput = impl::aten::buildATen(grad_input); auto atGradWeight = impl::aten::buildATen(grad_weight); auto atGradBias = impl::aten::buildATen(grad_bias); - at::convolution_backward_out(atGradInput, - atGradWeight, - atGradBias, - atGrad, - atInput, - atWeight, - atBiasSizes, - atStride, - atPadding, - atDilation, - false, - outputPadding, - groups, - {true, true, true}); + auto tempOut = CALL_ATEN_CUDA_FUNC( + convolution_backward, atGrad, atInput, atWeight, atBiasSizes, atStride, atPadding, atDilation, false, outputPadding, groups, {true, true, true}); + at::native::copy_(atGradInput, std::get<0>(tempOut), true); + at::native::copy_(atGradWeight, std::get<1>(tempOut), true); + at::native::copy_(atGradBias, std::get<2>(tempOut), true); } else { auto results = at::convolution_backward( atGrad, atInput, atWeight, c10::nullopt, atStride, atPadding, atDilation, false, outputPadding, groups, {true, true, false}); @@ -2179,7 +2173,7 @@ diopiError_t diopiConvolution2dBackward(diopiContextHandle_t ctx, diopiTensorHan atTmp = at::sum(atTmp, -1, false); size -= 1; } - at::sum_out(atGradBias, atTmp, 0, false); + CALL_ATEN_CUDA_FUNC(sum_out, atGradBias, atTmp, 0, false); } } #endif @@ -2201,20 +2195,19 @@ diopiError_t diopiConvTranspose2dBackward(diopiContextHandle_t ctx, diopiTensorH auto atDilation = impl::aten::buildAtIntArray(dilation); #ifdef USE_HIP auto grad_input_mask = std::array{true, true, false}; - impl::aten::invokeATenFuncRet(ctx, - at::miopen_convolution_transpose_backward, - vecOut, - atInput, - atGrad, - atWeight, - atPadding, - atOutputPadding, - atStride, - atDilation, - groups, - false, - false, - grad_input_mask); + auto atOut = CALL_ATEN_FUNC(miopen_convolution_transpose_backward, + atInput, + atGrad, + atWeight, + atPadding, + atOutputPadding, + atStride, + atDilation, + groups, + false, + false, + grad_input_mask); + updateATen2Tensor(ctx, atOut, vecOut); if (bias_sizes != nullptr && grad_bias != nullptr) { auto atGradBias = impl::aten::buildATen(grad_bias); at::Tensor atTmp = atGrad; @@ -2223,7 +2216,7 @@ diopiError_t diopiConvTranspose2dBackward(diopiContextHandle_t ctx, diopiTensorH atTmp = at::sum(atTmp, -1, false); size -= 1; } - at::sum_out(atGradBias, atTmp, 0, false); + CALL_ATEN_CUDA_FUNC(sum_out, atGradBias, atTmp, 0, false); } #else if (grad_input && grad_weight && grad_bias && bias_sizes) { @@ -2231,20 +2224,11 @@ diopiError_t diopiConvTranspose2dBackward(diopiContextHandle_t ctx, diopiTensorH auto atGradInput = impl::aten::buildATen(grad_input); auto atGradWeight = impl::aten::buildATen(grad_weight); auto atGradBias = impl::aten::buildATen(grad_bias); - at::convolution_backward_out(atGradInput, - atGradWeight, - atGradBias, - atGrad, - atInput, - atWeight, - atBiasSizes, - atStride, - atPadding, - atDilation, - true, - atOutputPadding, - groups, - {true, true, true}); + auto tempOut = CALL_ATEN_CUDA_FUNC( + convolution_backward, atGrad, atInput, atWeight, atBiasSizes, atStride, atPadding, atDilation, true, atOutputPadding, groups, {true, true, true}); + at::native::copy_(atGradInput, std::get<0>(tempOut), true); + at::native::copy_(atGradWeight, std::get<1>(tempOut), true); + at::native::copy_(atGradBias, std::get<2>(tempOut), true); } else { auto grad_inputs = at::convolution_backward( atGrad, atInput, atWeight, c10::nullopt, atStride, atPadding, atDilation, true, atOutputPadding, groups, {true, true, false}); @@ -2258,7 +2242,7 @@ diopiError_t diopiConvTranspose2dBackward(diopiContextHandle_t ctx, diopiTensorH atTmp = at::sum(atTmp, -1, false); size -= 1; } - at::sum_out(atGradBias, atTmp, 0, false); + CALL_ATEN_CUDA_FUNC(sum_out, atGradBias, atTmp, 0, false); } } #endif @@ -2271,7 +2255,8 @@ diopiError_t diopiEmbeddingBackward(diopiContextHandle_t ctx, diopiTensorHandle_ impl::aten::setCurStream(ctx); auto atGrad = impl::aten::buildATen(grad); auto atIndices = impl::aten::buildATen(indices); - impl::aten::invokeATenFuncRet(ctx, at::embedding_backward, out, atGrad, atIndices, numWeights, paddingIdx, scaleGradByFreq, sparse); + auto atOut = CALL_ATEN_FUNC(embedding_backward, atGrad, atIndices, numWeights, paddingIdx, scaleGradByFreq, sparse); + impl::aten::updateATen2Tensor(ctx, atOut, out); return diopiSuccess; } @@ -2282,7 +2267,7 @@ diopiError_t diopiAdaptiveAvgPool2dBackward(diopiContextHandle_t ctx, diopiTenso auto atGradInput = impl::aten::buildATen(grad_input); auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); - at::_adaptive_avg_pool2d_backward_out(atGradInput, atGradOutput, atInput); + CALL_ATEN_FUNC(_adaptive_avg_pool2d_backward_out, atGradInput, atGradOutput, atInput); return diopiSuccess; } @@ -2294,7 +2279,7 @@ diopiError_t diopiLeakyReluBackward(diopiContextHandle_t ctx, diopiTensorHandle_ auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); auto atSlope = impl::aten::buildAtScalar(negative_slope); - at::leaky_relu_backward_out(atGradInput, atGradOutput, atInput, atSlope, input_is_result); + CALL_ATEN_CUDA_FUNC(leaky_relu_backward_out, atGradInput, atGradOutput, atInput, atSlope, input_is_result); return diopiSuccess; } @@ -2307,7 +2292,7 @@ diopiError_t diopiHardtanhBackward(diopiContextHandle_t ctx, diopiTensorHandle_t auto atMin = impl::aten::buildAtScalar(min_val); auto atMax = impl::aten::buildAtScalar(max_val); auto atGradInput = impl::aten::buildATen(grad_input); - at::hardtanh_backward_out(atGradInput, atGradOutput, atInput, atMin, atMax); + CALL_ATEN_CUDA_FUNC(hardtanh_backward_out, atGradInput, atGradOutput, atInput, atMin, atMax); return diopiSuccess; } @@ -2319,7 +2304,8 @@ diopiError_t diopiGeluBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gra auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); c10::string_view atApproximate(approximate, strlen(approximate)); - impl::aten::invokeATenFuncRet(ctx, at::gelu_backward, grad_input, atGradOutput, atInput, atApproximate); + auto atOut = CALL_ATEN_CUDA_FUNC(gelu_backward, atGradOutput, atInput, atApproximate); + impl::aten::updateATen2Tensor(ctx, atOut, grad_input); return diopiSuccess; } @@ -2335,7 +2321,8 @@ diopiError_t diopiAvgPool2dBackward(diopiContextHandle_t ctx, diopiTensorHandle_ at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); c10::optional atDivisorOverride = divisor_override ? c10::optional(*divisor_override) : c10::nullopt; auto atGradInput = impl::aten::buildATen(grad_input); - at::avg_pool2d_backward_out(atGradInput, atGradOutput, atInput, atKernelSize, atStride, atPadding, ceil_mode, count_include_pad, atDivisorOverride); + CALL_ATEN_CUDA_FUNC( + avg_pool2d_backward_out, atGradInput, atGradOutput, atInput, atKernelSize, atStride, atPadding, ceil_mode, count_include_pad, atDivisorOverride); return diopiSuccess; } @@ -2347,7 +2334,7 @@ diopiError_t diopiMSELossBackward(diopiContextHandle_t ctx, diopiTensorHandle_t auto atInput = impl::aten::buildATen(input); auto atTarget = impl::aten::buildATen(target); auto atGradInput = impl::aten::buildATen(grad_input); - at::mse_loss_backward_out(atGradInput, atGradOutput, atInput, atTarget, reduction); + CALL_ATEN_CUDA_FUNC(mse_loss_backward_out, atGradInput, atGradOutput, atInput, atTarget, reduction); return diopiSuccess; } @@ -2357,7 +2344,7 @@ diopiError_t diopiTanhBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gra auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); auto atGradInput = impl::aten::buildATen(grad_input); - at::tanh_backward_out(atGradInput, atGradOutput, atInput); + CALL_ATEN_CUDA_FUNC(tanh_backward_out, atGradInput, atGradOutput, atInput); return diopiSuccess; } @@ -2368,7 +2355,8 @@ diopiError_t diopiIndexSelectBackward(diopiContextHandle_t ctx, diopiTensorHandl auto atGrad = impl::aten::buildATen(grad); at::IntArrayRef atInputSize = impl::aten::buildAtIntArray(input_sizes); auto atIndex = impl::aten::buildATen(index); - impl::aten::invokeATenFuncRet(ctx, at::index_select_backward, grad_input, atGrad, atInputSize, dim, atIndex); + auto atOut = CALL_ATEN_FUNC(index_select_backward, atGrad, atInputSize, dim, atIndex); + impl::aten::updateATen2Tensor(ctx, atOut, grad_input); return diopiSuccess; } @@ -2378,7 +2366,8 @@ diopiError_t diopiSelectBackward(diopiContextHandle_t ctx, diopiTensorHandle_t g impl::aten::setCurStream(ctx); auto atGradOutput = impl::aten::buildATen(grad_output); at::IntArrayRef atInputSize = impl::aten::buildAtIntArray(input_sizes); - impl::aten::invokeATenFuncRet(ctx, at::select_backward, grad_input, atGradOutput, atInputSize, dim, index); + auto atOut = CALL_ATEN_FUNC(select_backward, atGradOutput, atInputSize, dim, index); + impl::aten::updateATen2Tensor(ctx, atOut, grad_input); return diopiSuccess; } @@ -2390,7 +2379,7 @@ diopiError_t diopiSoftmaxBackward(diopiContextHandle_t ctx, diopiTensorHandle_t auto atGradOutput = impl::aten::buildATen(grad_output); auto atOutput = impl::aten::buildATen(output); // TODO(huqingqing): use default type instead - at::_softmax_backward_data_out(atGradInput, atGradOutput, atOutput, dim, atOutput.scalar_type()); + CALL_ATEN_CUDA_FUNC(_softmax_backward_data_out, atGradInput, atGradOutput, atOutput, dim, atOutput.scalar_type()); return diopiSuccess; } @@ -2402,7 +2391,7 @@ diopiError_t diopiLogSoftmaxBackward(diopiContextHandle_t ctx, diopiTensorHandle auto atGradOutput = impl::aten::buildATen(grad_output); auto atOutput = impl::aten::buildATen(output); // TODO(huqingqing): use default type instead - at::_log_softmax_backward_data_out(atGradInput, atGradOutput, atOutput, dim, atOutput.scalar_type()); + CALL_ATEN_CUDA_FUNC(_log_softmax_backward_data_out, atGradInput, atGradOutput, atOutput, dim, atOutput.scalar_type()); return diopiSuccess; } @@ -2413,7 +2402,7 @@ diopiError_t diopiSigmoidBackward(diopiContextHandle_t ctx, diopiTensorHandle_t auto atGradOutput = impl::aten::buildATen(grad_output); auto atOutput = impl::aten::buildATen(output); auto atGradInput = impl::aten::buildATen(grad_input); - at::sigmoid_backward_out(atGradInput, atGradOutput, atOutput); + CALL_ATEN_CUDA_FUNC(sigmoid_backward_out, atGradInput, atGradOutput, atOutput); return diopiSuccess; } @@ -2425,7 +2414,7 @@ diopiError_t diopiThresholdBackward(diopiContextHandle_t ctx, diopiTensorHandle_ auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); auto atThreshold = impl::aten::buildAtScalar(threshold); - at::threshold_backward_out(atGradInput, atGradOutput, atInput, atThreshold); + CALL_ATEN_CUDA_FUNC(threshold_backward_out, atGradInput, atGradOutput, atInput, atThreshold); return diopiSuccess; } @@ -2486,7 +2475,8 @@ diopiError_t diopiMaxPool2dBackward(diopiContextHandle_t ctx, diopiTensorHandle_ at::IntArrayRef atDilation = impl::aten::buildAtIntArray(dilation); auto atIndices = impl::aten::buildATen(indices); auto atGradInput = impl::aten::buildATen(grad_input); - at::max_pool2d_with_indices_backward_out(atGradInput, atGradOutput, atInput, atKernelSize, atStride, atPadding, atDilation, ceil_mode, atIndices); + CALL_ATEN_CUDA_FUNC( + max_pool2d_with_indices_backward_out, atGradInput, atGradOutput, atInput, atKernelSize, atStride, atPadding, atDilation, ceil_mode, atIndices); return diopiSuccess; } @@ -2500,10 +2490,10 @@ diopiError_t diopiBatchNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_ auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); auto atWeight = impl::aten::buildATen(weight); - c10::optional atRunningMean = running_mean ? c10::optional(impl::aten::buildATen(running_mean)) : c10::nullopt; - c10::optional atRunningVar = running_var ? c10::optional(impl::aten::buildATen(running_var)) : c10::nullopt; - c10::optional atSaveMean = save_mean ? c10::optional(impl::aten::buildATen(save_mean)) : c10::nullopt; - c10::optional atSaveVar = save_invstd ? c10::optional(impl::aten::buildATen(save_invstd)) : c10::nullopt; + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atRunningMean, running_mean); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atRunningVar, running_var); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atSaveMean, save_mean); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atSaveVar, save_invstd); if (grad_input && grad_weight && grad_bias) { auto grad_input_mask = std::array{true, true, true}; @@ -2547,7 +2537,7 @@ diopiError_t diopiArange(diopiContextHandle_t ctx, diopiTensorHandle_t out, cons auto atStart = impl::aten::buildAtScalar(start); auto atEnd = impl::aten::buildAtScalar(end); auto atStep = impl::aten::buildAtScalar(step); - at::arange_out(atOut, atStart, atEnd, atStep); + CALL_ATEN_CUDA_FUNC(arange_out, atOut, atStart, atEnd, atStep); return diopiSuccess; } @@ -2556,7 +2546,7 @@ diopiError_t diopiRandperm(diopiContextHandle_t ctx, diopiTensorHandle_t out, in impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::randperm_out(atOut, n, gen); + CALL_ATEN_CUDA_FUNC(randperm_out, atOut, n, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; @@ -2587,7 +2577,7 @@ diopiError_t diopiBernoulliInp(diopiContextHandle_t ctx, diopiTensorHandle_t ino impl::aten::setCurStream(ctx); auto atInOut = impl::aten::buildATen(inout); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::bernoulli_out(atInOut, atInOut, gen); + CALL_ATEN_CUDA_FUNC(bernoulli_out, atInOut, atInOut, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; @@ -2598,7 +2588,7 @@ diopiError_t diopiBernoulli(diopiContextHandle_t ctx, diopiTensorHandle_t out, d auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::bernoulli_out(atOut, atInput, gen); + CALL_ATEN_CUDA_FUNC(bernoulli_out, atOut, atInput, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; @@ -2608,7 +2598,7 @@ diopiError_t diopiBernoulliScalar(diopiContextHandle_t ctx, diopiTensorHandle_t impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::native::bernoulli_(atOut, p, gen); + CALL_ATEN_CUDA_FUNC(bernoulli_, atOut, p, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; @@ -2619,7 +2609,7 @@ diopiError_t diopiNormal(diopiContextHandle_t ctx, diopiTensorHandle_t out, doub auto atOut = impl::aten::buildATen(out); auto atSize = atOut.sizes(); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::normal_out(atOut, mean, std, atSize, gen); + CALL_ATEN_FUNC(normal_out, atOut, mean, std, atSize, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; @@ -2640,7 +2630,7 @@ diopiError_t diopiNormalTensorScalar(diopiContextHandle_t ctx, diopiTensorHandle auto atOut = impl::aten::buildATen(out); auto atMean = impl::aten::buildATen(mean); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::normal_out(atOut, atMean, std, gen); + CALL_ATEN_CUDA_FUNC(normal_out, atOut, atMean, std, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; @@ -2652,7 +2642,7 @@ diopiError_t diopiNormalScalarTensor(diopiContextHandle_t ctx, diopiTensorHandle auto atOut = impl::aten::buildATen(out); auto atStd = impl::aten::buildATen(std); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::normal_out(atOut, mean, atStd, gen); + CALL_ATEN_CUDA_FUNC(normal_out, atOut, mean, atStd, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; @@ -2665,7 +2655,7 @@ diopiError_t diopiNormalTensor(diopiContextHandle_t ctx, diopiTensorHandle_t out auto atMean = impl::aten::buildATen(mean); auto atStd = impl::aten::buildATen(std); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::normal_out(atOut, atMean, atStd, gen); + CALL_ATEN_CUDA_FUNC(normal_out, atOut, atMean, atStd, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; @@ -2678,7 +2668,8 @@ diopiError_t diopiMaskedFill(diopiContextHandle_t ctx, diopiTensorHandle_t out, auto atMask = impl::aten::buildATen(mask); auto atValue = impl::aten::buildATen(value); auto atOut = impl::aten::buildATen(out); - at::masked_fill_out(atOut, atInput, atMask, atValue); + at::native::copy_(atOut, atInput, true); + CALL_ATEN_CUDA_FUNC(masked_fill_, atOut, atMask, atValue); return diopiSuccess; } @@ -2688,7 +2679,7 @@ diopiError_t diopiMaskedFillInp(diopiContextHandle_t ctx, diopiTensorHandle_t in auto atInput = impl::aten::buildATen(input); auto atMask = impl::aten::buildATen(mask); auto atValue = impl::aten::buildATen(value); - atInput.masked_fill_(atMask, atValue); + CALL_ATEN_CUDA_FUNC(masked_fill_, atInput, atMask, atValue); return diopiSuccess; } @@ -2700,7 +2691,8 @@ diopiError_t diopiMaskedFillScalar(diopiContextHandle_t ctx, diopiTensorHandle_t auto atMask = impl::aten::buildATen(mask); auto atValue = impl::aten::buildAtScalar(value); auto atOut = impl::aten::buildATen(out); - at::masked_fill_out(atOut, atInput, atMask, atValue); + // not supported cuda dispatch yet, will supported in subsequent release. + CALL_ATEN_FUNC(masked_fill_out, atOut, atInput, atMask, atValue); return diopiSuccess; } @@ -2710,7 +2702,7 @@ diopiError_t diopiMaskedFillInpScalar(diopiContextHandle_t ctx, diopiTensorHandl auto atInput = impl::aten::buildATen(input); auto atMask = impl::aten::buildATen(mask); auto atValue = impl::aten::buildAtScalar(value); - atInput.masked_fill_(atMask, atValue); + CALL_ATEN_CUDA_FUNC(masked_fill_, atInput, atMask, atValue); return diopiSuccess; } @@ -2720,8 +2712,8 @@ diopiError_t diopiMeshGrid(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, DIOPI_CHECK_PTR(outs); DIOPI_CHECK_PTR(inputs); auto outsNum = inputsNum; - auto atInputs = impl::aten::buildATenList(inputs, inputsNum); - auto atOuts = impl::aten::buildATenList(outs, outsNum); + DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputsNum); + DIOPI_IMPL_BUILD_ATEN_LIST(atOuts, outs, outsNum); atOuts = at::meshgrid(atInputs); for (int i = 0; i < outsNum; ++i) { impl::aten::updateATen2Tensor(ctx, atOuts[i].contiguous(), outs[i]); @@ -2748,7 +2740,7 @@ diopiError_t diopiAdamW(diopiContextHandle_t ctx, diopiTensorHandle_t param, dio auto bias_correction1 = 1 - pow(beta1, step); auto bias_correction2 = 1 - pow(beta2, step); if (amsgrad) { - at::maximum_out(atMaxExpAvgSq, atMaxExpAvgSq, atExpAvgSq); + CALL_ATEN_CUDA_FUNC(maximum_out, atMaxExpAvgSq, atMaxExpAvgSq, atExpAvgSq); denom = atMaxExpAvgSq.sqrt().div_(sqrt(bias_correction2)).add_(eps); } else { denom = atExpAvgSq.sqrt().div_(sqrt(bias_correction2)).add_(eps); @@ -2780,7 +2772,7 @@ diopiError_t diopiAdam(diopiContextHandle_t ctx, diopiTensorHandle_t param, diop auto bias_correction1 = 1 - pow(beta1, step); auto bias_correction2 = 1 - pow(beta2, step); if (amsgrad) { - at::maximum_out(atMaxExpAvgSq, atMaxExpAvgSq, atExpAvgSq); + CALL_ATEN_CUDA_FUNC(maximum_out, atMaxExpAvgSq, atMaxExpAvgSq, atExpAvgSq); denom = atMaxExpAvgSq.sqrt().div_(sqrt(bias_correction2)).add_(eps); } else { denom = atExpAvgSq.sqrt().div_(sqrt(bias_correction2)).add_(eps); @@ -2857,7 +2849,8 @@ diopiError_t diopiConvTranspose2d(diopiContextHandle_t ctx, diopiTensorHandle_t auto atPadding = impl::aten::buildAtIntArray(padding); auto atOutputPadding = impl::aten::buildAtIntArray(output_padding); auto atDilation = impl::aten::buildAtIntArray(dilation); - impl::aten::invokeATenFuncRet(ctx, at::conv_transpose2d, out, atInput, atWeight, atBias, atStride, atPadding, atOutputPadding, groups, atDilation); + auto atOut = CALL_ATEN_FUNC(conv_transpose2d, atInput, atWeight, atBias, atStride, atPadding, atOutputPadding, groups, atDilation); + impl::aten::updateATen2Tensor(ctx, atOut, out); return diopiSuccess; } @@ -2866,7 +2859,7 @@ diopiError_t diopiCumsum(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); - at::cumsum_out(atOut, atInput, dim); + CALL_ATEN_CUDA_FUNC(cumsum_out, atOut, atInput, dim); return diopiSuccess; } @@ -2877,7 +2870,8 @@ diopiError_t diopiCdist(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopi auto atInput1 = impl::aten::buildATen(input1); auto atInput2 = impl::aten::buildATen(input2); c10::optional atComputMode = compute_mode ? c10::optional(*compute_mode) : c10::nullopt; - impl::aten::invokeATenFuncRet(ctx, at::cdist, out, atInput1, atInput2, p, atComputMode); + auto atOut = CALL_ATEN_FUNC(cdist, atInput1, atInput2, p, atComputMode); + impl::aten::updateATen2Tensor(ctx, atOut, out); return diopiSuccess; } @@ -2890,7 +2884,7 @@ diopiError_t diopiCdistBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gr auto atInput1 = impl::aten::buildATen(input1); auto atInput2 = impl::aten::buildATen(input2); auto atCdist = impl::aten::buildATen(cdist); - at::_cdist_backward_out(atGradInput, atGradOutput, atInput1, atInput2, p, atCdist); + CALL_ATEN_FUNC(_cdist_backward_out, atGradInput, atGradOutput, atInput1, atInput2, p, atCdist); return diopiSuccess; } @@ -2899,7 +2893,7 @@ diopiError_t diopiReciprocal(diopiContextHandle_t ctx, diopiTensorHandle_t out, impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); - at::reciprocal_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(reciprocal_out, atOut, atInput); return diopiSuccess; } @@ -2916,7 +2910,7 @@ diopiError_t diopiBitwiseNot(diopiContextHandle_t ctx, diopiTensorHandle_t out, impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); - at::bitwise_not_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(bitwise_not_out, atOut, atInput); return diopiSuccess; } @@ -2924,7 +2918,7 @@ diopiError_t diopiBitwiseNot(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiError_t diopiBitwiseNotInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); - atInput.bitwise_not_(); + CALL_ATEN_CUDA_FUNC(bitwise_not_, atInput); return diopiSuccess; } @@ -2933,7 +2927,7 @@ diopiError_t diopiLogicalNot(diopiContextHandle_t ctx, diopiTensorHandle_t out, impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); - at::logical_not_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(logical_not_out, atOut, atInput); return diopiSuccess; } @@ -2941,17 +2935,17 @@ diopiError_t diopiLogicalNot(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiError_t diopiLogicalNotInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); - atInput.logical_not_(); + CALL_ATEN_CUDA_FUNC(logical_not_out, atInput, atInput); return diopiSuccess; } diopiError_t diopiArgmax(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const int64_t* dim, bool keepdim) { impl::aten::setCurStream(ctx); - at::Tensor atOut = impl::aten::buildATen(out); + auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); c10::optional atDim = dim ? c10::optional(*dim) : c10::nullopt; - at::argmax_out(atOut, atInput, atDim, keepdim); + CALL_ATEN_CUDA_FUNC(argmax_out, atOut, atInput, atDim, keepdim); return diopiSuccess; } @@ -2959,13 +2953,14 @@ diopiError_t diopiArgmax(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop diopiError_t diopiSmoothL1Loss(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t target, diopiReduction_t reduction, double beta) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atTarget = impl::aten::buildATen(target); + auto atInput = impl::aten::buildATen(input); + auto atTarget = impl::aten::buildATen(target); if (reduction == 0) { - at::Tensor atOut = impl::aten::buildATen(out); - at::smooth_l1_loss_out(atOut, atInput, atTarget, reduction, beta); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(smooth_l1_loss_out, atOut, atInput, atTarget, reduction, beta); } else { - impl::aten::invokeATenFuncRet(ctx, at::smooth_l1_loss, out, atInput, atTarget, reduction, beta); + auto atOut = CALL_ATEN_FUNC(smooth_l1_loss, atInput, atTarget, reduction, beta); + impl::aten::updateATen2Tensor(ctx, atOut, out); } return diopiSuccess; @@ -2978,7 +2973,7 @@ diopiError_t diopiSmoothL1LossBackward(diopiContextHandle_t ctx, diopiTensorHand auto atInput = impl::aten::buildATen(input); auto atTarget = impl::aten::buildATen(target); auto atGradInput = impl::aten::buildATen(grad_input); - at::smooth_l1_loss_backward_out(atGradInput, atGradOutput, atInput, atTarget, reduction, beta); + CALL_ATEN_CUDA_FUNC(smooth_l1_loss_backward_out, atGradInput, atGradOutput, atInput, atTarget, reduction, beta); return diopiSuccess; } @@ -2988,7 +2983,7 @@ diopiError_t diopiMaximum(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio auto atInput = impl::aten::buildATen(input); auto atOther = impl::aten::buildATen(other); auto atOut = impl::aten::buildATen(out); - at::maximum_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(maximum_out, atOut, atInput, atOther); return diopiSuccess; } @@ -2998,7 +2993,7 @@ diopiError_t diopiMinimum(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio auto atInput = impl::aten::buildATen(input); auto atOther = impl::aten::buildATen(other); auto atOut = impl::aten::buildATen(out); - at::minimum_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(minimum_out, atOut, atInput, atOther); return diopiSuccess; } @@ -3008,7 +3003,7 @@ diopiError_t diopiMm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiCon auto atInput = impl::aten::buildATen(input); auto atMat2 = impl::aten::buildATen(mat2); auto atOut = impl::aten::buildATen(out); - at::mm_out(atOut, atInput, atMat2); + CALL_ATEN_CUDA_FUNC(mm_out, atOut, atInput, atMat2); return diopiSuccess; } @@ -3023,7 +3018,7 @@ diopiError_t diopiConvolution3d(diopiContextHandle_t ctx, diopiTensorHandle_t ou auto atStride = impl::aten::buildAtIntArray(stride); auto atPadding = impl::aten::buildAtIntArray(padding); auto atDilation = impl::aten::buildAtIntArray(dilation); - at::convolution_out(atOut, atInput, atWeight, atBias, atStride, atPadding, atDilation, false, at::IntArrayRef(0), groups); + CALL_ATEN_FUNC(convolution_out, atOut, atInput, atWeight, atBias, atStride, atPadding, atDilation, false, at::IntArrayRef(0), groups); return diopiSuccess; } @@ -3044,8 +3039,8 @@ diopiError_t diopiConvolution3dBackward(diopiContextHandle_t ctx, diopiTensorHan diopi_tensor_list vecOut = {grad_input, grad_weight}; #ifdef USE_HIP auto grad_input_mask = std::array{true, true, false}; - impl::aten::invokeATenFuncRet( - ctx, at::miopen_convolution_backward, vecOut, atInput, atGrad, atWeight, atPadding, atStride, atDilation, groups, false, false, grad_input_mask); + auto atOut = CALL_ATEN_FUNC(miopen_convolution_backward, atInput, atGrad, atWeight, atPadding, atStride, atDilation, groups, false, false, grad_input_mask); + updateATen2Tensor(ctx, atOut, vecOut); if (bias_sizes != nullptr && grad_bias != nullptr) { auto atBias = impl::aten::buildATen(grad_bias); at::Tensor atTmp = atGrad; @@ -3065,20 +3060,11 @@ diopiError_t diopiConvolution3dBackward(diopiContextHandle_t ctx, diopiTensorHan auto atGradInput = impl::aten::buildATen(grad_input); auto atGradWeight = impl::aten::buildATen(grad_weight); auto atGradBias = impl::aten::buildATen(grad_bias); - at::convolution_backward_out(atGradInput, - atGradWeight, - atGradBias, - atGrad, - atInput, - atWeight, - atBiasSizes, - atStride, - atPadding, - atDilation, - false, - atOutputPadding, - groups, - {true, true, true}); + auto tempOut = CALL_ATEN_CUDA_FUNC( + convolution_backward, atGrad, atInput, atWeight, atBiasSizes, atStride, atPadding, atDilation, false, atOutputPadding, groups, {true, true, true}); + at::native::copy_(atGradInput, std::get<0>(tempOut), true); + at::native::copy_(atGradWeight, std::get<1>(tempOut), true); + at::native::copy_(atGradBias, std::get<2>(tempOut), true); } else { auto grad_inputs = at::convolution_backward( atGrad, atInput, atWeight, c10::nullopt, atStride, atPadding, atDilation, false, atOutputPadding, groups, {true, true, false}); @@ -3093,7 +3079,7 @@ diopiError_t diopiConvolution3dBackward(diopiContextHandle_t ctx, diopiTensorHan atTmp = at::sum(atTmp, -1, false); size -= 1; } - at::sum_out(atGradBias, atTmp, 0, false); + CALL_ATEN_CUDA_FUNC(sum_out, atGradBias, atTmp, 0, false); } } #endif @@ -3129,7 +3115,8 @@ diopiError_t diopiUnfoldBackward(diopiContextHandle_t ctx, diopiTensorHandle_t g auto atGradInput = impl::aten::buildATen(grad_input); auto atGrad = impl::aten::buildATen(grad_output); auto atInputSize = impl::aten::buildAtIntArray(input_sizes); - at::unfold_backward_out(atGradInput, atGrad, atInputSize, dim, size, step); + auto tempOut = CALL_ATEN_CUDA_FUNC(unfold_backward, atGrad, atInputSize, dim, size, step); + at::native::copy_(atGradInput, tempOut, true); return diopiSuccess; } @@ -3139,7 +3126,7 @@ diopiError_t diopiMaskedSelect(diopiContextHandle_t ctx, diopiTensorHandle_t* ou DIOPI_CHECK_PTR(out); auto atInput = impl::aten::buildATen(input); auto atMask = impl::aten::buildATen(mask); - auto atOut = at::masked_select(atInput, atMask); + auto atOut = CALL_ATEN_CUDA_FUNC(masked_select, atInput, atMask); impl::aten::buildDiopiTensor(ctx, atOut, out); return diopiSuccess; @@ -3151,7 +3138,8 @@ diopiError_t diopiMaskedSelectBackward(diopiContextHandle_t ctx, diopiTensorHand auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); auto atMask = impl::aten::buildATen(mask); - impl::aten::invokeATenFuncRet(ctx, at::masked_select_backward, grad_input, atGradOutput, atInput, atMask); + auto atOut = CALL_ATEN_FUNC(masked_select_backward, atGradOutput, atInput, atMask); + impl::aten::updateATen2Tensor(ctx, atOut, grad_input); return diopiSuccess; } @@ -3163,7 +3151,8 @@ diopiError_t diopiIndexFillScalar(diopiContextHandle_t ctx, diopiTensorHandle_t auto atIndex = impl::aten::buildATen(index); auto atValue = impl::aten::buildAtScalar(value); auto atOut = impl::aten::buildATen(out); - at::index_fill_out(atOut, atInput, dim, atIndex, atValue); + at::native::copy_(atOut, atInput, true); + CALL_ATEN_CUDA_FUNC(index_fill_, atOut, dim, atIndex, atValue); return diopiSuccess; } @@ -3175,7 +3164,8 @@ diopiError_t diopiIndexFill(diopiContextHandle_t ctx, diopiTensorHandle_t out, d auto atIndex = impl::aten::buildATen(index); auto atValue = impl::aten::buildATen(value); auto atOut = impl::aten::buildATen(out); - at::index_fill_out(atOut, atInput, dim, atIndex, atValue); + // not supported cuda dispatch yet, will supported in subsequent release. + CALL_ATEN_FUNC(index_fill_out, atOut, atInput, dim, atIndex, atValue); return diopiSuccess; } @@ -3186,7 +3176,7 @@ diopiError_t diopiIndexFillInpScalar(diopiContextHandle_t ctx, diopiTensorHandle auto atInput = impl::aten::buildATen(input); auto atIndex = impl::aten::buildATen(index); auto atValue = impl::aten::buildAtScalar(value); - atInput.index_fill_(dim, atIndex, atValue); + CALL_ATEN_CUDA_FUNC(index_fill_, atInput, dim, atIndex, atValue); return diopiSuccess; } @@ -3197,7 +3187,7 @@ diopiError_t diopiIndexFillInp(diopiContextHandle_t ctx, diopiTensorHandle_t inp auto atInput = impl::aten::buildATen(input); auto atIndex = impl::aten::buildATen(index); auto atValue = impl::aten::buildATen(value); - atInput.index_fill_(dim, atIndex, atValue); + CALL_ATEN_CUDA_FUNC(index_fill_, atInput, dim, atIndex, atValue); return diopiSuccess; } @@ -3207,8 +3197,8 @@ diopiError_t diopiLinspace(diopiContextHandle_t ctx, diopiTensorHandle_t out, co auto atStart = impl::aten::buildAtScalar(start); auto atEnd = impl::aten::buildAtScalar(end); c10::optional atStep(steps); - at::Tensor atOut = impl::aten::buildATen(out); - at::linspace_out(atOut, atStart, atEnd, steps); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(linspace_out, atOut, atStart, atEnd, steps); return diopiSuccess; } @@ -3219,7 +3209,8 @@ diopiError_t diopiRoll(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC at::IntArrayRef atShifts = impl::aten::buildAtIntArray(shifts); at::IntArrayRef atDims = impl::aten::buildAtIntArray(dims); auto atOut = impl::aten::buildATen(out); - at::roll_out(atOut, atInput, atShifts, atDims); + auto tempOut = CALL_ATEN_CUDA_FUNC(roll, atInput, atShifts, atDims); + at::native::copy_(atOut, tempOut, true); return diopiSuccess; } @@ -3234,7 +3225,7 @@ diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC if (atInput.dim() == atOut.dim()) { keepdim = true; } - at::norm_out(atOut, atInput, atP, atDim, keepdim); + CALL_ATEN_CUDA_FUNC(norm_out, atOut, atInput, atP, atDim, keepdim); return diopiSuccess; } @@ -3242,17 +3233,20 @@ diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC diopiError_t diopiGroupNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t save_mean, diopiTensorHandle_t save_invstd, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, int64_t num_groups, double eps) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atWeight = impl::aten::buildATen(weight); - at::Tensor atBias = impl::aten::buildATen(bias); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atSaveMean = impl::aten::buildATen(save_mean); - at::Tensor atSaveInvstd = impl::aten::buildATen(save_invstd); + auto atInput = impl::aten::buildATen(input); + auto atWeight = impl::aten::buildATen(weight); + auto atBias = impl::aten::buildATen(bias); + auto atOut = impl::aten::buildATen(out); + auto atSaveMean = impl::aten::buildATen(save_mean); + auto atSaveInvstd = impl::aten::buildATen(save_invstd); const int64_t N = atInput.size(0); const int64_t C = atInput.size(1); const auto input_shape = atInput.sizes(); const int64_t HxW = c10::multiply_integers(input_shape.cbegin() + 2, input_shape.cend()); - at::native_group_norm_out(atOut, atSaveMean, atSaveInvstd, atInput, atWeight, atBias, N, C, HxW, num_groups, eps); + auto tempOut = CALL_ATEN_CUDA_FUNC(native_group_norm, atInput, atWeight, atBias, N, C, HxW, num_groups, eps); + at::native::copy_(atOut, std::get<0>(tempOut), true); + at::native::copy_(atSaveMean, std::get<1>(tempOut), true); + at::native::copy_(atSaveInvstd, std::get<2>(tempOut), true); return diopiSuccess; } @@ -3290,14 +3284,15 @@ diopiError_t diopiGroupNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_ diopiError_t diopiBCELoss(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t target, diopiConstTensorHandle_t weight, diopiReduction_t reduction) { impl::aten::setCurStream(ctx); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atTarget = impl::aten::buildATen(target); - c10::optional atWeight = weight ? c10::optional(impl::aten::buildATen(weight)) : c10::nullopt; + auto atOut = impl::aten::buildATen(out); + auto atInput = impl::aten::buildATen(input); + auto atTarget = impl::aten::buildATen(target); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atWeight, weight); if (reduction == 0) { - at::binary_cross_entropy_out(atOut, atInput, atTarget, atWeight, reduction); + CALL_ATEN_CUDA_FUNC(binary_cross_entropy_out, atOut, atInput, atTarget, atWeight, reduction); } else { - impl::aten::invokeATenFuncRet(ctx, at::binary_cross_entropy, out, atInput, atTarget, atWeight, reduction); + auto atOut = CALL_ATEN_CUDA_FUNC(binary_cross_entropy, atInput, atTarget, atWeight, reduction); + impl::aten::updateATen2Tensor(ctx, atOut, out); } return diopiSuccess; @@ -3310,9 +3305,9 @@ diopiError_t diopiBCELossBackward(diopiContextHandle_t ctx, diopiTensorHandle_t auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); auto atTarget = impl::aten::buildATen(target); - c10::optional atWeight = weight ? c10::optional(impl::aten::buildATen(weight)) : c10::nullopt; + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atWeight, weight); auto atGradInput = impl::aten::buildATen(grad_input); - at::binary_cross_entropy_backward_out(atGradInput, atGradOutput, atInput, atTarget, atWeight, reduction); + CALL_ATEN_CUDA_FUNC(binary_cross_entropy_backward_out, atGradInput, atGradOutput, atInput, atTarget, atWeight, reduction); return diopiSuccess; } @@ -3321,18 +3316,19 @@ diopiError_t diopiLayerNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, d diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, diopiSize_t normalized_shape, double eps) { impl::aten::setCurStream(ctx); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atSaveMean = impl::aten::buildATen(save_mean); - at::Tensor atSaveInvstd = impl::aten::buildATen(save_invstd); + auto atOut = impl::aten::buildATen(out); + auto atSaveMean = impl::aten::buildATen(save_mean); + auto atSaveInvstd = impl::aten::buildATen(save_invstd); - at::Tensor atInput = impl::aten::buildATen(input); - c10::optional atWeight = weight ? c10::optional(impl::aten::buildATen(weight)) : c10::nullopt; - c10::optional atBias = bias ? c10::optional(impl::aten::buildATen(bias)) : c10::nullopt; + auto atInput = impl::aten::buildATen(input); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atWeight, weight); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atBias, bias); auto atNormalizedShape = impl::aten::buildAtIntArray(normalized_shape); // TODO(zhaoguochun): check dtype: when input is half, atSaveInvstd, atInput should be float? - // at::native_layer_norm_out(atOut, atSaveMean, atSaveInvstd, atInput, atNormalizedShape, atWeight, atBias, eps); + // CALL_ATEN_CUDA_FUNC(native_layer_norm_out, atOut, atSaveMean, atSaveInvstd, atInput, atNormalizedShape, atWeight, atBias, eps); diopi_tensor_list vecOut = {out, save_mean, save_invstd}; - impl::aten::invokeATenFuncRet(ctx, at::native_layer_norm, vecOut, atInput, atNormalizedShape, atWeight, atBias, eps); + auto Out = CALL_ATEN_CUDA_FUNC(native_layer_norm, atInput, atNormalizedShape, atWeight, atBias, eps); + impl::aten::updateATen2Tensor(ctx, Out, vecOut); return diopiSuccess; } @@ -3348,17 +3344,9 @@ diopiError_t diopiLayerNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_ auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); auto atNormalizedShape = impl::aten::buildAtIntArray(normalized_shape); - c10::optional atWeight; - c10::optional atBias; - auto grad_input_mask = std::array{true, false, false}; - if (weight != nullptr) { - atWeight = c10::optional(impl::aten::buildATen(weight)); - grad_input_mask.at(1) = true; - } - if (bias != nullptr) { - atBias = c10::optional(impl::aten::buildATen(bias)); - grad_input_mask.at(2) = true; - } + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atWeight, weight); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atBias, bias); + auto grad_input_mask = std::array{true, atWeight.has_value(), atBias.has_value()}; auto atSaveMean = impl::aten::buildATen(mean); diopiGetTensorDtype(mean, &mDtype); @@ -3395,10 +3383,10 @@ diopiError_t diopiLayerNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_ diopiError_t diopiAdaptiveAvgPool3d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t output_size) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); auto atOutSize = impl::aten::buildAtIntArray(output_size); auto atOut = impl::aten::buildATen(out); - at::adaptive_avg_pool3d_out(atOut, atInput, atOutSize); + CALL_ATEN_CUDA_FUNC(adaptive_avg_pool3d_out, atOut, atInput, atOutSize); return diopiSuccess; } @@ -3409,14 +3397,14 @@ diopiError_t diopiAdaptiveAvgPool3dBackward(diopiContextHandle_t ctx, diopiTenso auto atGradOutput = impl::aten::buildATen(grad_output); auto atInput = impl::aten::buildATen(input); auto atGradInput = impl::aten::buildATen(grad_input); - at::adaptive_avg_pool3d_backward_out(atGradInput, atGradOutput, atInput); + CALL_ATEN_CUDA_FUNC(adaptive_avg_pool3d_backward_out, atGradInput, atGradOutput, atInput); return diopiSuccess; } diopiError_t diopiAdaptiveMaxPool3d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t output_size) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); auto atOutSize = impl::aten::buildAtIntArray(output_size); auto atOuts = at::adaptive_max_pool3d(atInput, atOutSize); impl::aten::updateATen2Tensor(ctx, std::get<0>(atOuts), out); @@ -3427,11 +3415,11 @@ diopiError_t diopiAdaptiveMaxPool3d(diopiContextHandle_t ctx, diopiTensorHandle_ diopiError_t diopiAdaptiveMaxPool3dWithIndices(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t indices, diopiConstTensorHandle_t input, diopiSize_t output_size) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); auto atOutSize = impl::aten::buildAtIntArray(output_size); auto atOut = impl::aten::buildATen(out); auto atIndices = impl::aten::buildATen(indices); - at::adaptive_max_pool3d_out(atOut, atIndices, atInput, atOutSize); + CALL_ATEN_CUDA_FUNC(adaptive_max_pool3d_out, atOut, atIndices, atInput, atOutSize); return diopiSuccess; } @@ -3439,11 +3427,11 @@ diopiError_t diopiAdaptiveMaxPool3dWithIndices(diopiContextHandle_t ctx, diopiTe diopiError_t diopiAdaptiveMaxPool3dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiConstTensorHandle_t indices) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atGradOutput = impl::aten::buildATen(grad_output); - at::Tensor atIndices = impl::aten::buildATen(indices); - at::Tensor atGradInput = impl::aten::buildATen(grad_input); - at::adaptive_max_pool3d_backward_out(atGradInput, atGradOutput, atInput, atIndices); + auto atInput = impl::aten::buildATen(input); + auto atGradOutput = impl::aten::buildATen(grad_output); + auto atIndices = impl::aten::buildATen(indices); + auto atGradInput = impl::aten::buildATen(grad_input); + CALL_ATEN_CUDA_FUNC(adaptive_max_pool3d_backward_out, atGradInput, atGradOutput, atInput, atIndices); return diopiSuccess; } @@ -3451,13 +3439,14 @@ diopiError_t diopiAdaptiveMaxPool3dBackward(diopiContextHandle_t ctx, diopiTenso diopiError_t diopiMaxPool3d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, bool ceil_mode) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); at::IntArrayRef atDilation = impl::aten::buildAtIntArray(dilation); bool atCeilMode = ceil_mode; - impl::aten::invokeATenFuncRet(ctx, at::max_pool3d, out, atInput, atKernelSize, atStride, atPadding, atDilation, atCeilMode); + auto atOut = CALL_ATEN_FUNC(max_pool3d, atInput, atKernelSize, atStride, atPadding, atDilation, atCeilMode); + impl::aten::updateATen2Tensor(ctx, atOut, out); return diopiSuccess; } @@ -3465,15 +3454,15 @@ diopiError_t diopiMaxPool3d(diopiContextHandle_t ctx, diopiTensorHandle_t out, d diopiError_t diopiMaxPool3dWithIndices(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t indices, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, bool ceil_mode) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); at::IntArrayRef atDilation = impl::aten::buildAtIntArray(dilation); bool atCeilMode = ceil_mode; - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atIndices = impl::aten::buildATen(indices); - at::max_pool3d_with_indices_out(atOut, atIndices, atInput, atKernelSize, atStride, atPadding, atDilation, atCeilMode); + auto atOut = impl::aten::buildATen(out); + auto atIndices = impl::aten::buildATen(indices); + CALL_ATEN_CUDA_FUNC(max_pool3d_with_indices_out, atOut, atIndices, atInput, atKernelSize, atStride, atPadding, atDilation, atCeilMode); return diopiSuccess; } @@ -3490,24 +3479,26 @@ diopiError_t diopiMaxPool3dBackward(diopiContextHandle_t ctx, diopiTensorHandle_ at::IntArrayRef atDilation = impl::aten::buildAtIntArray(dilation); auto atIndices = impl::aten::buildATen(indices); auto atGradInput = impl::aten::buildATen(grad_input); - at::max_pool3d_with_indices_backward_out(atGradInput, atGradOutput, atInput, atKernelSize, atStride, atPadding, atDilation, ceil_mode, atIndices); + CALL_ATEN_CUDA_FUNC( + max_pool3d_with_indices_backward_out, atGradInput, atGradOutput, atInput, atKernelSize, atStride, atPadding, atDilation, ceil_mode, atIndices); return diopiSuccess; } diopiError_t diopiPermute(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t dims) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); auto atDims = impl::aten::buildAtIntArray(dims); - impl::aten::invokeATenFuncRet(ctx, at::permute, out, atInput, atDims); + auto atOut = CALL_ATEN_FUNC(permute, atInput, atDims); + impl::aten::updateATen2Tensor(ctx, atOut, out); return diopiSuccess; } diopiError_t diopiCopyInp(diopiContextHandle_t ctx, diopiConstTensorHandle_t src, diopiTensorHandle_t dest) { impl::aten::setCurStream(ctx); - at::Tensor atDest = impl::aten::buildATen(dest); - at::Tensor atSrc = impl::aten::buildATen(src); + auto atDest = impl::aten::buildATen(dest); + auto atSrc = impl::aten::buildATen(src); // Set non_blocking true to avoid stream sync thus improving performance. // The data is not ready when diopiCopyInp returns. // If you need to use it immediately, please call cudaStreamSynchronize first. @@ -3521,7 +3512,7 @@ diopiError_t diopiGather(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop auto atInput = impl::aten::buildATen(input); auto atIndex = impl::aten::buildATen(index); auto atOut = impl::aten::buildATen(out); - at::gather_out(atOut, atInput, dim, atIndex); + CALL_ATEN_CUDA_FUNC(gather_out, atOut, atInput, dim, atIndex); return diopiSuccess; } @@ -3533,7 +3524,7 @@ diopiError_t diopiGatherBackward(diopiContextHandle_t ctx, diopiTensorHandle_t g auto atInput = impl::aten::buildATen(input); auto atIndex = impl::aten::buildATen(index); bool sparse_grad = false; - auto atOut = at::gather_backward(atGradOutput, atInput, dim, atIndex, sparse_grad); + auto atOut = CALL_ATEN_FUNC(gather_backward, atGradOutput, atInput, dim, atIndex, sparse_grad); impl::aten::updateATen2Tensor(ctx, atOut, grad_input); return diopiSuccess; @@ -3544,7 +3535,7 @@ diopiError_t diopiRemainderTensor(diopiContextHandle_t ctx, diopiTensorHandle_t auto atInput = impl::aten::buildATen(input); auto atOther = impl::aten::buildATen(other); auto atOut = impl::aten::buildATen(out); - at::remainder_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(remainder_out, atOut, atInput, atOther); return diopiSuccess; } @@ -3554,7 +3545,7 @@ diopiError_t diopiRemainderScalar(diopiContextHandle_t ctx, diopiTensorHandle_t auto atInput = impl::aten::buildATen(input); auto atOther = impl::aten::buildAtScalar(other); auto atOut = impl::aten::buildATen(out); - at::remainder_out(atOut, atInput, atOther); + CALL_ATEN_CUDA_FUNC(remainder_out, atOut, atInput, c10::scalar_to_tensor(atOther)); return diopiSuccess; } @@ -3564,7 +3555,7 @@ diopiError_t diopiRemainder(diopiContextHandle_t ctx, diopiTensorHandle_t out, c auto atInputScalar = impl::aten::buildAtScalar(input); auto atOther = impl::aten::buildATen(other); auto atOut = impl::aten::buildATen(out); - at::remainder_out(atOut, atInputScalar, atOther); + CALL_ATEN_CUDA_FUNC(remainder_out, atOut, c10::scalar_to_tensor(atInputScalar), atOther); return diopiSuccess; } @@ -3581,7 +3572,10 @@ diopiError_t diopiCTCLoss(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio auto atNegLogLikelihood = impl::aten::buildATen(neg_log_likelihood); auto atLogAlpha = impl::aten::buildATen(log_alpha); - at::_ctc_loss_out(atNegLogLikelihood, atLogAlpha, atLogProbs, atTarget, il, tl, blank, zero_infinity); + auto tempOut = CALL_ATEN_CUDA_FUNC(_ctc_loss, atLogProbs, atTarget, il, tl, blank, zero_infinity); + + at::native::copy_(atNegLogLikelihood, std::get<0>(tempOut), true); + at::native::copy_(atLogAlpha, std::get<1>(tempOut), true); auto atRes = atNegLogLikelihood; if (zero_infinity) { atRes = at::where(atRes == at::Scalar(std::numeric_limits::infinity()), at::zeros({}, atRes.options()), atRes); @@ -3623,8 +3617,8 @@ diopiError_t diopiCTCLossBackward(diopiContextHandle_t ctx, diopiTensorHandle_t auto atNegLogLikehood = impl::aten::buildATen(neg_log_likelihood); auto atLogAlpha = impl::aten::buildATen(log_alpha); auto atGradInput = impl::aten::buildATen(grad_input); - at::_ctc_loss_backward_out(atGradInput, atGrad, atLogProbs, atTarget, il, tl, atNegLogLikehood, atLogAlpha, blank, zero_infinity); - + auto tempOut = CALL_ATEN_CUDA_FUNC(_ctc_loss_backward, atGrad, atLogProbs, atTarget, il, tl, atNegLogLikehood, atLogAlpha, blank, zero_infinity); + at::native::copy_(atGradInput, tempOut, true); return diopiSuccess; } @@ -3634,13 +3628,8 @@ diopiError_t diopiIndexPutInp(diopiContextHandle_t ctx, diopiTensorHandle_t inpu DIOPI_CHECK_PTR(indices); auto atInput = impl::aten::buildATen(input); auto atValues = impl::aten::buildATen(values); - torch::List> atIndicesList; - assert(indices_counts >= 1); - for (int i = 0; i < indices_counts; ++i) { - auto atIndices = c10::optional(impl::aten::buildATen(indices[i])); - atIndicesList.emplace_back(atIndices); - } - atInput.index_put_(atIndicesList, atValues, accumulate); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL_LIST(atIndicesList, indices, indices_counts); + CALL_ATEN_CUDA_FUNC(_index_put_impl_, atInput, atIndicesList, atValues, accumulate); return diopiSuccess; } @@ -3652,13 +3641,8 @@ DIOPI_API diopiError_t diopiIndexPut(diopiContextHandle_t ctx, diopiTensorHandle auto atInput = impl::aten::buildATen(input); auto atValues = impl::aten::buildATen(values); auto atOut = impl::aten::buildATen(out); - torch::List> atIndicesList; - assert(indices_counts >= 1); - for (int i = 0; i < indices_counts; ++i) { - auto atIndices = c10::optional(impl::aten::buildATen(indices[i])); - atIndicesList.emplace_back(atIndices); - } - at::index_put_out(atOut, atInput, atIndicesList, atValues, accumulate); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL_LIST(atIndicesList, indices, indices_counts); + CALL_ATEN_CUDA_FUNC(_index_put_impl_, atOut, atIndicesList, atValues, accumulate); return diopiSuccess; } @@ -3672,12 +3656,11 @@ diopiError_t diopiScatterInp(diopiContextHandle_t ctx, diopiTensorHandle_t input if (atIndex.dim() == 0) { return diopiSuccess; } - at::Tensor atOut; if (0 == strcmp(reduce, "add") || 0 == strcmp(reduce, "multiply")) { c10::string_view atReduce(reduce, strlen(reduce)); - atInput.scatter_(dim, atIndex, atSrc, atReduce); + CALL_ATEN_CUDA_FUNC(scatter_, atInput, dim, atIndex, atSrc, atReduce); } else { - atInput.scatter_(dim, atIndex, atSrc); + CALL_ATEN_CUDA_FUNC(scatter_, atInput, dim, atIndex, atSrc); } return diopiSuccess; @@ -3692,12 +3675,11 @@ diopiError_t diopiScatterInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t if (atIndex.dim() == 0) { return diopiSuccess; } - at::Tensor atOut; if (0 == strcmp(reduce, "add") || 0 == strcmp(reduce, "multiply")) { c10::string_view atReduce(reduce, strlen(reduce)); - atInput.scatter_(dim, atIndex, atValue, atReduce); + CALL_ATEN_CUDA_FUNC(scatter_, atInput, dim, atIndex, atValue, atReduce); } else { - atInput.scatter_(dim, atIndex, atValue); + CALL_ATEN_CUDA_FUNC(scatter_, atInput, dim, atIndex, atValue); } return diopiSuccess; @@ -3711,14 +3693,14 @@ diopiError_t diopiScatter(diopiContextHandle_t ctx, diopiTensorHandle_t out, dio auto atIndex = impl::aten::buildATen(index); auto atOut = impl::aten::buildATen(out); if (atIndex.dim() == 0) { - atOut.copy_(atInput); + at::native::copy_(atOut, atInput, true); return diopiSuccess; } if (0 == strcmp(reduce, "add") || 0 == strcmp(reduce, "multiply")) { c10::string_view atReduce(reduce, strlen(reduce)); - at::scatter_out(atOut, atInput, dim, atIndex, atSrc, atReduce); + CALL_ATEN_CUDA_FUNC(scatter_out, atOut, atInput, dim, atIndex, atSrc, atReduce); } else { - at::scatter_out(atOut, atInput, dim, atIndex, atSrc); + CALL_ATEN_CUDA_FUNC(scatter_out, atOut, atInput, dim, atIndex, atSrc); } return diopiSuccess; @@ -3732,14 +3714,14 @@ diopiError_t diopiScatterScalar(diopiContextHandle_t ctx, diopiTensorHandle_t ou auto atIndex = impl::aten::buildATen(index); auto atOut = impl::aten::buildATen(out); if (atIndex.dim() == 0) { - atOut.copy_(atInput); + at::native::copy_(atOut, atInput, true); return diopiSuccess; } if (0 == strcmp(reduce, "add") || 0 == strcmp(reduce, "multiply")) { c10::string_view atReduce(reduce, strlen(reduce)); - at::scatter_out(atOut, atInput, dim, atIndex, atValue, atReduce); + CALL_ATEN_CUDA_FUNC(scatter_out, atOut, atInput, dim, atIndex, atValue, atReduce); } else { - at::scatter_out(atOut, atInput, dim, atIndex, atValue); + CALL_ATEN_CUDA_FUNC(scatter_out, atOut, atInput, dim, atIndex, atValue); } return diopiSuccess; @@ -3747,15 +3729,15 @@ diopiError_t diopiScatterScalar(diopiContextHandle_t ctx, diopiTensorHandle_t ou diopiError_t diopiUpsampleNearest(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t size) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); at::IntArrayRef atSize = impl::aten::buildAtIntArray(size); if (atInput.dim() == 3) { - at::upsample_nearest1d_out(atOut, atInput, atSize); + CALL_ATEN_CUDA_FUNC(upsample_nearest1d_out, atOut, atInput, atSize); } else if (atInput.dim() == 4) { - at::upsample_nearest2d_out(atOut, atInput, atSize); + CALL_ATEN_CUDA_FUNC(upsample_nearest2d_out, atOut, atInput, atSize); } else if (atInput.dim() == 5) { - at::upsample_nearest3d_out(atOut, atInput, atSize); + CALL_ATEN_CUDA_FUNC(upsample_nearest3d_out, atOut, atInput, atSize); } else { NOT_SUPPORTED("input dim < 3 or >5"); return diopiErrorOccurred; @@ -3767,16 +3749,16 @@ diopiError_t diopiUpsampleNearest(diopiContextHandle_t ctx, diopiTensorHandle_t diopiError_t diopiUpsampleNearestBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiSize_t out_size, diopiSize_t in_size) { impl::aten::setCurStream(ctx); - at::Tensor atGradOut = impl::aten::buildATen(grad_output); - at::Tensor atGradInput = impl::aten::buildATen(grad_input); + auto atGradOut = impl::aten::buildATen(grad_output); + auto atGradInput = impl::aten::buildATen(grad_input); at::IntArrayRef atOutSize = impl::aten::buildAtIntArray(out_size); at::IntArrayRef atInSize = impl::aten::buildAtIntArray(in_size); if (atGradInput.dim() == 3) { - at::upsample_nearest1d_backward_out(atGradInput, atGradOut, atOutSize, atInSize); + CALL_ATEN_CUDA_FUNC(upsample_nearest1d_backward_out, atGradInput, atGradOut, atOutSize, atInSize); } else if (atGradInput.dim() == 4) { - at::upsample_nearest2d_backward_out(atGradInput, atGradOut, atOutSize, atInSize); + CALL_ATEN_CUDA_FUNC(upsample_nearest2d_backward_out, atGradInput, atGradOut, atOutSize, atInSize); } else if (atGradInput.dim() == 5) { - at::upsample_nearest3d_backward_out(atGradInput, atGradOut, atOutSize, atInSize); + CALL_ATEN_CUDA_FUNC(upsample_nearest3d_backward_out, atGradInput, atGradOut, atOutSize, atInSize); } else { NOT_SUPPORTED("grad_input dim < 3 or >5"); return diopiErrorOccurred; @@ -3788,22 +3770,22 @@ diopiError_t diopiUpsampleNearestBackward(diopiContextHandle_t ctx, diopiTensorH diopiError_t diopiUpsampleLinear(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t size, bool align_corners, const char* mode) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); at::IntArrayRef atSize = impl::aten::buildAtIntArray(size); if (3 == atInput.dim() && 0 == strcmp(mode, "linear")) { - at::upsample_linear1d_out(atOut, atInput, atSize, align_corners); + CALL_ATEN_CUDA_FUNC(upsample_linear1d_out, atOut, atInput, atSize, align_corners); } else if (4 == atInput.dim()) { if (0 == strcmp(mode, "bilinear")) { - at::upsample_bilinear2d_out(atOut, atInput, atSize, align_corners); + CALL_ATEN_CUDA_FUNC(upsample_bilinear2d_out, atOut, atInput, atSize, align_corners); } else if (0 == strcmp(mode, "bicubic")) { - at::upsample_bicubic2d_out(atOut, atInput, atSize, align_corners); + CALL_ATEN_CUDA_FUNC(upsample_bicubic2d_out, atOut, atInput, atSize, align_corners); } else { NOT_SUPPORTED("interpolate mode type"); return diopiErrorOccurred; } } else if (5 == atInput.dim() && 0 == strcmp(mode, "trilinear")) { - at::upsample_trilinear3d_out(atOut, atInput, atSize, align_corners); + CALL_ATEN_CUDA_FUNC(upsample_trilinear3d_out, atOut, atInput, atSize, align_corners); } else { NOT_SUPPORTED("interpolate mode type"); return diopiErrorOccurred; @@ -3815,23 +3797,23 @@ diopiError_t diopiUpsampleLinear(diopiContextHandle_t ctx, diopiTensorHandle_t o diopiError_t diopiUpsampleLinearBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiSize_t out_size, diopiSize_t in_size, bool align_corners, const char* mode) { impl::aten::setCurStream(ctx); - at::Tensor atGradOut = impl::aten::buildATen(grad_output); - at::Tensor atGradInput = impl::aten::buildATen(grad_input); + auto atGradOut = impl::aten::buildATen(grad_output); + auto atGradInput = impl::aten::buildATen(grad_input); at::IntArrayRef atOutSize = impl::aten::buildAtIntArray(out_size); at::IntArrayRef atInSize = impl::aten::buildAtIntArray(in_size); if (3 == atGradInput.dim() && 0 == strcmp(mode, "linear")) { - at::upsample_linear1d_backward_out(atGradInput, atGradOut, atOutSize, atInSize, align_corners); + CALL_ATEN_CUDA_FUNC(upsample_linear1d_backward_out, atGradInput, atGradOut, atOutSize, atInSize, align_corners); } else if (4 == atGradInput.dim()) { if (0 == strcmp(mode, "bilinear")) { - at::upsample_bilinear2d_backward_out(atGradInput, atGradOut, atOutSize, atInSize, align_corners); + CALL_ATEN_CUDA_FUNC(upsample_bilinear2d_backward_out, atGradInput, atGradOut, atOutSize, atInSize, align_corners); } else if (0 == strcmp(mode, "bicubic")) { - at::upsample_bicubic2d_backward_out(atGradInput, atGradOut, atOutSize, atInSize, align_corners); + CALL_ATEN_CUDA_FUNC(upsample_bicubic2d_backward_out, atGradInput, atGradOut, atOutSize, atInSize, align_corners); } else { NOT_SUPPORTED("interpolate mode type"); return diopiErrorOccurred; } } else if (5 == atGradInput.dim() && 0 == strcmp(mode, "trilinear")) { - at::upsample_trilinear3d_backward_out(atGradInput, atGradOut, atOutSize, atInSize, align_corners); + CALL_ATEN_CUDA_FUNC(upsample_trilinear3d_backward_out, atGradInput, atGradOut, atOutSize, atInSize, align_corners); } else { NOT_SUPPORTED("interpolate mode type"); return diopiErrorOccurred; @@ -3903,7 +3885,7 @@ diopiError_t diopiProd(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC if (atInput.dim() == atOut.dim()) { keepdim = true; } - at::prod_out(atOut, atInput, *dim, keepdim); + CALL_ATEN_CUDA_FUNC(prod_out, atOut, atInput, *dim, keepdim); } return diopiSuccess; @@ -3918,7 +3900,7 @@ diopiError_t diopiLinearBackward(diopiContextHandle_t ctx, diopiTensorHandle_t g if (grad_input) { auto atGradInput = impl::aten::buildATen(grad_input); - at::matmul_out(atGradInput, atGradOutput, atWeight); + CALL_ATEN_FUNC(matmul_out, atGradInput, atGradOutput, atWeight); } int64_t dims = atInput.dim(); @@ -3931,7 +3913,7 @@ diopiError_t diopiLinearBackward(diopiContextHandle_t ctx, diopiTensorHandle_t g sumDim.push_back(i); } auto atGradWeight = impl::aten::buildATen(grad_weight); - at::sum_out(atGradWeight, atGradWeightTemp, sumDim); + CALL_ATEN_CUDA_FUNC(sum_out, atGradWeight, atGradWeightTemp, sumDim); } else { impl::aten::updateATen2Tensor(ctx, atGradWeightTemp, grad_weight); } @@ -3943,7 +3925,7 @@ diopiError_t diopiLinearBackward(diopiContextHandle_t ctx, diopiTensorHandle_t g sumDim.push_back(i); } auto atGradBias = impl::aten::buildATen(grad_bias); - at::sum_out(atGradBias, atGradOutput, sumDim); + CALL_ATEN_CUDA_FUNC(sum_out, atGradBias, atGradOutput, sumDim); } return diopiSuccess; @@ -3977,17 +3959,17 @@ diopiError_t diopiCrossEntropyLossBackward(diopiContextHandle_t ctx, diopiTensor diopiError_t diopiErfinv(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); - at::erfinv_out(atOut, atInput); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(erfinv_out, atOut, atInput); return diopiSuccess; } diopiError_t diopiErfinvInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::erfinv_out(atInput, atInput); + auto atInput = impl::aten::buildATen(input); + CALL_ATEN_CUDA_FUNC(erfinv_out, atInput, atInput); return diopiSuccess; } @@ -3995,14 +3977,14 @@ diopiError_t diopiErfinvInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) diopiError_t diopiIm2Col(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t dilation, diopiSize_t padding, diopiSize_t stride) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); at::IntArrayRef atDilation = impl::aten::buildAtIntArray(dilation); at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); - at::im2col_out(atOut, atInput, atKernelSize, atDilation, atPadding, atStride); + CALL_ATEN_CUDA_FUNC(im2col_out, atOut, atInput, atKernelSize, atDilation, atPadding, atStride); return diopiSuccess; } @@ -4010,25 +3992,26 @@ diopiError_t diopiIm2Col(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop diopiError_t diopiCol2Im(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t output_size, diopiSize_t kernel_size, diopiSize_t dilation, diopiSize_t padding, diopiSize_t stride) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); at::IntArrayRef atOutSize = impl::aten::buildAtIntArray(output_size); at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); at::IntArrayRef atDilation = impl::aten::buildAtIntArray(dilation); at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); - at::col2im_out(atOut, atInput, atOutSize, atKernelSize, atDilation, atPadding, atStride); + CALL_ATEN_CUDA_FUNC(col2im_out, atOut, atInput, atOutSize, atKernelSize, atDilation, atPadding, atStride); return diopiSuccess; } diopiError_t diopiFlip(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t dims) { impl::aten::setCurStream(ctx); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + auto atInput = impl::aten::buildATen(input); at::IntArrayRef atDims = impl::aten::buildAtIntArray(dims); - at::flip_out(atOut, atInput, atDims); + auto tempOut = CALL_ATEN_CUDA_FUNC(flip, atInput, atDims); + at::native::copy_(atOut, tempOut, true); return diopiSuccess; } @@ -4036,10 +4019,10 @@ diopiError_t diopiFlip(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC diopiError_t diopiCholesky(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t info, diopiConstTensorHandle_t mat, bool upper, bool checkerror) { impl::aten::setCurStream(ctx); - at::Tensor atMat = impl::aten::buildATen(mat); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atInfo = impl::aten::buildATen(info); - at::linalg_cholesky_ex_out(atOut, atInfo, atMat, upper, checkerror); + auto atMat = impl::aten::buildATen(mat); + auto atOut = impl::aten::buildATen(out); + auto atInfo = impl::aten::buildATen(info); + CALL_ATEN_CUDA_FUNC(linalg_cholesky_ex_out, atOut, atInfo, atMat, upper, checkerror); return diopiSuccess; } @@ -4047,8 +4030,8 @@ diopiError_t diopiCholesky(diopiContextHandle_t ctx, diopiTensorHandle_t out, di diopiError_t diopiCholeskyBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_mat, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t L, bool upper) { impl::aten::setCurStream(ctx); - at::Tensor atL = impl::aten::buildATen(L); - at::Tensor atGradOut = impl::aten::buildATen(grad_output); + auto atL = impl::aten::buildATen(L); + auto atGradOut = impl::aten::buildATen(grad_output); if (upper) { atL = atL.transpose(-1, -2).conj(); atGradOut = atGradOut.transpose(-1, -2).conj(); @@ -4067,11 +4050,11 @@ diopiError_t diopiCholeskyBackward(diopiContextHandle_t ctx, diopiTensorHandle_t diopiError_t diopiTriangularSolve(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t cloned_mat, diopiConstTensorHandle_t b, diopiConstTensorHandle_t mat, bool upper, bool transpose, bool unitriangular) { impl::aten::setCurStream(ctx); - at::Tensor atClonedMat = impl::aten::buildATen(cloned_mat); - at::Tensor atOut = impl::aten::buildATen(out); - at::Tensor atb = impl::aten::buildATen(b); - at::Tensor atMat = impl::aten::buildATen(mat); - at::triangular_solve_out(atOut, atClonedMat, atb, atMat, upper, transpose, unitriangular); + auto atClonedMat = impl::aten::buildATen(cloned_mat); + auto atOut = impl::aten::buildATen(out); + auto atb = impl::aten::buildATen(b); + auto atMat = impl::aten::buildATen(mat); + CALL_ATEN_CUDA_FUNC(triangular_solve_out, atOut, atClonedMat, atb, atMat, upper, transpose, unitriangular); return diopiSuccess; } @@ -4080,15 +4063,15 @@ DIOPI_API diopiError_t diopiTriangularSolveBackward(diopiContextHandle_t ctx, di diopiConstTensorHandle_t grad_x, diopiConstTensorHandle_t grad_cloned_mat, diopiConstTensorHandle_t x, diopiConstTensorHandle_t b, diopiConstTensorHandle_t mat, bool upper, bool transpose, bool unitriangular) { impl::aten::setCurStream(ctx); - at::Tensor atGradB = impl::aten::buildATen(grad_b); - at::Tensor atGradM = impl::aten::buildATen(grad_mat); + auto atGradB = impl::aten::buildATen(grad_b); + auto atGradM = impl::aten::buildATen(grad_mat); - at::Tensor atGradx = impl::aten::buildATen(grad_x); - at::Tensor atGradCloneMat = impl::aten::buildATen(grad_cloned_mat); + auto atGradx = impl::aten::buildATen(grad_x); + auto atGradCloneMat = impl::aten::buildATen(grad_cloned_mat); - at::Tensor atx = impl::aten::buildATen(x); - at::Tensor atb = impl::aten::buildATen(b); - at::Tensor atMat = impl::aten::buildATen(mat); + auto atx = impl::aten::buildATen(x); + auto atb = impl::aten::buildATen(b); + auto atMat = impl::aten::buildATen(mat); at::Tensor atGradb, atGradMat; if (atGradx.defined() || atGradCloneMat.defined()) { @@ -4116,7 +4099,7 @@ DIOPI_API diopiError_t diopiTriangularSolveBackward(diopiContextHandle_t ctx, di std::vector newShape{nums, atGradMat.size(-2), -1}; if (nums != 1) { at::IntArrayRef atShape(newShape.data(), newShape.size()); - at::sum_out(atGradM, atGradMat.reshape(atShape), 0, false); + CALL_ATEN_CUDA_FUNC(sum_out, atGradM, atGradMat.reshape(atShape), 0, false); } else { impl::aten::updateATen2Tensor(ctx, atGradMat, grad_mat); } @@ -4125,7 +4108,7 @@ DIOPI_API diopiError_t diopiTriangularSolveBackward(diopiContextHandle_t ctx, di if (nums != 1) { newShape[0] = nums; at::IntArrayRef atShape(newShape.data(), newShape.size()); - at::sum_out(atGradB, atGradb.reshape(atShape), 0, false); + CALL_ATEN_CUDA_FUNC(sum_out, atGradB, atGradb.reshape(atShape), 0, false); } else { impl::aten::updateATen2Tensor(ctx, atGradb, grad_b); } @@ -4136,10 +4119,11 @@ DIOPI_API diopiError_t diopiTriangularSolveBackward(diopiContextHandle_t ctx, di diopiError_t diopiRepeat(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t repeats_size) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::Tensor atOut = impl::aten::buildATen(out); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); at::IntArrayRef atRepeatsSize = impl::aten::buildAtIntArray(repeats_size); - at::repeat_out(atOut, atInput, atRepeatsSize); + // not supported cuda dispatch yet, will supported in subsequent release. + CALL_ATEN_FUNC(repeat_out, atOut, atInput, atRepeatsSize); return diopiSuccess; } @@ -4150,7 +4134,7 @@ diopiError_t diopiMultinomial(diopiContextHandle_t ctx, diopiTensorHandle_t out, auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); at::Generator gen = impl::aten::buildGenerator(ctx, generator); - at::multinomial_out(atOut, atInput, num_samples, replacement, gen); + CALL_ATEN_CUDA_FUNC(multinomial_out, atOut, atInput, num_samples, replacement, gen); impl::aten::updateGeneratorHandleState(ctx, gen, generator); return diopiSuccess; @@ -4172,14 +4156,14 @@ diopiError_t diopiPolar(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopi auto atOut = impl::aten::buildATen(out); auto atAbs = impl::aten::buildATen(abs); auto atAngle = impl::aten::buildATen(angle); - at::polar_out(atOut, atAbs, atAngle); + CALL_ATEN_CUDA_FUNC(polar_out, atOut, atAbs, atAngle); return diopiSuccess; } DIOPI_API diopiError_t diopiCeilInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::ceil_(atInput); return diopiSuccess; @@ -4189,14 +4173,14 @@ DIOPI_API diopiError_t diopiCeil(diopiContextHandle_t ctx, diopiTensorHandle_t o impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); - at::ceil_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(ceil_out, atOut, atInput); return diopiSuccess; } DIOPI_API diopiError_t diopiAsinInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); + auto atInput = impl::aten::buildATen(input); at::asin_(atInput); return diopiSuccess; @@ -4206,7 +4190,7 @@ DIOPI_API diopiError_t diopiAsin(diopiContextHandle_t ctx, diopiTensorHandle_t o impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); - at::asin_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(asin_out, atOut, atInput); return diopiSuccess; } @@ -4218,7 +4202,7 @@ DIOPI_API diopiError_t diopiLerpTensor(diopiContextHandle_t ctx, diopiTensorHand auto atInput = impl::aten::buildATen(input); auto atEnd = impl::aten::buildATen(end); auto atWeight = impl::aten::buildATen(weight); - at::lerp_out(atOut, atInput, atEnd, atWeight); + CALL_ATEN_CUDA_FUNC(lerp_out, atOut, atInput, atEnd, atWeight); return diopiSuccess; } @@ -4229,8 +4213,8 @@ DIOPI_API diopiError_t diopiLerpScalar(diopiContextHandle_t ctx, diopiTensorHand auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); auto atEnd = impl::aten::buildATen(end); - at::Scalar atWeight = impl::aten::buildAtScalar(weight); - at::lerp_out(atOut, atInput, atEnd, atWeight); + auto atWeight = impl::aten::buildAtScalar(weight); + CALL_ATEN_CUDA_FUNC(lerp_out, atOut, atInput, atEnd, atWeight); return diopiSuccess; } @@ -4239,7 +4223,7 @@ DIOPI_API diopiError_t diopiTriu(diopiContextHandle_t ctx, diopiTensorHandle_t o impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); - at::triu_out(atOut, atInput, diagonal); + CALL_ATEN_CUDA_FUNC(triu_out, atOut, atInput, diagonal); return diopiSuccess; } @@ -4247,7 +4231,7 @@ DIOPI_API diopiError_t diopiTriu(diopiContextHandle_t ctx, diopiTensorHandle_t o DIOPI_API diopiError_t diopiTriuInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, int64_t diagonal) { impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); - at::triu_out(atInput, atInput, diagonal); + CALL_ATEN_CUDA_FUNC(triu_out, atInput, atInput, diagonal); return diopiSuccess; } @@ -4256,15 +4240,15 @@ DIOPI_API diopiError_t diopiSgn(diopiContextHandle_t ctx, diopiTensorHandle_t ou impl::aten::setCurStream(ctx); auto atOut = impl::aten::buildATen(out); auto atInput = impl::aten::buildATen(input); - at::sgn_out(atOut, atInput); + CALL_ATEN_CUDA_FUNC(sgn_out, atOut, atInput); return diopiSuccess; } DIOPI_API diopiError_t diopiSgnInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { impl::aten::setCurStream(ctx); - at::Tensor atInput = impl::aten::buildATen(input); - at::sgn_out(atInput, atInput); + auto atInput = impl::aten::buildATen(input); + CALL_ATEN_CUDA_FUNC(sgn_out, atInput, atInput); return diopiSuccess; } @@ -4273,7 +4257,8 @@ DIOPI_API diopiError_t diopiIsNan(diopiContextHandle_t ctx, diopiTensorHandle_t impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); auto atOut = impl::aten::buildATen(out); - at::isnan_out(atOut, atInput); + auto tempOut = CALL_ATEN_CUDA_FUNC(isnan, atInput); + at::native::copy_(atOut, tempOut, true); return diopiSuccess; } @@ -4284,7 +4269,7 @@ DIOPI_API diopiError_t diopiLinalgQR(diopiContextHandle_t ctx, diopiConstTensorH auto atQ = impl::aten::buildATen(Q); auto atR = impl::aten::buildATen(R); c10::string_view atMode(mode, strlen(mode)); - at::linalg_qr_out(atQ, atR, atA, mode); + CALL_ATEN_CUDA_FUNC(linalg_qr_out, atQ, atR, atA, mode); return diopiSuccess; } @@ -4294,7 +4279,7 @@ diopiError_t diopiAmax(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC at::IntArrayRef atDim = impl::aten::buildAtIntArray(dim); auto atOut = impl::aten::buildATen(out); auto atSelf = impl::aten::buildATen(self); - at::amax_out(atOut, atSelf, atDim, keepdim); + CALL_ATEN_CUDA_FUNC(amax_out, atOut, atSelf, atDim, keepdim); return diopiSuccess; } @@ -4307,7 +4292,10 @@ diopiError_t diopiBatchNormStats(diopiContextHandle_t ctx, diopiTensorHandle_t m if (atInput.scalar_type() == at::kHalf) { DIOPI_CHECK(atMean.scalar_type() == at::kFloat && atInvstd.scalar_type() == at::kFloat, "out dtype should follow the accumulated dtype in CUDA."); } - at::batch_norm_stats_out(atMean, atInvstd, atInput, eps); + // not supported cuda yet, will supported in subsequent release. + auto tempOut = CALL_ATEN_CUDA_FUNC(batch_norm_stats, atInput, eps); + at::native::copy_(atMean, std::get<0>(tempOut), true); + at::native::copy_(atInvstd, std::get<1>(tempOut), true); return diopiSuccess; } @@ -4325,7 +4313,10 @@ DIOPI_API diopiError_t diopiBatchNormGatherStatsWithCounts(diopiContextHandle_t auto atCounts = impl::aten::buildATen(counts); auto atMean = impl::aten::buildATen(mean); auto atInvstd = impl::aten::buildATen(invstd); - at::batch_norm_gather_stats_with_counts_out(atMean, atInvstd, atInput, atMean_all, atInvstd_all, atRunning_mean, atRunning_var, momentum, eps, atCounts); + auto tempOut = + CALL_ATEN_CUDA_FUNC(batch_norm_gather_stats_with_counts, atInput, atMean_all, atInvstd_all, atRunning_mean, atRunning_var, momentum, eps, atCounts); + at::native::copy_(atMean, std::get<0>(tempOut), true); + at::native::copy_(atInvstd, std::get<1>(tempOut), true); return diopiSuccess; } @@ -4372,7 +4363,8 @@ DIOPI_API diopiError_t diopiBatchNormBackwardElemt(diopiContextHandle_t ctx, dio auto atSumDyXmu = impl::aten::buildATen(sum_dy_xmu); auto atCount = impl::aten::buildATen(count); auto atGradInput = impl::aten::buildATen(grad_input); - at::batch_norm_backward_elemt_out(atGradInput, atGradOut, atInput, atMean, atInvstd, atWeight, atSumDy, atSumDyXmu, atCount); + auto tempOut = CALL_ATEN_CUDA_FUNC(batch_norm_backward_elemt, atGradOut, atInput, atMean, atInvstd, atWeight, atSumDy, atSumDyXmu, atCount); + at::native::copy_(atGradInput, tempOut, true); return diopiSuccess; } @@ -4386,7 +4378,7 @@ DIOPI_API diopiError_t diopiBatchNormElemt(diopiContextHandle_t ctx, diopiTensor auto atWeight = impl::aten::buildATen(weight); auto atBias = impl::aten::buildATen(bias); auto atOut = impl::aten::buildATen(out); - at::batch_norm_elemt_out(atOut, atInput, atWeight, atBias, atMean, atInvstd, eps); + CALL_ATEN_CUDA_FUNC(batch_norm_elemt_out, atOut, atInput, atWeight, atBias, atMean, atInvstd, eps); return diopiSuccess; } diff --git a/impl/torch/functions/functions_ext.cpp b/impl/torch/functions/functions_ext.cpp index 1920b08ddf..11715d8e6a 100644 --- a/impl/torch/functions/functions_ext.cpp +++ b/impl/torch/functions/functions_ext.cpp @@ -108,7 +108,7 @@ diopiError_t diopiMultiHeadAttention(diopiContextHandle_t ctx, diopiTensorHandle auto atQ = impl::aten::buildATen(q); auto atK = impl::aten::buildATen(k); auto atV = impl::aten::buildATen(v); - c10::optional optOut(impl::aten::buildATen(out)); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(optOut, out); auto atGen = buildGeneratorForMha(ctx, gen, dropout_p); auto headSize = atQ.sizes()[3]; @@ -146,9 +146,9 @@ diopiError_t diopiMultiHeadAttentionBackward(diopiContextHandle_t ctx, diopiCons auto atGradOut = impl::aten::buildATen(grad_out); auto atOut = impl::aten::buildATen(out); auto atLogsumexp = impl::aten::buildATen(softmax_lse); - c10::optional optGradQ(impl::aten::buildATen(grad_q)); - c10::optional optGradK(impl::aten::buildATen(grad_k)); - c10::optional optGradV(impl::aten::buildATen(grad_v)); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(optGradQ, grad_q); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(optGradK, grad_k); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(optGradV, grad_v); c10::optional nullOpt; // Workaround: flash_attn uses non-const optional& as args (which is a really bad idea) std::vector result = DIOPI_EXT_CALL_FLASH( @@ -166,7 +166,7 @@ diopiError_t diopiMultiHeadAttentionVarLen(diopiContextHandle_t ctx, diopiTensor auto atQ = impl::aten::buildATen(q); auto atK = impl::aten::buildATen(k); auto atV = impl::aten::buildATen(v); - c10::optional optOut(impl::aten::buildATen(out)); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(optOut, out); auto atCumSeqQ = impl::aten::buildATen(cum_seq_q); auto atCumSeqK = impl::aten::buildATen(cum_seq_k); auto atGen = buildGeneratorForMha(ctx, gen, dropout_p); diff --git a/impl/torch/helper.cpp b/impl/torch/helper.cpp index 2b180c5188..e7c40b3c80 100644 --- a/impl/torch/helper.cpp +++ b/impl/torch/helper.cpp @@ -5,7 +5,10 @@ */ #include "helper.hpp" +#include #include +#include +#include namespace impl { @@ -77,101 +80,6 @@ diopiDtype_t getDIOPITensorType(const at::Tensor& input) { } } -namespace { - -template -class BuildATenDeviceImpl {}; - -template <> -class BuildATenDeviceImpl { -public: - static void lazyInitDevice() {} - static at::Device device(diopiConstTensorHandle_t /*unused*/) { return {at::DeviceType::CPU}; } - static at::Tensor empty(at::IntArrayRef size, at::ScalarType dtype, at::Device /*unused*/) { - return at::detail::empty_cpu(size, dtype, /*pin_memory=*/false, /*memory_format_opt=*/c10::nullopt); - } -}; - -template <> -class BuildATenDeviceImpl { -public: - static void lazyInitDevice() { at::globalContext().lazyInitCUDA(); } - static at::Device device(diopiConstTensorHandle_t tensor) { - diopiDeviceIndex_t deviceIndex; - diopiGetTensorDeviceIndex(tensor, &deviceIndex); - return {at::DeviceType::CUDA, deviceIndex}; - } - static at::Tensor empty(at::IntArrayRef size, at::ScalarType dtype, at::Device device) { - return at::detail::empty_cuda(size, dtype, device, /*memory_format_opt=*/c10::nullopt); - } -}; - -template -at::Tensor buildATenImpl(diopiConstTensorHandle_t tensor) { - diopiSize_t shape; - diopiGetTensorShape(tensor, &shape); - at::IntArrayRef atSizes(shape.data, shape.len); - - diopiDtype_t dtype; - diopiGetTensorDtype(tensor, &dtype); - auto atTypeMeta = getATenType(dtype); - auto atDtype = atTypeMeta.toScalarType(); - - auto atDevice = DeviceImpl::device(tensor); - - // NOTE: storage offset has been handled in `diopiGetTensorData` - void* data = nullptr; - diopiGetTensorData(const_cast(tensor), &data); - - if (data == nullptr) { - return DeviceImpl::empty(atSizes, atDtype, atDevice); - } - - // NOTE: CUDA allocators may have not been initialized if we were using DIPU allocators. - // We have to do this explicitly for potential allocations in op workspaces. - DeviceImpl::lazyInitDevice(); - - // PERF: It would be faster if we can obtain and reuse the storage from tensor. - // However we cannot assume diopiTensorHandle_t to be a wrapper of at::Tensor. - // So we have to create a new storage (offset = 0) whose data_ptr points to - // the same address but with an empty dtor (to avoid double-free). - - diopiSize_t stride; - diopiGetTensorStride(tensor, &stride); - at::IntArrayRef atStrides(stride.data, stride.len); - - auto storageNBytes = at::detail::computeStorageNbytes(atSizes, atStrides, atTypeMeta.itemsize()); - - // NOTE: in this way, data_ptr will have an empty destructor - at::Storage storage{at::Storage::use_byte_size_t{}, storageNBytes, /*data_ptr=*/{data, atDevice}}; - - auto dk = at::computeDispatchKey(atDtype, /*layout=*/c10::nullopt, atDevice); - at::Tensor atTensor = at::detail::make_tensor(std::move(storage), dk, atTypeMeta); - atTensor.unsafeGetTensorImpl()->set_sizes_and_strides(atSizes, atStrides); - - return atTensor; -} - -} // namespace - -at::Tensor buildATen(diopiConstTensorHandle_t tensor) { - if (tensor == nullptr) { - return at::Tensor(); - } - - diopiDevice_t device; - diopiGetTensorDevice(tensor, &device); - switch (device) { - case diopi_host: - return buildATenImpl>(tensor); - case diopi_device: - return buildATenImpl>(tensor); - default: - TORCH_CHECK(false, "Invalid device type encountered in buildATen: ", device); - return {}; - } -} - at::Scalar buildAtScalar(const diopiScalar_t* scalar) { if (scalar == nullptr) { NOT_SUPPORTED("scalar is null ptr, we use temporarily zero"); diff --git a/impl/torch/helper.hpp b/impl/torch/helper.hpp index 8e9875ea70..afdfd9f298 100644 --- a/impl/torch/helper.hpp +++ b/impl/torch/helper.hpp @@ -13,10 +13,10 @@ #include #include -#include #include #include +#include "build_aten.hpp" // IWYU pragma: export #include "error.hpp" #include "impl_functions.hpp" @@ -96,8 +96,6 @@ inline c10::DeviceType getATenDevice(diopiDevice_t device) { return c10::DeviceType::CUDA; } -at::Tensor buildATen(diopiConstTensorHandle_t tensor); - inline bool isInt(const diopiScalar_t* scalar) { return scalar->stype <= 7; } inline bool isFloat(const diopiScalar_t* scalar) { return scalar->stype > 7; } @@ -108,15 +106,6 @@ inline at::IntArrayRef buildAtIntArray(const diopiSize_t* size) { return at::Int inline at::IntArrayRef buildAtIntArray(diopiSize_t size) { return at::IntArrayRef(size.data, size.len); } -template -inline decltype(auto) buildATenList(T* tensors, int64_t numTensors) { - std::vector vecAtTensor; - for (size_t i = 0; i < numTensors; ++i) { - vecAtTensor.emplace_back(buildATen(tensors[i])); - } - return vecAtTensor; -} - inline void updateATen2Tensor(diopiContextHandle_t ctx, const at::Tensor& atOut, diopiTensorHandle_t out) { if (out != nullptr) { at::Tensor atOutput = buildATen(out).reshape_as(atOut); @@ -147,7 +136,7 @@ inline void updateATen2Tensor(diopiContextHandle_t ctx, TupleT& atOuts, diopi_te UpdateTupleATen::update(ctx, atOuts, outs); } -inline void updateATen2Tensor(diopiContextHandle_t ctx, std::vector& atOuts, diopi_tensor_list& outs) { +inline void updateATen2Tensor(diopiContextHandle_t ctx, c10::ArrayRef& atOuts, diopi_tensor_list& outs) { for (size_t i = 0; i < atOuts.size(); ++i) { updateATen2Tensor(ctx, atOuts.at(i), outs.at(i)); }