From e16a3aee1f0be2ab08c49bb693440168d36dcbc0 Mon Sep 17 00:00:00 2001 From: lhy <442488254@qq.com> Date: Thu, 25 Jul 2024 16:58:56 +0800 Subject: [PATCH 1/7] add foreach_mul foreach_norm, foreach_unscale in cuda --- impl/torch/functions/functions.cpp | 52 ++++++++++++++++++++++++++++++ proto/include/diopi/functions.h | 37 ++++++++++++++++++++- 2 files changed, 88 insertions(+), 1 deletion(-) diff --git a/impl/torch/functions/functions.cpp b/impl/torch/functions/functions.cpp index 2a17e36424..3600608f1b 100644 --- a/impl/torch/functions/functions.cpp +++ b/impl/torch/functions/functions.cpp @@ -1165,6 +1165,15 @@ diopiError_t diopiAddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t inp return diopiSuccess; } +diopiError_t diopiForeachaddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) { + impl::aten::setCurStream(ctx); + DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize) + auto atOther = impl::aten::buildAtScalar(other); + CALL_ATEN_CUDA_FUNC(_foreach_add_,atSelf,atOther); + + return diopiSuccess; +} + diopiError_t diopiSub(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other, const diopiScalar_t* alpha) { impl::aten::setCurStream(ctx); @@ -1247,6 +1256,34 @@ diopiError_t diopiMulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t inp return diopiSuccess; } +diopiError_t diopiForeachmulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) { + impl::aten::setCurStream(ctx); + DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize) + auto atOther = impl::aten::buildAtScalar(other); + CALL_ATEN_CUDA_FUNC(_foreach_mul_,atSelf,atOther); + + return diopiSuccess; +} + +diopiError_t diopiForeachmulInpTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiConstTensorHandle_t other) { + impl::aten::setCurStream(ctx); + DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize) + auto atOther = impl::aten::buildATen(other); + CALL_ATEN_CUDA_FUNC(_foreach_mul_,atSelf,atOther); + + return diopiSuccess; +} + +diopiError_t diopiAmpForeachNonFiniteCheckAndUnscaleInp(diopiContextHandle_t ctx, diopiTensorHandle_t* scaled_grads, int64_t num_scaled_grads, diopiTensorHandle_t found_inf, diopiConstTensorHandle_t inv_scale) { + impl::aten::setCurStream(ctx); + DIOPI_IMPL_BUILD_ATEN_LIST(atScaledGrads, scaled_grads, num_scaled_grads) + auto atFoundInf = impl::aten::buildATen(found_inf); + auto atInvScale = impl::aten::buildATen(inv_scale); + CALL_ATEN_CUDA_FUNC(_amp_foreach_non_finite_check_and_unscale_, atScaledGrads, atFoundInf, atInvScale); + + return diopiSuccess; +} + diopiError_t diopiGe(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); @@ -3230,6 +3267,21 @@ diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC return diopiSuccess; } +diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* p) { + DIOPI_CHECK_PTR(out); + impl::aten::setCurStream(ctx); + DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize) + DIOPI_IMPL_BUILD_ATEN_LIST(atOut, out, selfSize) + auto atP = impl::aten::buildAtScalar(p); + auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_norm, atSelf, atP); + for (int i=0; i(out[i])) = tempOut[i]; + } + + return diopiSuccess; +} + diopiError_t diopiGroupNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t save_mean, diopiTensorHandle_t save_invstd, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, int64_t num_groups, double eps) { impl::aten::setCurStream(ctx); diff --git a/proto/include/diopi/functions.h b/proto/include/diopi/functions.h index 7c978453a8..4f3b3d1d1c 100644 --- a/proto/include/diopi/functions.h +++ b/proto/include/diopi/functions.h @@ -1106,6 +1106,14 @@ DIOPI_API diopiError_t diopiAddScalar(diopiContextHandle_t ctx, diopiTensorHandl */ DIOPI_API diopiError_t diopiAddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other, const diopiScalar_t* alpha); +/** + * @brief The in-place version of diopiForeachaddScalar. + * @param[in] ctx Context environment. + * @param[in] self the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. + * @param[in] selfSize the length of the input tensor list. type = [int64]. + * @param[in] other The scalar value to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8]. + */ +DIOPI_API diopiError_t diopiForeachaddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other); /** * @brief Perform subtraction operations between tensors. * @param[in] ctx Context environment. @@ -1180,6 +1188,24 @@ DIOPI_API diopiError_t diopiMulScalar(diopiContextHandle_t ctx, diopiTensorHandl */ DIOPI_API diopiError_t diopiMulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other); +/** + * @brief The in-place version of diopiForeachmulScalar. + * @param[in] ctx Context environment. + * @param[in] input the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. + * @param[in] inputSize the length of the input tensor list. type = [int64]. + * @param[in] other The scalar value to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8]. + */ +DIOPI_API diopiError_t diopiForeachmulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other); + +/** + * @brief The in-place version of diopiForeachmulScalar. + * @param[in] ctx Context environment. + * @param[in] input the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. + * @param[in] inputSize the length of the input tensor list. type = [int64]. + * @param[in] other The tensor to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8]. + */ +DIOPI_API diopiError_t diopiForeachmulInpTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiConstTensorHandle_t other); + /** * @brief Divides each element of input tensor by the corresponding element in other tensor. * @param[in] ctx Context environment. @@ -1188,7 +1214,8 @@ DIOPI_API diopiError_t diopiMulInpScalar(diopiContextHandle_t ctx, diopiTensorHa * @param[in] rounding_mode Rounding mode applied to the result, None: no rounding is performed, if both input and other are integer types, * the inputs are promoted to the default scalar type; trunc: truncate towards zero; floor: round down towards negative infinity for the result of the division. * @param[out] out the output tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. - */ +*/ + DIOPI_API diopiError_t diopiDiv(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other, diopiRoundMode_t rounding_mode); @@ -2888,6 +2915,14 @@ DIOPI_API diopiError_t diopiFlip(diopiContextHandle_t ctx, diopiTensorHandle_t o */ DIOPI_API diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* p, diopiSize_t dim); +/** + * @brief Returns the matrix norm or vector norm of a given tensor list. + * @param[in] ctx Context environment. + * @param[out] out the output tesnor list, type=[float32, float64, float16]. + * @param[in] input the input tesnor list, type=[float32, float64, float16]. + * @param[in] p an array, the order of norm. + */ +DIOPI_API diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* p); /** * \brief Applies Group Normalization over a mini-batch of inputs. * @param[in] ctx Context environment. From 5e822e40090609b4c4e71652a317c8ebb1555b59 Mon Sep 17 00:00:00 2001 From: lhy <442488254@qq.com> Date: Fri, 26 Jul 2024 18:07:44 +0800 Subject: [PATCH 2/7] add foreachadd for torch version 20000 --- impl/torch/functions/functions.cpp | 11 +++++++++++ proto/include/diopi/functions.h | 10 ++++++++++ 2 files changed, 21 insertions(+) diff --git a/impl/torch/functions/functions.cpp b/impl/torch/functions/functions.cpp index 3600608f1b..f9402bbcc0 100644 --- a/impl/torch/functions/functions.cpp +++ b/impl/torch/functions/functions.cpp @@ -1165,6 +1165,17 @@ diopiError_t diopiAddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t inp return diopiSuccess; } +diopiError_t diopiForeachaddScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) { + impl::aten::setCurStream(ctx); + DIOPI_IMPL_BUILD_ATEN_LIST(atOut, out, selfSize) + DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize) + for(int i=0; i Date: Mon, 29 Jul 2024 15:21:36 +0800 Subject: [PATCH 3/7] add foreach_mul.scalar foreach_mul.tensor for torch version 20000 --- impl/torch/functions/functions.cpp | 28 ++++++++++++++++++++++++++++ proto/include/diopi/functions.h | 19 +++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/impl/torch/functions/functions.cpp b/impl/torch/functions/functions.cpp index f9402bbcc0..c9be0e568d 100644 --- a/impl/torch/functions/functions.cpp +++ b/impl/torch/functions/functions.cpp @@ -1272,6 +1272,20 @@ diopiError_t diopiForeachmulInpScalar(diopiContextHandle_t ctx, diopiTensorHandl DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize) auto atOther = impl::aten::buildAtScalar(other); CALL_ATEN_CUDA_FUNC(_foreach_mul_,atSelf,atOther); + + return diopiSuccess; +} + +diopiError_t diopiForeachmulScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) { + DIOPI_CHECK_PTR(out); + impl::aten::setCurStream(ctx); + DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize) + DIOPI_IMPL_BUILD_ATEN_LIST(atOut, out, selfSize) + auto atOther = impl::aten::buildAtScalar(other); + auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_mul, atSelf, atOther); + for (int i=0; i(out[i])) = tempOut[i]; + } return diopiSuccess; } @@ -1285,6 +1299,20 @@ diopiError_t diopiForeachmulInpTensor(diopiContextHandle_t ctx, diopiTensorHandl return diopiSuccess; } +diopiError_t diopiForeachmulTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiConstTensorHandle_t other) { + DIOPI_CHECK_PTR(out); + impl::aten::setCurStream(ctx); + DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize) + DIOPI_IMPL_BUILD_ATEN_LIST(atOut, out, selfSize) + auto atOther = impl::aten::buildATen(other); + auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_mul, atSelf, atOther); + for (int i=0; i(out[i])) = tempOut[i]; + } + + return diopiSuccess; +} + diopiError_t diopiAmpForeachNonFiniteCheckAndUnscaleInp(diopiContextHandle_t ctx, diopiTensorHandle_t* scaled_grads, int64_t num_scaled_grads, diopiTensorHandle_t found_inf, diopiConstTensorHandle_t inv_scale) { impl::aten::setCurStream(ctx); DIOPI_IMPL_BUILD_ATEN_LIST(atScaledGrads, scaled_grads, num_scaled_grads) diff --git a/proto/include/diopi/functions.h b/proto/include/diopi/functions.h index e47c80aa26..e4ace539df 100644 --- a/proto/include/diopi/functions.h +++ b/proto/include/diopi/functions.h @@ -1198,6 +1198,16 @@ DIOPI_API diopiError_t diopiMulScalar(diopiContextHandle_t ctx, diopiTensorHandl */ DIOPI_API diopiError_t diopiMulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other); +/** + * @brief The diopiForeachmulScalar. + * @param[in] ctx Context environment. + * @param[out] out the output tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. + * @param[in] input the input tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. + * @param[in] inputSize the length of the input tensor list. type = [int64]. + * @param[in] other The scalar value to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8]. + */ +DIOPI_API diopiError_t diopiForeachmulScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other); + /** * @brief The in-place version of diopiForeachmulScalar. * @param[in] ctx Context environment. @@ -1207,6 +1217,15 @@ DIOPI_API diopiError_t diopiMulInpScalar(diopiContextHandle_t ctx, diopiTensorHa */ DIOPI_API diopiError_t diopiForeachmulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other); +/* + * @brief The in-place version of diopiForeachmulScalar. + * @param[in] ctx Context environment. + * @param[in] input the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. + * @param[in] inputSize the length of the input tensor list. type = [int64]. + * @param[in] other The tensor to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8]. + */ +DIOPI_API diopiError_t diopiForeachmulTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiConstTensorHandle_t other); + /** * @brief The in-place version of diopiForeachmulScalar. * @param[in] ctx Context environment. From 57266f377747018c5c48a03885a5ce1836312d74 Mon Sep 17 00:00:00 2001 From: lhy <442488254@qq.com> Date: Tue, 30 Jul 2024 11:58:04 +0800 Subject: [PATCH 4/7] fix clang-format and improve code style. --- impl/torch/functions/functions.cpp | 97 ++++++++++++++++-------------- proto/include/diopi/functions.h | 50 ++++++++------- 2 files changed, 80 insertions(+), 67 deletions(-) diff --git a/impl/torch/functions/functions.cpp b/impl/torch/functions/functions.cpp index c9be0e568d..cda0ed467d 100644 --- a/impl/torch/functions/functions.cpp +++ b/impl/torch/functions/functions.cpp @@ -1165,22 +1165,25 @@ diopiError_t diopiAddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t inp return diopiSuccess; } -diopiError_t diopiForeachaddScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) { +diopiError_t diopiForeachaddScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize, + const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - DIOPI_IMPL_BUILD_ATEN_LIST(atOut, out, selfSize) - DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize) - for(int i=0; i(outs[i])) = tempOut[i]; + } return diopiSuccess; } -diopiError_t diopiForeachaddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) { +diopiError_t diopiForeachaddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* inputs, int64_t inputSize, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize) + DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize) auto atOther = impl::aten::buildAtScalar(other); - CALL_ATEN_CUDA_FUNC(_foreach_add_,atSelf,atOther); + CALL_ATEN_CUDA_FUNC(_foreach_add_, atInputs, atOther); return diopiSuccess; } @@ -1267,60 +1270,63 @@ diopiError_t diopiMulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t inp return diopiSuccess; } -diopiError_t diopiForeachmulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) { +diopiError_t diopiForeachmulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* inputs, int64_t inputSize, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize) + DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize) auto atOther = impl::aten::buildAtScalar(other); - CALL_ATEN_CUDA_FUNC(_foreach_mul_,atSelf,atOther); - + CALL_ATEN_CUDA_FUNC(_foreach_mul_, atInputs, atOther); + return diopiSuccess; } -diopiError_t diopiForeachmulScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) { - DIOPI_CHECK_PTR(out); +diopiError_t diopiForeachmulScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize, + const diopiScalar_t* other) { + DIOPI_CHECK_PTR(outs); impl::aten::setCurStream(ctx); - DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize) - DIOPI_IMPL_BUILD_ATEN_LIST(atOut, out, selfSize) + DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize) + DIOPI_IMPL_BUILD_ATEN_LIST(atOuts, outs, inputSize) auto atOther = impl::aten::buildAtScalar(other); - auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_mul, atSelf, atOther); - for (int i=0; i(out[i])) = tempOut[i]; + auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_mul, atInputs, atOther); + for (int i = 0; i < inputSize; i++) { + *(reinterpret_cast(outs[i])) = tempOut[i]; } return diopiSuccess; } -diopiError_t diopiForeachmulInpTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiConstTensorHandle_t other) { +diopiError_t diopiForeachmulInpTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* inputs, int64_t inputSize, const diopiConstTensorHandle_t other) { impl::aten::setCurStream(ctx); - DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize) + DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize) auto atOther = impl::aten::buildATen(other); - CALL_ATEN_CUDA_FUNC(_foreach_mul_,atSelf,atOther); + CALL_ATEN_CUDA_FUNC(_foreach_mul_, atInputs, atOther); return diopiSuccess; } -diopiError_t diopiForeachmulTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiConstTensorHandle_t other) { - DIOPI_CHECK_PTR(out); +diopiError_t diopiForeachmulTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize, + const diopiConstTensorHandle_t other) { + DIOPI_CHECK_PTR(outs); impl::aten::setCurStream(ctx); - DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize) - DIOPI_IMPL_BUILD_ATEN_LIST(atOut, out, selfSize) + DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize) + DIOPI_IMPL_BUILD_ATEN_LIST(atOuts, outs, inputSize) auto atOther = impl::aten::buildATen(other); - auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_mul, atSelf, atOther); - for (int i=0; i(out[i])) = tempOut[i]; + auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_mul, atInputs, atOther); + for (int i = 0; i < inputSize; i++) { + *(reinterpret_cast(outs[i])) = tempOut[i]; } return diopiSuccess; } -diopiError_t diopiAmpForeachNonFiniteCheckAndUnscaleInp(diopiContextHandle_t ctx, diopiTensorHandle_t* scaled_grads, int64_t num_scaled_grads, diopiTensorHandle_t found_inf, diopiConstTensorHandle_t inv_scale) { - impl::aten::setCurStream(ctx); - DIOPI_IMPL_BUILD_ATEN_LIST(atScaledGrads, scaled_grads, num_scaled_grads) - auto atFoundInf = impl::aten::buildATen(found_inf); - auto atInvScale = impl::aten::buildATen(inv_scale); - CALL_ATEN_CUDA_FUNC(_amp_foreach_non_finite_check_and_unscale_, atScaledGrads, atFoundInf, atInvScale); +diopiError_t diopiAmpForeachNonFiniteCheckAndUnscaleInp(diopiContextHandle_t ctx, diopiTensorHandle_t* scaled_grads, int64_t num_scaled_grads, + diopiTensorHandle_t found_inf, diopiConstTensorHandle_t inv_scale) { + impl::aten::setCurStream(ctx); + DIOPI_IMPL_BUILD_ATEN_LIST(atScaledGrads, scaled_grads, num_scaled_grads) + auto atFoundInf = impl::aten::buildATen(found_inf); + auto atInvScale = impl::aten::buildATen(inv_scale); + CALL_ATEN_CUDA_FUNC(_amp_foreach_non_finite_check_and_unscale_, atScaledGrads, atFoundInf, atInvScale); - return diopiSuccess; + return diopiSuccess; } diopiError_t diopiGe(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) { @@ -3306,18 +3312,19 @@ diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC return diopiSuccess; } -diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* p) { - DIOPI_CHECK_PTR(out); +diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize, + const diopiScalar_t* p) { + DIOPI_CHECK_PTR(outs); impl::aten::setCurStream(ctx); - DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize) - DIOPI_IMPL_BUILD_ATEN_LIST(atOut, out, selfSize) + DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize) + DIOPI_IMPL_BUILD_ATEN_LIST(atOuts, outs, inputSize) auto atP = impl::aten::buildAtScalar(p); - auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_norm, atSelf, atP); - for (int i=0; i(out[i])) = tempOut[i]; + auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_norm, atInputs, atP); + for (int i = 0; i < inputSize; i++) { + // impl::aten::updateATen2Tensor(ctx, tempOut[i], out[i]); + *(reinterpret_cast(outs[i])) = tempOut[i]; } - + return diopiSuccess; } diff --git a/proto/include/diopi/functions.h b/proto/include/diopi/functions.h index e4ace539df..902a3fadba 100644 --- a/proto/include/diopi/functions.h +++ b/proto/include/diopi/functions.h @@ -1109,21 +1109,22 @@ DIOPI_API diopiError_t diopiAddInpScalar(diopiContextHandle_t ctx, diopiTensorHa /** * @brief The diopiForeachaddScalar. * @param[in] ctx Context environment. - * @param[out] out the output tensor list and will be store the result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. - * @param[in] self the input tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. - * @param[in] selfSize the length of the input tensor list. type = [int64]. + * @param[out] outs the output tensor list and will be store the result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. + * @param[in] inputs the input tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. + * @param[in] inputSize the length of the input tensor list. type = [int64]. * @param[in] other The scalar value to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8]. */ -DIOPI_API diopiError_t diopiForeachaddScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other); +DIOPI_API diopiError_t diopiForeachaddScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize, + const diopiScalar_t* other); /** * @brief The in-place version of diopiForeachaddScalar. * @param[in] ctx Context environment. - * @param[in] self the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. - * @param[in] selfSize the length of the input tensor list. type = [int64]. + * @param[in] input the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. + * @param[in] inputSize the length of the input tensor list. type = [int64]. * @param[in] other The scalar value to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8]. */ -DIOPI_API diopiError_t diopiForeachaddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other); +DIOPI_API diopiError_t diopiForeachaddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* inputs, int64_t inputSize, const diopiScalar_t* other); /** * @brief Perform subtraction operations between tensors. * @param[in] ctx Context environment. @@ -1201,39 +1202,42 @@ DIOPI_API diopiError_t diopiMulInpScalar(diopiContextHandle_t ctx, diopiTensorHa /** * @brief The diopiForeachmulScalar. * @param[in] ctx Context environment. - * @param[out] out the output tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. - * @param[in] input the input tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. + * @param[out] outs the output tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. + * @param[in] inputs the input tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. * @param[in] inputSize the length of the input tensor list. type = [int64]. * @param[in] other The scalar value to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8]. */ -DIOPI_API diopiError_t diopiForeachmulScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other); +DIOPI_API diopiError_t diopiForeachmulScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize, + const diopiScalar_t* other); /** * @brief The in-place version of diopiForeachmulScalar. * @param[in] ctx Context environment. - * @param[in] input the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. + * @param[in] inputs the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. * @param[in] inputSize the length of the input tensor list. type = [int64]. * @param[in] other The scalar value to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8]. */ -DIOPI_API diopiError_t diopiForeachmulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other); +DIOPI_API diopiError_t diopiForeachmulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* inputs, int64_t inputSize, const diopiScalar_t* other); /* - * @brief The in-place version of diopiForeachmulScalar. + * @brief The diopiForeachmulTensor * @param[in] ctx Context environment. - * @param[in] input the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. + * @param[in] puts the output tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. + * @param[in] inputs the input tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. * @param[in] inputSize the length of the input tensor list. type = [int64]. * @param[in] other The tensor to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8]. */ -DIOPI_API diopiError_t diopiForeachmulTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiConstTensorHandle_t other); +DIOPI_API diopiError_t diopiForeachmulTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize, + const diopiConstTensorHandle_t other); /** - * @brief The in-place version of diopiForeachmulScalar. + * @brief The in-place version of diopiForeachmulTensor. * @param[in] ctx Context environment. - * @param[in] input the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. + * @param[in] inputs the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. * @param[in] inputSize the length of the input tensor list. type = [int64]. * @param[in] other The tensor to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8]. */ -DIOPI_API diopiError_t diopiForeachmulInpTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiConstTensorHandle_t other); +DIOPI_API diopiError_t diopiForeachmulInpTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* inputs, int64_t inputSize, const diopiConstTensorHandle_t other); /** * @brief Divides each element of input tensor by the corresponding element in other tensor. @@ -1243,7 +1247,7 @@ DIOPI_API diopiError_t diopiForeachmulInpTensor(diopiContextHandle_t ctx, diopiT * @param[in] rounding_mode Rounding mode applied to the result, None: no rounding is performed, if both input and other are integer types, * the inputs are promoted to the default scalar type; trunc: truncate towards zero; floor: round down towards negative infinity for the result of the division. * @param[out] out the output tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool]. -*/ + */ DIOPI_API diopiError_t diopiDiv(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other, diopiRoundMode_t rounding_mode); @@ -2947,11 +2951,13 @@ DIOPI_API diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t o /** * @brief Returns the matrix norm or vector norm of a given tensor list. * @param[in] ctx Context environment. - * @param[out] out the output tesnor list, type=[float32, float64, float16]. - * @param[in] input the input tesnor list, type=[float32, float64, float16]. + * @param[out] outs the output tesnor list, type=[float32, float64, float16]. + * @param[in] inputs the input tesnor list, type=[float32, float64, float16]. + * @param[in] inputSize the input size * @param[in] p an array, the order of norm. */ -DIOPI_API diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* p); +DIOPI_API diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize, + const diopiScalar_t* p); /** * \brief Applies Group Normalization over a mini-batch of inputs. * @param[in] ctx Context environment. From 981ea99e89e277503e240a33852a655fbe93f498 Mon Sep 17 00:00:00 2001 From: lhy <442488254@qq.com> Date: Wed, 31 Jul 2024 16:30:23 +0800 Subject: [PATCH 5/7] add tests in diopi_tests --- diopi_test/python/configs/diopi_configs.py | 21 +++++++++ .../configs/model_config/generate_config.py | 3 +- .../python/conformance/diopi_functions.py | 43 +++++++++++++++++++ 3 files changed, 66 insertions(+), 1 deletion(-) diff --git a/diopi_test/python/configs/diopi_configs.py b/diopi_test/python/configs/diopi_configs.py index 2e268dd2c0..864bed22b6 100755 --- a/diopi_test/python/configs/diopi_configs.py +++ b/diopi_test/python/configs/diopi_configs.py @@ -4058,6 +4058,27 @@ # ], # ), # ), + + 'foreach_op': dict( + name=["_foreach_mul","_foreach_add"], + interface=["torch"], + para=dict( + scalar=[1.0, 5, 2.0, -1.2, 3, 10, 8, -0.5, 0, -2], + ), + tensor_para=dict( + args=[ + { + "ins": ["self"], + "shape": ((), (10,), (10, 2, 5), (20,), (10, 5, 1), (20, 3, 4, 5), (20, 2, 3, 4, 5), + (0,), (0, 10), (5, 0, 9)), + "gen_fn": 'Genfunc.randn', + "dtype": [np.float32, np.float16, np.float64], + "gen_policy": 'gen_tensor_list', + "gen_num_range": [1, 5] + }, + ], + ), + ), 'tril': dict( name=["tril"], diff --git a/diopi_test/python/configs/model_config/generate_config.py b/diopi_test/python/configs/model_config/generate_config.py index eed4409d5a..cf03619421 100644 --- a/diopi_test/python/configs/model_config/generate_config.py +++ b/diopi_test/python/configs/model_config/generate_config.py @@ -49,7 +49,8 @@ 'bitwise_or', 'sigmoid', 'erf', 'matmul', 'addcmul', 'std', 'arange', 'log2', 'sign', 'eq', 'nonzero', 'triangular_solve', 'ne', 'mul', 'linspace', 'index_fill', 'atan', 'le', 'sgn', - 'logical_and', 'permute', 'div', 'log10', 'roll', 'ge', 'lt', 'any'], + 'logical_and', 'permute', 'div', 'log10', 'roll', 'ge', 'lt', 'any', + '_foreach_add','_foreach_mul'], 'torch.nn.functional': ['conv2d', 'batch_norm'], 'torch.Tensor': ['fill_', 'repeat', 'unfold', 'copy_', 'expand'], 'CustomizedTest': ['linalgqr', 'adadelta', 'cast_np', 'batch_norm_elemt', diff --git a/diopi_test/python/conformance/diopi_functions.py b/diopi_test/python/conformance/diopi_functions.py index 1ad454b53c..87dce0f26a 100644 --- a/diopi_test/python/conformance/diopi_functions.py +++ b/diopi_test/python/conformance/diopi_functions.py @@ -1643,6 +1643,49 @@ def clip_grad_norm_(tensors, max_norm, norm_type=2.0, error_if_nonfinite=False): return out.value +def _foreach_add(self, scalar): + ctx = self[0].context() + num_tensors = len(self) + func = check_function("diopiForeachaddScalar") + input_tensors = list([TensorP(input) for input in self]) + out_tensorV = list([Tensor(self[i].size(),self[i].get_dtype()) for i in range(num_tensors)]) + out_tensors = list([TensorP(out_tensor) for out_tensor in out_tensorV]) + if isinstance(scalar, Tensor): + other = scalar + else: + other = Scalar(scalar) + ret = func( + ctx, + out_tensors, + input_tensors, + num_tensors, + other + ) + check_returncode(ret) + + return out_tensorV + +def _foreach_mul(self, scalar): + ctx = self[0].context() + num_tensors = len(self) + func = check_function("diopiForeachmulScalar") + input_tensors = list([TensorP(input) for input in self]) + out_tensorV = list([Tensor(self[i].size(),self[i].get_dtype()) for i in range(num_tensors)]) + out_tensors = list([TensorP(out_tensor) for out_tensor in out_tensorV]) + if isinstance(scalar, Tensor): + other = scalar + else: + other = Scalar(scalar) + ret = func( + ctx, + out_tensors, + input_tensors, + num_tensors, + other + ) + check_returncode(ret) + + return out_tensorV def batch_norm( input, From 880ddf4f3b2b017c1641acefdb8a26b1feca32ac Mon Sep 17 00:00:00 2001 From: lhy <442488254@qq.com> Date: Wed, 31 Jul 2024 19:13:44 +0800 Subject: [PATCH 6/7] use updateATen2Tensor in functions.cpp(this will slow down the perf) --- impl/torch/functions/functions.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/impl/torch/functions/functions.cpp b/impl/torch/functions/functions.cpp index cda0ed467d..afada5ceec 100644 --- a/impl/torch/functions/functions.cpp +++ b/impl/torch/functions/functions.cpp @@ -1168,12 +1168,11 @@ diopiError_t diopiAddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t inp diopiError_t diopiForeachaddScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize, const diopiScalar_t* other) { impl::aten::setCurStream(ctx); - DIOPI_IMPL_BUILD_ATEN_LIST(atOuts, outs, inputSize) DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize) auto atOther = impl::aten::buildAtScalar(other); auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_add, atInputs, atOther); for (int i = 0; i < inputSize; i++) { - *(reinterpret_cast(outs[i])) = tempOut[i]; + impl::aten::updateATen2Tensor(ctx, tempOut[i], outs[i]); } return diopiSuccess; @@ -1284,11 +1283,10 @@ diopiError_t diopiForeachmulScalar(diopiContextHandle_t ctx, diopiTensorHandle_t DIOPI_CHECK_PTR(outs); impl::aten::setCurStream(ctx); DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize) - DIOPI_IMPL_BUILD_ATEN_LIST(atOuts, outs, inputSize) auto atOther = impl::aten::buildAtScalar(other); auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_mul, atInputs, atOther); for (int i = 0; i < inputSize; i++) { - *(reinterpret_cast(outs[i])) = tempOut[i]; + impl::aten::updateATen2Tensor(ctx, tempOut[i], outs[i]); } return diopiSuccess; @@ -1312,7 +1310,7 @@ diopiError_t diopiForeachmulTensor(diopiContextHandle_t ctx, diopiTensorHandle_t auto atOther = impl::aten::buildATen(other); auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_mul, atInputs, atOther); for (int i = 0; i < inputSize; i++) { - *(reinterpret_cast(outs[i])) = tempOut[i]; + impl::aten::updateATen2Tensor(ctx, tempOut[i], outs[i]); } return diopiSuccess; @@ -3317,12 +3315,11 @@ diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_ DIOPI_CHECK_PTR(outs); impl::aten::setCurStream(ctx); DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize) - DIOPI_IMPL_BUILD_ATEN_LIST(atOuts, outs, inputSize) auto atP = impl::aten::buildAtScalar(p); auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_norm, atInputs, atP); for (int i = 0; i < inputSize; i++) { - // impl::aten::updateATen2Tensor(ctx, tempOut[i], out[i]); - *(reinterpret_cast(outs[i])) = tempOut[i]; + //WARN NO NEED TO COPY HERE, WE NEED FASTER UPDATE HERE + impl::aten::updateATen2Tensor(ctx, tempOut[i], outs[i]); } return diopiSuccess; From 009dfcfec6f72a6a0a9ec9b46fc9063c0d988a72 Mon Sep 17 00:00:00 2001 From: lhy <442488254@qq.com> Date: Wed, 31 Jul 2024 21:21:28 +0800 Subject: [PATCH 7/7] add tests for foreach_norm in diopi_tests --- diopi_test/python/configs/diopi_configs.py | 19 ++++++++++++++++++- .../configs/model_config/generate_config.py | 2 +- .../python/conformance/diopi_functions.py | 19 +++++++++++++++++++ impl/torch/functions/functions.cpp | 7 +++---- proto/include/diopi/functions.h | 2 +- 5 files changed, 42 insertions(+), 7 deletions(-) diff --git a/diopi_test/python/configs/diopi_configs.py b/diopi_test/python/configs/diopi_configs.py index 864bed22b6..9e8379d935 100755 --- a/diopi_test/python/configs/diopi_configs.py +++ b/diopi_test/python/configs/diopi_configs.py @@ -4059,7 +4059,7 @@ # ), # ), - 'foreach_op': dict( + 'pointwise_binary_foreach_op': dict( name=["_foreach_mul","_foreach_add"], interface=["torch"], para=dict( @@ -4079,6 +4079,23 @@ ], ), ), + + 'foreach_norm': dict( + name=['_foreach_norm'], + interface=['torch'], + tensor_para=dict( + args=[ + { + "ins": ["self"], + "shape": ((256, 512, 1, 1),(8, 1, 4),(256, 64, 1, 1),(10, 1, 4),(256, 128, 1, 1),(16, 1, 4),(256, 256, 1, 1),(3, 1, 4)), + "dtype": [np.float32, np.float64, np.float16], + "gen_fn": 'Genfunc.randn', + "gen_policy": 'gen_tensor_list', + "gen_num_range": [1, 5] + }, + ], + ), + ), 'tril': dict( name=["tril"], diff --git a/diopi_test/python/configs/model_config/generate_config.py b/diopi_test/python/configs/model_config/generate_config.py index cf03619421..e2030e8a4e 100644 --- a/diopi_test/python/configs/model_config/generate_config.py +++ b/diopi_test/python/configs/model_config/generate_config.py @@ -50,7 +50,7 @@ 'arange', 'log2', 'sign', 'eq', 'nonzero', 'triangular_solve', 'ne', 'mul', 'linspace', 'index_fill', 'atan', 'le', 'sgn', 'logical_and', 'permute', 'div', 'log10', 'roll', 'ge', 'lt', 'any', - '_foreach_add','_foreach_mul'], + '_foreach_add', '_foreach_mul', '_foreach_norm'], 'torch.nn.functional': ['conv2d', 'batch_norm'], 'torch.Tensor': ['fill_', 'repeat', 'unfold', 'copy_', 'expand'], 'CustomizedTest': ['linalgqr', 'adadelta', 'cast_np', 'batch_norm_elemt', diff --git a/diopi_test/python/conformance/diopi_functions.py b/diopi_test/python/conformance/diopi_functions.py index 87dce0f26a..0e2c0d6ca1 100644 --- a/diopi_test/python/conformance/diopi_functions.py +++ b/diopi_test/python/conformance/diopi_functions.py @@ -1687,6 +1687,25 @@ def _foreach_mul(self, scalar): return out_tensorV +def _foreach_norm(self): + ctx = self[0].context() + num_tensors = len(self) + func = check_function("diopiForeachnormScalar") + input_tensors = list([TensorP(input) for input in self]) + out_tensorV = list([Tensor([],self[i].get_dtype()) for i in range(num_tensors)]) + out_tensors = list([TensorP(out_tensor) for out_tensor in out_tensorV]) + other = Scalar(2) + ret = func( + ctx, + out_tensors, + input_tensors, + num_tensors, + other + ) + check_returncode(ret) + + return out_tensorV + def batch_norm( input, running_mean, diff --git a/impl/torch/functions/functions.cpp b/impl/torch/functions/functions.cpp index afada5ceec..8c25376053 100644 --- a/impl/torch/functions/functions.cpp +++ b/impl/torch/functions/functions.cpp @@ -3311,14 +3311,13 @@ diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC } diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize, - const diopiScalar_t* p) { + const diopiScalar_t* ord) { DIOPI_CHECK_PTR(outs); impl::aten::setCurStream(ctx); DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize) - auto atP = impl::aten::buildAtScalar(p); - auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_norm, atInputs, atP); + auto atOrd = impl::aten::buildAtScalar(ord); + auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_norm, atInputs, atOrd); for (int i = 0; i < inputSize; i++) { - //WARN NO NEED TO COPY HERE, WE NEED FASTER UPDATE HERE impl::aten::updateATen2Tensor(ctx, tempOut[i], outs[i]); } diff --git a/proto/include/diopi/functions.h b/proto/include/diopi/functions.h index 902a3fadba..891f46f305 100644 --- a/proto/include/diopi/functions.h +++ b/proto/include/diopi/functions.h @@ -2957,7 +2957,7 @@ DIOPI_API diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t o * @param[in] p an array, the order of norm. */ DIOPI_API diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize, - const diopiScalar_t* p); + const diopiScalar_t* ord); /** * \brief Applies Group Normalization over a mini-batch of inputs. * @param[in] ctx Context environment.