From e16a3aee1f0be2ab08c49bb693440168d36dcbc0 Mon Sep 17 00:00:00 2001
From: lhy <442488254@qq.com>
Date: Thu, 25 Jul 2024 16:58:56 +0800
Subject: [PATCH 1/7] add foreach_mul foreach_norm, foreach_unscale in cuda

---
 impl/torch/functions/functions.cpp | 52 ++++++++++++++++++++++++++++++
 proto/include/diopi/functions.h    | 37 ++++++++++++++++++++-
 2 files changed, 88 insertions(+), 1 deletion(-)

diff --git a/impl/torch/functions/functions.cpp b/impl/torch/functions/functions.cpp
index 2a17e36424..3600608f1b 100644
--- a/impl/torch/functions/functions.cpp
+++ b/impl/torch/functions/functions.cpp
@@ -1165,6 +1165,15 @@ diopiError_t diopiAddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t inp
     return diopiSuccess;
 }
 
+diopiError_t diopiForeachaddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) {
+    impl::aten::setCurStream(ctx);
+    DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize)
+    auto atOther = impl::aten::buildAtScalar(other);
+    CALL_ATEN_CUDA_FUNC(_foreach_add_,atSelf,atOther);
+
+    return diopiSuccess;
+}
+
 diopiError_t diopiSub(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other,
                       const diopiScalar_t* alpha) {
     impl::aten::setCurStream(ctx);
@@ -1247,6 +1256,34 @@ diopiError_t diopiMulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t inp
     return diopiSuccess;
 }
 
+diopiError_t diopiForeachmulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) {
+    impl::aten::setCurStream(ctx);
+    DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize)
+    auto atOther = impl::aten::buildAtScalar(other);
+    CALL_ATEN_CUDA_FUNC(_foreach_mul_,atSelf,atOther);
+
+    return diopiSuccess;
+}
+
+diopiError_t diopiForeachmulInpTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiConstTensorHandle_t other) {
+    impl::aten::setCurStream(ctx);
+    DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize)
+    auto atOther = impl::aten::buildATen(other);
+    CALL_ATEN_CUDA_FUNC(_foreach_mul_,atSelf,atOther);
+
+    return diopiSuccess;
+}
+
+diopiError_t diopiAmpForeachNonFiniteCheckAndUnscaleInp(diopiContextHandle_t ctx, diopiTensorHandle_t* scaled_grads, int64_t num_scaled_grads, diopiTensorHandle_t found_inf, diopiConstTensorHandle_t inv_scale) {
+   impl::aten::setCurStream(ctx);
+   DIOPI_IMPL_BUILD_ATEN_LIST(atScaledGrads, scaled_grads, num_scaled_grads)
+   auto atFoundInf = impl::aten::buildATen(found_inf);
+   auto atInvScale = impl::aten::buildATen(inv_scale);
+   CALL_ATEN_CUDA_FUNC(_amp_foreach_non_finite_check_and_unscale_, atScaledGrads, atFoundInf, atInvScale);
+
+   return diopiSuccess;
+}
+
 diopiError_t diopiGe(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) {
     impl::aten::setCurStream(ctx);
     auto atInput = impl::aten::buildATen(input);
@@ -3230,6 +3267,21 @@ diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC
     return diopiSuccess;
 }
 
+diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* p) {
+    DIOPI_CHECK_PTR(out);
+    impl::aten::setCurStream(ctx);
+    DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize)
+    DIOPI_IMPL_BUILD_ATEN_LIST(atOut, out, selfSize)
+    auto atP = impl::aten::buildAtScalar(p);
+    auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_norm, atSelf, atP);
+    for (int i=0; i<selfSize; i++){
+        //impl::aten::updateATen2Tensor(ctx, tempOut[i], out[i]);
+        *(reinterpret_cast<at::Tensor*>(out[i])) = tempOut[i];
+    }
+    
+    return diopiSuccess;
+}
+
 diopiError_t diopiGroupNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t save_mean, diopiTensorHandle_t save_invstd,
                             diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, int64_t num_groups, double eps) {
     impl::aten::setCurStream(ctx);
diff --git a/proto/include/diopi/functions.h b/proto/include/diopi/functions.h
index 7c978453a8..4f3b3d1d1c 100644
--- a/proto/include/diopi/functions.h
+++ b/proto/include/diopi/functions.h
@@ -1106,6 +1106,14 @@ DIOPI_API diopiError_t diopiAddScalar(diopiContextHandle_t ctx, diopiTensorHandl
  */
 DIOPI_API diopiError_t diopiAddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other, const diopiScalar_t* alpha);
 
+/**
+ * @brief The in-place version of diopiForeachaddScalar.
+ * @param[in] ctx Context environment.
+ * @param[in] self the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[in] selfSize the length of the input tensor list. type = [int64].
+ * @param[in] other The scalar value to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8].
+ */
+DIOPI_API diopiError_t diopiForeachaddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other);
 /**
  * @brief  Perform subtraction operations between tensors.
  * @param[in] ctx Context environment.
@@ -1180,6 +1188,24 @@ DIOPI_API diopiError_t diopiMulScalar(diopiContextHandle_t ctx, diopiTensorHandl
  */
 DIOPI_API diopiError_t diopiMulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other);
 
+/**
+ * @brief The in-place version of diopiForeachmulScalar.
+ * @param[in] ctx Context environment.
+ * @param[in] input the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[in] inputSize the length of the input tensor list. type = [int64].
+ * @param[in] other The scalar value to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8].
+ */
+DIOPI_API diopiError_t diopiForeachmulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other);
+
+/**
+ * @brief The in-place version of diopiForeachmulScalar.
+ * @param[in] ctx Context environment.
+ * @param[in] input the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[in] inputSize the length of the input tensor list. type = [int64].
+ * @param[in] other The tensor to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8].
+ */
+DIOPI_API diopiError_t diopiForeachmulInpTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiConstTensorHandle_t other);
+
 /**
  * @brief Divides each element of input tensor by the corresponding element in other tensor.
  * @param[in] ctx Context environment.
@@ -1188,7 +1214,8 @@ DIOPI_API diopiError_t diopiMulInpScalar(diopiContextHandle_t ctx, diopiTensorHa
  * @param[in] rounding_mode Rounding mode applied to the result, None: no rounding is performed, if both input and other are integer types,
  * the inputs are promoted to the default scalar type; trunc: truncate towards zero; floor: round down towards negative infinity for the result of the division.
  * @param[out] out the output tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
- */
+*/
+
 DIOPI_API diopiError_t diopiDiv(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other,
                                 diopiRoundMode_t rounding_mode);
 
@@ -2888,6 +2915,14 @@ DIOPI_API diopiError_t diopiFlip(diopiContextHandle_t ctx, diopiTensorHandle_t o
  */
 DIOPI_API diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* p, diopiSize_t dim);
 
+/**
+ * @brief Returns the matrix norm or vector norm of a given tensor list.
+ * @param[in] ctx Context environment.
+ * @param[out] out the output tesnor list, type=[float32, float64, float16].
+ * @param[in] input the input tesnor list, type=[float32, float64, float16].
+ * @param[in] p an array, the order of norm.
+ */
+DIOPI_API diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* p);
 /**
  * \brief Applies Group Normalization over a mini-batch of inputs.
  * @param[in] ctx Context environment.

From 5e822e40090609b4c4e71652a317c8ebb1555b59 Mon Sep 17 00:00:00 2001
From: lhy <442488254@qq.com>
Date: Fri, 26 Jul 2024 18:07:44 +0800
Subject: [PATCH 2/7] add foreachadd for torch version 20000

---
 impl/torch/functions/functions.cpp | 11 +++++++++++
 proto/include/diopi/functions.h    | 10 ++++++++++
 2 files changed, 21 insertions(+)

diff --git a/impl/torch/functions/functions.cpp b/impl/torch/functions/functions.cpp
index 3600608f1b..f9402bbcc0 100644
--- a/impl/torch/functions/functions.cpp
+++ b/impl/torch/functions/functions.cpp
@@ -1165,6 +1165,17 @@ diopiError_t diopiAddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t inp
     return diopiSuccess;
 }
 
+diopiError_t diopiForeachaddScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) {
+    impl::aten::setCurStream(ctx);
+    DIOPI_IMPL_BUILD_ATEN_LIST(atOut, out, selfSize)
+    DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize)
+    for(int i=0; i<selfSize; i++) at::native::copy_(atOut[i], atSelf[i], true);
+    auto atOther = impl::aten::buildAtScalar(other);
+    CALL_ATEN_CUDA_FUNC(_foreach_add_,atOut,atOther);
+
+    return diopiSuccess;
+}
+
 diopiError_t diopiForeachaddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) {
     impl::aten::setCurStream(ctx);
     DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize)
diff --git a/proto/include/diopi/functions.h b/proto/include/diopi/functions.h
index 4f3b3d1d1c..e47c80aa26 100644
--- a/proto/include/diopi/functions.h
+++ b/proto/include/diopi/functions.h
@@ -1106,6 +1106,16 @@ DIOPI_API diopiError_t diopiAddScalar(diopiContextHandle_t ctx, diopiTensorHandl
  */
 DIOPI_API diopiError_t diopiAddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other, const diopiScalar_t* alpha);
 
+/**
+ * @brief The diopiForeachaddScalar.
+ * @param[in] ctx Context environment.
+ * @param[out] out the output tensor list and will be store the result tensor.  type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[in] self the input tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[in] selfSize the length of the input tensor list. type = [int64].
+ * @param[in] other The scalar value to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8].
+ */
+DIOPI_API diopiError_t diopiForeachaddScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other);
+
 /**
  * @brief The in-place version of diopiForeachaddScalar.
  * @param[in] ctx Context environment.

From fcc465f9d361bc466d95313f425598e3c5d11c3f Mon Sep 17 00:00:00 2001
From: lhy <442488254@qq.com>
Date: Mon, 29 Jul 2024 15:21:36 +0800
Subject: [PATCH 3/7] add foreach_mul.scalar foreach_mul.tensor for torch
 version 20000

---
 impl/torch/functions/functions.cpp | 28 ++++++++++++++++++++++++++++
 proto/include/diopi/functions.h    | 19 +++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/impl/torch/functions/functions.cpp b/impl/torch/functions/functions.cpp
index f9402bbcc0..c9be0e568d 100644
--- a/impl/torch/functions/functions.cpp
+++ b/impl/torch/functions/functions.cpp
@@ -1272,6 +1272,20 @@ diopiError_t diopiForeachmulInpScalar(diopiContextHandle_t ctx, diopiTensorHandl
     DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize)
     auto atOther = impl::aten::buildAtScalar(other);
     CALL_ATEN_CUDA_FUNC(_foreach_mul_,atSelf,atOther);
+    
+    return diopiSuccess;
+}
+
+diopiError_t diopiForeachmulScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) {
+    DIOPI_CHECK_PTR(out);
+    impl::aten::setCurStream(ctx);
+    DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize)
+    DIOPI_IMPL_BUILD_ATEN_LIST(atOut, out, selfSize)
+    auto atOther = impl::aten::buildAtScalar(other);
+    auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_mul, atSelf, atOther);
+    for (int i=0; i<selfSize; i++){
+        *(reinterpret_cast<at::Tensor*>(out[i])) = tempOut[i];
+    }
 
     return diopiSuccess;
 }
@@ -1285,6 +1299,20 @@ diopiError_t diopiForeachmulInpTensor(diopiContextHandle_t ctx, diopiTensorHandl
     return diopiSuccess;
 }
 
+diopiError_t diopiForeachmulTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiConstTensorHandle_t other) {
+    DIOPI_CHECK_PTR(out);
+    impl::aten::setCurStream(ctx);
+    DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize)
+    DIOPI_IMPL_BUILD_ATEN_LIST(atOut, out, selfSize)
+    auto atOther = impl::aten::buildATen(other);
+    auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_mul, atSelf, atOther);
+    for (int i=0; i<selfSize; i++){
+        *(reinterpret_cast<at::Tensor*>(out[i])) = tempOut[i];
+    }
+
+    return diopiSuccess;
+}
+
 diopiError_t diopiAmpForeachNonFiniteCheckAndUnscaleInp(diopiContextHandle_t ctx, diopiTensorHandle_t* scaled_grads, int64_t num_scaled_grads, diopiTensorHandle_t found_inf, diopiConstTensorHandle_t inv_scale) {
    impl::aten::setCurStream(ctx);
    DIOPI_IMPL_BUILD_ATEN_LIST(atScaledGrads, scaled_grads, num_scaled_grads)
diff --git a/proto/include/diopi/functions.h b/proto/include/diopi/functions.h
index e47c80aa26..e4ace539df 100644
--- a/proto/include/diopi/functions.h
+++ b/proto/include/diopi/functions.h
@@ -1198,6 +1198,16 @@ DIOPI_API diopiError_t diopiMulScalar(diopiContextHandle_t ctx, diopiTensorHandl
  */
 DIOPI_API diopiError_t diopiMulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t input, const diopiScalar_t* other);
 
+/**
+ * @brief The diopiForeachmulScalar.
+ * @param[in] ctx Context environment.
+ * @param[out] out the output tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[in] input the input tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[in] inputSize the length of the input tensor list. type = [int64].
+ * @param[in] other The scalar value to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8].
+ */
+DIOPI_API diopiError_t diopiForeachmulScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other);
+
 /**
  * @brief The in-place version of diopiForeachmulScalar.
  * @param[in] ctx Context environment.
@@ -1207,6 +1217,15 @@ DIOPI_API diopiError_t diopiMulInpScalar(diopiContextHandle_t ctx, diopiTensorHa
  */
 DIOPI_API diopiError_t diopiForeachmulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other);
 
+/*
+ * @brief The in-place version of diopiForeachmulScalar.
+ * @param[in] ctx Context environment.
+ * @param[in] input the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[in] inputSize the length of the input tensor list. type = [int64].
+ * @param[in] other The tensor to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8].
+ */
+DIOPI_API diopiError_t diopiForeachmulTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiConstTensorHandle_t other);
+
 /**
  * @brief The in-place version of diopiForeachmulScalar.
  * @param[in] ctx Context environment.

From 57266f377747018c5c48a03885a5ce1836312d74 Mon Sep 17 00:00:00 2001
From: lhy <442488254@qq.com>
Date: Tue, 30 Jul 2024 11:58:04 +0800
Subject: [PATCH 4/7] fix clang-format and improve code style.

---
 impl/torch/functions/functions.cpp | 97 ++++++++++++++++--------------
 proto/include/diopi/functions.h    | 50 ++++++++-------
 2 files changed, 80 insertions(+), 67 deletions(-)

diff --git a/impl/torch/functions/functions.cpp b/impl/torch/functions/functions.cpp
index c9be0e568d..cda0ed467d 100644
--- a/impl/torch/functions/functions.cpp
+++ b/impl/torch/functions/functions.cpp
@@ -1165,22 +1165,25 @@ diopiError_t diopiAddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t inp
     return diopiSuccess;
 }
 
-diopiError_t diopiForeachaddScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) {
+diopiError_t diopiForeachaddScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize,
+                                   const diopiScalar_t* other) {
     impl::aten::setCurStream(ctx);
-    DIOPI_IMPL_BUILD_ATEN_LIST(atOut, out, selfSize)
-    DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize)
-    for(int i=0; i<selfSize; i++) at::native::copy_(atOut[i], atSelf[i], true);
+    DIOPI_IMPL_BUILD_ATEN_LIST(atOuts, outs, inputSize)
+    DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize)
     auto atOther = impl::aten::buildAtScalar(other);
-    CALL_ATEN_CUDA_FUNC(_foreach_add_,atOut,atOther);
+    auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_add, atInputs, atOther);
+    for (int i = 0; i < inputSize; i++) {
+        *(reinterpret_cast<at::Tensor*>(outs[i])) = tempOut[i];
+    }
 
     return diopiSuccess;
 }
 
-diopiError_t diopiForeachaddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) {
+diopiError_t diopiForeachaddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* inputs, int64_t inputSize, const diopiScalar_t* other) {
     impl::aten::setCurStream(ctx);
-    DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize)
+    DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize)
     auto atOther = impl::aten::buildAtScalar(other);
-    CALL_ATEN_CUDA_FUNC(_foreach_add_,atSelf,atOther);
+    CALL_ATEN_CUDA_FUNC(_foreach_add_, atInputs, atOther);
 
     return diopiSuccess;
 }
@@ -1267,60 +1270,63 @@ diopiError_t diopiMulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t inp
     return diopiSuccess;
 }
 
-diopiError_t diopiForeachmulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) {
+diopiError_t diopiForeachmulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* inputs, int64_t inputSize, const diopiScalar_t* other) {
     impl::aten::setCurStream(ctx);
-    DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize)
+    DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize)
     auto atOther = impl::aten::buildAtScalar(other);
-    CALL_ATEN_CUDA_FUNC(_foreach_mul_,atSelf,atOther);
-    
+    CALL_ATEN_CUDA_FUNC(_foreach_mul_, atInputs, atOther);
+
     return diopiSuccess;
 }
 
-diopiError_t diopiForeachmulScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other) {
-    DIOPI_CHECK_PTR(out);
+diopiError_t diopiForeachmulScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize,
+                                   const diopiScalar_t* other) {
+    DIOPI_CHECK_PTR(outs);
     impl::aten::setCurStream(ctx);
-    DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize)
-    DIOPI_IMPL_BUILD_ATEN_LIST(atOut, out, selfSize)
+    DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize)
+    DIOPI_IMPL_BUILD_ATEN_LIST(atOuts, outs, inputSize)
     auto atOther = impl::aten::buildAtScalar(other);
-    auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_mul, atSelf, atOther);
-    for (int i=0; i<selfSize; i++){
-        *(reinterpret_cast<at::Tensor*>(out[i])) = tempOut[i];
+    auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_mul, atInputs, atOther);
+    for (int i = 0; i < inputSize; i++) {
+        *(reinterpret_cast<at::Tensor*>(outs[i])) = tempOut[i];
     }
 
     return diopiSuccess;
 }
 
-diopiError_t diopiForeachmulInpTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiConstTensorHandle_t other) {
+diopiError_t diopiForeachmulInpTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* inputs, int64_t inputSize, const diopiConstTensorHandle_t other) {
     impl::aten::setCurStream(ctx);
-    DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize)
+    DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize)
     auto atOther = impl::aten::buildATen(other);
-    CALL_ATEN_CUDA_FUNC(_foreach_mul_,atSelf,atOther);
+    CALL_ATEN_CUDA_FUNC(_foreach_mul_, atInputs, atOther);
 
     return diopiSuccess;
 }
 
-diopiError_t diopiForeachmulTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiConstTensorHandle_t other) {
-    DIOPI_CHECK_PTR(out);
+diopiError_t diopiForeachmulTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize,
+                                   const diopiConstTensorHandle_t other) {
+    DIOPI_CHECK_PTR(outs);
     impl::aten::setCurStream(ctx);
-    DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize)
-    DIOPI_IMPL_BUILD_ATEN_LIST(atOut, out, selfSize)
+    DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize)
+    DIOPI_IMPL_BUILD_ATEN_LIST(atOuts, outs, inputSize)
     auto atOther = impl::aten::buildATen(other);
-    auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_mul, atSelf, atOther);
-    for (int i=0; i<selfSize; i++){
-        *(reinterpret_cast<at::Tensor*>(out[i])) = tempOut[i];
+    auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_mul, atInputs, atOther);
+    for (int i = 0; i < inputSize; i++) {
+        *(reinterpret_cast<at::Tensor*>(outs[i])) = tempOut[i];
     }
 
     return diopiSuccess;
 }
 
-diopiError_t diopiAmpForeachNonFiniteCheckAndUnscaleInp(diopiContextHandle_t ctx, diopiTensorHandle_t* scaled_grads, int64_t num_scaled_grads, diopiTensorHandle_t found_inf, diopiConstTensorHandle_t inv_scale) {
-   impl::aten::setCurStream(ctx);
-   DIOPI_IMPL_BUILD_ATEN_LIST(atScaledGrads, scaled_grads, num_scaled_grads)
-   auto atFoundInf = impl::aten::buildATen(found_inf);
-   auto atInvScale = impl::aten::buildATen(inv_scale);
-   CALL_ATEN_CUDA_FUNC(_amp_foreach_non_finite_check_and_unscale_, atScaledGrads, atFoundInf, atInvScale);
+diopiError_t diopiAmpForeachNonFiniteCheckAndUnscaleInp(diopiContextHandle_t ctx, diopiTensorHandle_t* scaled_grads, int64_t num_scaled_grads,
+                                                        diopiTensorHandle_t found_inf, diopiConstTensorHandle_t inv_scale) {
+    impl::aten::setCurStream(ctx);
+    DIOPI_IMPL_BUILD_ATEN_LIST(atScaledGrads, scaled_grads, num_scaled_grads)
+    auto atFoundInf = impl::aten::buildATen(found_inf);
+    auto atInvScale = impl::aten::buildATen(inv_scale);
+    CALL_ATEN_CUDA_FUNC(_amp_foreach_non_finite_check_and_unscale_, atScaledGrads, atFoundInf, atInvScale);
 
-   return diopiSuccess;
+    return diopiSuccess;
 }
 
 diopiError_t diopiGe(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other) {
@@ -3306,18 +3312,19 @@ diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC
     return diopiSuccess;
 }
 
-diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* p) {
-    DIOPI_CHECK_PTR(out);
+diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize,
+                                    const diopiScalar_t* p) {
+    DIOPI_CHECK_PTR(outs);
     impl::aten::setCurStream(ctx);
-    DIOPI_IMPL_BUILD_ATEN_LIST(atSelf, self, selfSize)
-    DIOPI_IMPL_BUILD_ATEN_LIST(atOut, out, selfSize)
+    DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize)
+    DIOPI_IMPL_BUILD_ATEN_LIST(atOuts, outs, inputSize)
     auto atP = impl::aten::buildAtScalar(p);
-    auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_norm, atSelf, atP);
-    for (int i=0; i<selfSize; i++){
-        //impl::aten::updateATen2Tensor(ctx, tempOut[i], out[i]);
-        *(reinterpret_cast<at::Tensor*>(out[i])) = tempOut[i];
+    auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_norm, atInputs, atP);
+    for (int i = 0; i < inputSize; i++) {
+        // impl::aten::updateATen2Tensor(ctx, tempOut[i], out[i]);
+        *(reinterpret_cast<at::Tensor*>(outs[i])) = tempOut[i];
     }
-    
+
     return diopiSuccess;
 }
 
diff --git a/proto/include/diopi/functions.h b/proto/include/diopi/functions.h
index e4ace539df..902a3fadba 100644
--- a/proto/include/diopi/functions.h
+++ b/proto/include/diopi/functions.h
@@ -1109,21 +1109,22 @@ DIOPI_API diopiError_t diopiAddInpScalar(diopiContextHandle_t ctx, diopiTensorHa
 /**
  * @brief The diopiForeachaddScalar.
  * @param[in] ctx Context environment.
- * @param[out] out the output tensor list and will be store the result tensor.  type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
- * @param[in] self the input tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
- * @param[in] selfSize the length of the input tensor list. type = [int64].
+ * @param[out] outs the output tensor list and will be store the result tensor.  type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[in] inputs the input tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[in] inputSize the length of the input tensor list. type = [int64].
  * @param[in] other The scalar value to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8].
  */
-DIOPI_API diopiError_t diopiForeachaddScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other);
+DIOPI_API diopiError_t diopiForeachaddScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize,
+                                             const diopiScalar_t* other);
 
 /**
  * @brief The in-place version of diopiForeachaddScalar.
  * @param[in] ctx Context environment.
- * @param[in] self the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
- * @param[in] selfSize the length of the input tensor list. type = [int64].
+ * @param[in] inputs the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[in] inputSize the length of the input tensor list. type = [int64].
  * @param[in] other The scalar value to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8].
  */
-DIOPI_API diopiError_t diopiForeachaddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other);
+DIOPI_API diopiError_t diopiForeachaddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* inputs, int64_t inputSize, const diopiScalar_t* other);
 /**
  * @brief  Perform subtraction operations between tensors.
  * @param[in] ctx Context environment.
@@ -1201,39 +1202,42 @@ DIOPI_API diopiError_t diopiMulInpScalar(diopiContextHandle_t ctx, diopiTensorHa
 /**
  * @brief The diopiForeachmulScalar.
  * @param[in] ctx Context environment.
- * @param[out] out the output tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
- * @param[in] input the input tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[out] outs the output tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[in] inputs the input tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
  * @param[in] inputSize the length of the input tensor list. type = [int64].
  * @param[in] other The scalar value to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8].
  */
-DIOPI_API diopiError_t diopiForeachmulScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other);
+DIOPI_API diopiError_t diopiForeachmulScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize,
+                                             const diopiScalar_t* other);
 
 /**
  * @brief The in-place version of diopiForeachmulScalar.
  * @param[in] ctx Context environment.
- * @param[in] input the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[in] inputs the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
  * @param[in] inputSize the length of the input tensor list. type = [int64].
  * @param[in] other The scalar value to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8].
  */
-DIOPI_API diopiError_t diopiForeachmulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* other);
+DIOPI_API diopiError_t diopiForeachmulInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* inputs, int64_t inputSize, const diopiScalar_t* other);
 
 /*
- * @brief The in-place version of diopiForeachmulScalar.
+ * @brief The diopiForeachmulTensor
  * @param[in] ctx Context environment.
- * @param[in] input the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[out] outs the output tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[in] inputs the input tensor list. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
  * @param[in] inputSize the length of the input tensor list. type = [int64].
  * @param[in] other The tensor to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8].
  */
-DIOPI_API diopiError_t diopiForeachmulTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiConstTensorHandle_t other);
+DIOPI_API diopiError_t diopiForeachmulTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize,
+                                             const diopiConstTensorHandle_t other);
 
 /**
- * @brief The in-place version of diopiForeachmulScalar.
+ * @brief The in-place version of diopiForeachmulTensor.
  * @param[in] ctx Context environment.
- * @param[in] input the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
+ * @param[in] inputs the input tensor list and will be stored result tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
  * @param[in] inputSize the length of the input tensor list. type = [int64].
  * @param[in] other The tensor to be multiplied. type = [float64, float32, float16, int64, int32, int16, int8, uint8].
  */
-DIOPI_API diopiError_t diopiForeachmulInpTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* self, int64_t selfSize, const diopiConstTensorHandle_t other);
+DIOPI_API diopiError_t diopiForeachmulInpTensor(diopiContextHandle_t ctx, diopiTensorHandle_t* inputs, int64_t inputSize, const diopiConstTensorHandle_t other);
 
 /**
  * @brief Divides each element of input tensor by the corresponding element in other tensor.
@@ -1243,7 +1247,7 @@ DIOPI_API diopiError_t diopiForeachmulInpTensor(diopiContextHandle_t ctx, diopiT
  * @param[in] rounding_mode Rounding mode applied to the result, None: no rounding is performed, if both input and other are integer types,
  * the inputs are promoted to the default scalar type; trunc: truncate towards zero; floor: round down towards negative infinity for the result of the division.
  * @param[out] out the output tensor. type = [float64, float32, float16, int64, int32, int16, int8, uint8, bool].
-*/
+ */
 
 DIOPI_API diopiError_t diopiDiv(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t other,
                                 diopiRoundMode_t rounding_mode);
@@ -2947,11 +2951,13 @@ DIOPI_API diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t o
 /**
  * @brief Returns the matrix norm or vector norm of a given tensor list.
  * @param[in] ctx Context environment.
- * @param[out] out the output tesnor list, type=[float32, float64, float16].
- * @param[in] input the input tesnor list, type=[float32, float64, float16].
+ * @param[out] outs the output tensor list, type=[float32, float64, float16].
+ * @param[in] inputs the input tensor list, type=[float32, float64, float16].
+ * @param[in] inputSize the length of the input tensor list. type = [int64].
  * @param[in] p an array, the order of norm.
  */
-DIOPI_API diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t* self, int64_t selfSize, const diopiScalar_t* p);
+DIOPI_API diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize,
+                                              const diopiScalar_t* p);
 /**
  * \brief Applies Group Normalization over a mini-batch of inputs.
  * @param[in] ctx Context environment.

From 981ea99e89e277503e240a33852a655fbe93f498 Mon Sep 17 00:00:00 2001
From: lhy <442488254@qq.com>
Date: Wed, 31 Jul 2024 16:30:23 +0800
Subject: [PATCH 5/7] add tests in diopi_tests

---
 diopi_test/python/configs/diopi_configs.py    | 21 +++++++++
 .../configs/model_config/generate_config.py   |  3 +-
 .../python/conformance/diopi_functions.py     | 43 +++++++++++++++++++
 3 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/diopi_test/python/configs/diopi_configs.py b/diopi_test/python/configs/diopi_configs.py
index 2e268dd2c0..864bed22b6 100755
--- a/diopi_test/python/configs/diopi_configs.py
+++ b/diopi_test/python/configs/diopi_configs.py
@@ -4058,6 +4058,27 @@
     #         ],
     #     ),
     # ),
+    
+    'foreach_op': dict(
+        name=["_foreach_mul","_foreach_add"],
+        interface=["torch"],
+        para=dict(
+            scalar=[1.0, 5, 2.0, -1.2, 3, 10, 8, -0.5, 0, -2],
+        ),
+        tensor_para=dict(
+            args=[
+                {
+                    "ins": ["self"],
+                    "shape": ((), (10,), (10, 2, 5), (20,), (10, 5, 1), (20, 3, 4, 5), (20, 2, 3, 4, 5),
+                              (0,), (0, 10), (5, 0, 9)),
+                    "gen_fn": 'Genfunc.randn',
+                    "dtype": [np.float32, np.float16, np.float64],
+                    "gen_policy": 'gen_tensor_list',
+                    "gen_num_range": [1, 5]
+                },
+            ],
+        ),
+    ),
 
     'tril': dict(
         name=["tril"],
diff --git a/diopi_test/python/configs/model_config/generate_config.py b/diopi_test/python/configs/model_config/generate_config.py
index eed4409d5a..cf03619421 100644
--- a/diopi_test/python/configs/model_config/generate_config.py
+++ b/diopi_test/python/configs/model_config/generate_config.py
@@ -49,7 +49,8 @@
               'bitwise_or', 'sigmoid', 'erf', 'matmul', 'addcmul', 'std',
               'arange', 'log2', 'sign', 'eq', 'nonzero', 'triangular_solve',
               'ne', 'mul', 'linspace', 'index_fill', 'atan', 'le', 'sgn',
-              'logical_and', 'permute', 'div', 'log10', 'roll', 'ge', 'lt', 'any'],
+              'logical_and', 'permute', 'div', 'log10', 'roll', 'ge', 'lt', 'any',
+              '_foreach_add','_foreach_mul'],
     'torch.nn.functional': ['conv2d', 'batch_norm'],
     'torch.Tensor': ['fill_', 'repeat', 'unfold', 'copy_', 'expand'],
     'CustomizedTest': ['linalgqr', 'adadelta', 'cast_np', 'batch_norm_elemt',
diff --git a/diopi_test/python/conformance/diopi_functions.py b/diopi_test/python/conformance/diopi_functions.py
index 1ad454b53c..87dce0f26a 100644
--- a/diopi_test/python/conformance/diopi_functions.py
+++ b/diopi_test/python/conformance/diopi_functions.py
@@ -1643,6 +1643,49 @@ def clip_grad_norm_(tensors, max_norm, norm_type=2.0, error_if_nonfinite=False):
 
     return out.value
 
+def _foreach_add(self, scalar):
+    ctx = self[0].context()
+    num_tensors = len(self)
+    func = check_function("diopiForeachaddScalar")
+    input_tensors = list([TensorP(input) for input in self])
+    out_tensorV = list([Tensor(self[i].size(),self[i].get_dtype()) for i in range(num_tensors)])
+    out_tensors = list([TensorP(out_tensor) for out_tensor in out_tensorV])
+    if isinstance(scalar, Tensor):
+        other = scalar
+    else:
+        other = Scalar(scalar)
+    ret = func(
+        ctx,
+        out_tensors,
+        input_tensors,
+        num_tensors,
+        other
+    )
+    check_returncode(ret)
+
+    return out_tensorV
+
+def _foreach_mul(self, scalar):
+    ctx = self[0].context()
+    num_tensors = len(self)
+    func = check_function("diopiForeachmulScalar")
+    input_tensors = list([TensorP(input) for input in self])
+    out_tensorV = list([Tensor(self[i].size(),self[i].get_dtype()) for i in range(num_tensors)])
+    out_tensors = list([TensorP(out_tensor) for out_tensor in out_tensorV])
+    if isinstance(scalar, Tensor):
+        other = scalar
+    else:
+        other = Scalar(scalar)
+    ret = func(
+        ctx,
+        out_tensors,
+        input_tensors,
+        num_tensors,
+        other
+    )
+    check_returncode(ret)
+
+    return out_tensorV
 
 def batch_norm(
     input,

From 880ddf4f3b2b017c1641acefdb8a26b1feca32ac Mon Sep 17 00:00:00 2001
From: lhy <442488254@qq.com>
Date: Wed, 31 Jul 2024 19:13:44 +0800
Subject: [PATCH 6/7] use updateATen2Tensor in functions.cpp(this will slow
 down the perf)

---
 impl/torch/functions/functions.cpp | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/impl/torch/functions/functions.cpp b/impl/torch/functions/functions.cpp
index cda0ed467d..afada5ceec 100644
--- a/impl/torch/functions/functions.cpp
+++ b/impl/torch/functions/functions.cpp
@@ -1168,12 +1168,11 @@ diopiError_t diopiAddInpScalar(diopiContextHandle_t ctx, diopiTensorHandle_t inp
 diopiError_t diopiForeachaddScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize,
                                    const diopiScalar_t* other) {
     impl::aten::setCurStream(ctx);
-    DIOPI_IMPL_BUILD_ATEN_LIST(atOuts, outs, inputSize)
     DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize)
     auto atOther = impl::aten::buildAtScalar(other);
     auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_add, atInputs, atOther);
     for (int i = 0; i < inputSize; i++) {
-        *(reinterpret_cast<at::Tensor*>(outs[i])) = tempOut[i];
+        impl::aten::updateATen2Tensor(ctx, tempOut[i], outs[i]);
     }
 
     return diopiSuccess;
@@ -1284,11 +1283,10 @@ diopiError_t diopiForeachmulScalar(diopiContextHandle_t ctx, diopiTensorHandle_t
     DIOPI_CHECK_PTR(outs);
     impl::aten::setCurStream(ctx);
     DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize)
-    DIOPI_IMPL_BUILD_ATEN_LIST(atOuts, outs, inputSize)
     auto atOther = impl::aten::buildAtScalar(other);
     auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_mul, atInputs, atOther);
     for (int i = 0; i < inputSize; i++) {
-        *(reinterpret_cast<at::Tensor*>(outs[i])) = tempOut[i];
+        impl::aten::updateATen2Tensor(ctx, tempOut[i], outs[i]);
     }
 
     return diopiSuccess;
@@ -1312,7 +1310,7 @@ diopiError_t diopiForeachmulTensor(diopiContextHandle_t ctx, diopiTensorHandle_t
     auto atOther = impl::aten::buildATen(other);
     auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_mul, atInputs, atOther);
     for (int i = 0; i < inputSize; i++) {
-        *(reinterpret_cast<at::Tensor*>(outs[i])) = tempOut[i];
+        impl::aten::updateATen2Tensor(ctx, tempOut[i], outs[i]);
     }
 
     return diopiSuccess;
@@ -3317,12 +3315,11 @@ diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_
     DIOPI_CHECK_PTR(outs);
     impl::aten::setCurStream(ctx);
     DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize)
-    DIOPI_IMPL_BUILD_ATEN_LIST(atOuts, outs, inputSize)
     auto atP = impl::aten::buildAtScalar(p);
     auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_norm, atInputs, atP);
     for (int i = 0; i < inputSize; i++) {
-        // impl::aten::updateATen2Tensor(ctx, tempOut[i], out[i]);
-        *(reinterpret_cast<at::Tensor*>(outs[i])) = tempOut[i];
+        //WARN NO NEED TO COPY HERE, WE NEED FASTER UPDATE HERE
+        impl::aten::updateATen2Tensor(ctx, tempOut[i], outs[i]);
     }
 
     return diopiSuccess;

From 009dfcfec6f72a6a0a9ec9b46fc9063c0d988a72 Mon Sep 17 00:00:00 2001
From: lhy <442488254@qq.com>
Date: Wed, 31 Jul 2024 21:21:28 +0800
Subject: [PATCH 7/7] add tests for foreach_norm in diopi_tests

---
 diopi_test/python/configs/diopi_configs.py    | 19 ++++++++++++++++++-
 .../configs/model_config/generate_config.py   |  2 +-
 .../python/conformance/diopi_functions.py     | 19 +++++++++++++++++++
 impl/torch/functions/functions.cpp            |  7 +++----
 proto/include/diopi/functions.h               |  2 +-
 5 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/diopi_test/python/configs/diopi_configs.py b/diopi_test/python/configs/diopi_configs.py
index 864bed22b6..9e8379d935 100755
--- a/diopi_test/python/configs/diopi_configs.py
+++ b/diopi_test/python/configs/diopi_configs.py
@@ -4059,7 +4059,7 @@
     #     ),
     # ),
     
-    'foreach_op': dict(
+    'pointwise_binary_foreach_op': dict(
         name=["_foreach_mul","_foreach_add"],
         interface=["torch"],
         para=dict(
@@ -4079,6 +4079,23 @@
             ],
         ),
     ),
+ 
+    'foreach_norm': dict(
+        name=['_foreach_norm'],
+        interface=['torch'],
+        tensor_para=dict(
+            args=[
+                {
+                    "ins": ["self"],
+                    "shape": ((256, 512, 1, 1),(8, 1, 4),(256, 64, 1, 1),(10, 1, 4),(256, 128, 1, 1),(16, 1, 4),(256, 256, 1, 1),(3, 1, 4)),
+                    "dtype": [np.float32, np.float64, np.float16],
+                    "gen_fn": 'Genfunc.randn',
+                    "gen_policy": 'gen_tensor_list',
+                    "gen_num_range": [1, 5]
+                },
+            ],
+        ),
+    ),
 
     'tril': dict(
         name=["tril"],
diff --git a/diopi_test/python/configs/model_config/generate_config.py b/diopi_test/python/configs/model_config/generate_config.py
index cf03619421..e2030e8a4e 100644
--- a/diopi_test/python/configs/model_config/generate_config.py
+++ b/diopi_test/python/configs/model_config/generate_config.py
@@ -50,7 +50,7 @@
               'arange', 'log2', 'sign', 'eq', 'nonzero', 'triangular_solve',
               'ne', 'mul', 'linspace', 'index_fill', 'atan', 'le', 'sgn',
               'logical_and', 'permute', 'div', 'log10', 'roll', 'ge', 'lt', 'any',
-              '_foreach_add','_foreach_mul'],
+              '_foreach_add', '_foreach_mul', '_foreach_norm'],
     'torch.nn.functional': ['conv2d', 'batch_norm'],
     'torch.Tensor': ['fill_', 'repeat', 'unfold', 'copy_', 'expand'],
     'CustomizedTest': ['linalgqr', 'adadelta', 'cast_np', 'batch_norm_elemt',
diff --git a/diopi_test/python/conformance/diopi_functions.py b/diopi_test/python/conformance/diopi_functions.py
index 87dce0f26a..0e2c0d6ca1 100644
--- a/diopi_test/python/conformance/diopi_functions.py
+++ b/diopi_test/python/conformance/diopi_functions.py
@@ -1687,6 +1687,25 @@ def _foreach_mul(self, scalar):
 
     return out_tensorV
 
+def _foreach_norm(self):
+    ctx = self[0].context()
+    num_tensors = len(self)
+    func = check_function("diopiForeachnormScalar")
+    input_tensors = list([TensorP(input) for input in self])
+    out_tensorV = list([Tensor([],self[i].get_dtype()) for i in range(num_tensors)])
+    out_tensors = list([TensorP(out_tensor) for out_tensor in out_tensorV])
+    other = Scalar(2)
+    ret = func(
+        ctx,
+        out_tensors,
+        input_tensors,
+        num_tensors,
+        other
+    )
+    check_returncode(ret)
+
+    return out_tensorV
+
 def batch_norm(
     input,
     running_mean,
diff --git a/impl/torch/functions/functions.cpp b/impl/torch/functions/functions.cpp
index afada5ceec..8c25376053 100644
--- a/impl/torch/functions/functions.cpp
+++ b/impl/torch/functions/functions.cpp
@@ -3311,14 +3311,13 @@ diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC
 }
 
 diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize,
-                                    const diopiScalar_t* p) {
+                                    const diopiScalar_t* ord) {
     DIOPI_CHECK_PTR(outs);
     impl::aten::setCurStream(ctx);
     DIOPI_IMPL_BUILD_ATEN_LIST(atInputs, inputs, inputSize)
-    auto atP = impl::aten::buildAtScalar(p);
-    auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_norm, atInputs, atP);
+    auto atOrd = impl::aten::buildAtScalar(ord);
+    auto tempOut = CALL_ATEN_CUDA_FUNC(_foreach_norm, atInputs, atOrd);
     for (int i = 0; i < inputSize; i++) {
-        //WARN NO NEED TO COPY HERE, WE NEED FASTER UPDATE HERE
         impl::aten::updateATen2Tensor(ctx, tempOut[i], outs[i]);
     }
 
diff --git a/proto/include/diopi/functions.h b/proto/include/diopi/functions.h
index 902a3fadba..891f46f305 100644
--- a/proto/include/diopi/functions.h
+++ b/proto/include/diopi/functions.h
@@ -2957,7 +2957,7 @@ DIOPI_API diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t o
  * @param[in] p an array, the order of norm.
  */
 DIOPI_API diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize,
-                                              const diopiScalar_t* p);
+                                              const diopiScalar_t* ord);
 /**
  * \brief Applies Group Normalization over a mini-batch of inputs.
  * @param[in] ctx Context environment.