diff --git a/diopi_test/python/configs/diopi_configs.py b/diopi_test/python/configs/diopi_configs.py index 34f43f67d..0ad81385e 100755 --- a/diopi_test/python/configs/diopi_configs.py +++ b/diopi_test/python/configs/diopi_configs.py @@ -743,6 +743,202 @@ ), ), + 'pool1d': dict( + name=['pool1d'], + interface=['CustomizedTest'], + para=dict( + kernel_size=[2, 2, 6, 2, 3, 6, 5, 0, 0, 0, 0], + stride=[None, None, 3, 1, 2, None, 2, 0, 0, 0, 0], + padding=[0, 0, 2, 1, 0, 0, 2, 0, 0, 0, 0], + dilation=[0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0], + ceil_mode=[False, True, False, True, False, False, True, False, False, False, False], + count_include_pad=[True, True, False, True, False, False, False, False, False, False, False], + mode=["avg", "avg", "avg", "avg", "avg", "max", "max", "avg", "avg", "max", "max"], + adaptive=[False, False, False, False, False, False, False, True, True, True, True], + output_size=[0, 0, 0, 0, 0, 0, 0, 5, 26, 3, 2] + ), + tensor_para=dict( + args=[ + { + "ins": ['input'], + "requires_grad": [True], + "shape": ((2, 16), (5, 2, 16), (3, 4, 16), + (2, 1024, 14), (256, 28, 28), (3, 12), (5, 4, 17), (3, 16), (4, 7, 27), (4, 16), (288, 33)), + "dtype": [np.float16, np.float32, np.float64], + }, + ] + ), + ), + + 'avg_pool1d': dict( + name=["avg_pool1d"], + para=dict( + kernel_size=[2, 2, 6, 2, 3], + stride=[None, None, 3, 1, 2], + padding=[0, 0, 2, 1, 0], + ceil_mode=[False, True, False, True, False], + count_include_pad=[True, True, False, True, False], + ), + tensor_para=dict( + args=[ + { + "ins": ['input'], + "requires_grad": [True], + "shape": ((2, 16), (5, 2, 16), (3, 4, 16), + (2, 1024, 14), (256, 28, 28)), + "dtype": [np.float16, np.float32, np.float64], + }, + ] + ), + ), + + 'max_pool1d': dict( + name=["max_pool1d"], + para=dict( + kernel_size=[6, 5, 6, 8, 3, 2, 3, 3], + stride=[None, 2, 2, 3, 2, 2, 1, 2], + padding=[0, 2, 2, 3, 1, 0, 1, 0], + dilation=[1, 3, 2, 2, 1, 1, 1, 2], + ceil_mode=[False, True, False, True, False, True, False, True], + return_indices=[False, False, False, False, False, False, False, False], + ), + tensor_para=dict( + gen_fn='Genfunc.randn', + args=[ + { + "ins": ['input'], + "requires_grad": [True], + "shape": ((3, 12), (5, 4, 17), + (6, 17), (1, 4, 17), + (2, 64, 352), + (2, 256, 12), + (2, 512, 4), + (3, 4)), + "dtype": [np.float16, np.float32, np.float64], + }, + ] + ), + ), + + 'max_pool1d_return_indices': dict( + name=["max_pool1d"], + para=dict( + kernel_size=[6, 6, 8, 8], + stride=[None, 3, 3, 2], + padding=[0, 1, 2, 3], + dilation=[1, 4, 2, 3], + ceil_mode=[False, True, False, True], + return_indices=[True, True, True, True], + ), + tensor_para=dict( + gen_fn='Genfunc.randn', + args=[ + { + "ins": ['input'], + "shape": ((3, 12), (5, 4, 17), + (6, 17), (1, 4, 17),), + "dtype": [np.float16, np.float32, np.float64], + }, + ] + ), + requires_backward=[0], + ), + + 'adaptive_avg_pool1d': dict( + name=["adaptive_avg_pool1d"], + atol=1e-5, + rtol=1e-4, + atol_half=1e-2, + rtol_half=1e-2, + para=dict( + output_size=[5, 26, 3, 1, 2, + 1, 3, 7, 10], + ), + tensor_para=dict( + gen_fn='Genfunc.randn', + args=[ + { + "ins": ['input'], + "requires_grad": [True], + "shape": ((3, 16), (4, 7, 27), (4, 16), + (2, 2048, 8), (2, 288, 33), + (2, 144, 65), (2, 1280, 7), + (2, 265, 7), (2, 265, 7)), + "dtype": [np.float16, np.float32, np.float64], + }, + ] + ), + ), + + 'adaptive_max_pool1d': dict( + name=["adaptive_max_pool1d"], + atol=1e-5, + rtol=1e-4, + para=dict( + output_size=[5, 26, 3, 2, 1, 3, 33, 40], + return_indices=[False, False, False, False, False, 
False, False, False] + ), + tensor_para=dict( + gen_fn='Genfunc.randn', + args=[ + { + "ins": ['input'], + "requires_grad": [True], + "shape": ((3, 16), (4, 7, 27), (4, 16), + (288, 33), (2, 144, 33), (2, 16, 130), + (2, 144, 33), (2, 144, 33)), + "dtype": [np.float32, np.float16, np.float64], + }, + ] + ), + ), + + 'adaptive_max_pool1d_return_indices': dict( + name=["adaptive_max_pool1d"], + atol=1e-5, + rtol=1e-4, + para=dict( + output_size=[5, 26, 3, 0], + return_indices=[True, True, True, True] + ), + tensor_para=dict( + gen_fn='Genfunc.randn', + args=[ + { + "ins": ['input'], + "shape": ((3, 16), (4, 7, 27), (4, 16), (4, 16)), + "dtype": [np.float32, np.float16, np.float64], + }, + ] + ), + ), + + 'pool2d': dict( + name=['pool2d'], + interface=['CustomizedTest'], + para=dict( + kernel_size=[2, 2, 6, 2, 3, 6, 5, 0, 0, 0, 0], + stride=[None, None, 3, 1, 2, None, 2, 0, 0, 0, 0], + padding=[0, 0, 2, 1, 0, 0, 2, 0, 0, 0, 0], + dilation=[0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0], + ceil_mode=[False, True, False, True, False, False, True, False, False, False, False], + count_include_pad=[True, True, False, True, False, False, False, False, False, False, False], + mode=["avg", "avg", "avg", "avg", "avg", "max", "max", "avg", "avg", "max", "max"], + adaptive=[False, False, False, False, False, False, False, True, True, True, True], + output_size=[0, 0, 0, 0, 0, 0, 0, 5, 26, 3, 2] + ), + tensor_para=dict( + args=[ + { + "ins": ['input'], + "requires_grad": [True], + "shape": ((2, 16, 16), (5, 2, 16, 16), (3, 4, 16, 16), + (2, 1024, 14, 16), (256, 28, 28, 16), (3, 12, 12), (5, 4, 17, 17), (3, 16, 16), (4, 7, 27, 27), (4, 16, 17), (288, 33, 33)), + "dtype": [np.float16, np.float32, np.float64], + }, + ] + ), + ), 'avg_pool2d': dict( name=["avg_pool2d"], para=dict( @@ -1025,7 +1221,7 @@ ), 'pointwise_op': dict( - name=['abs', 'cos', 'erf', 'erfinv', 'exp', 'floor', + name=['abs', 'cos', 'acos', 'tan', 'erf', 'erfinv', 'exp', 'expm1', 'floor', 'neg', 'sin', 'asin', 'sqrt', 'logical_not', 'rsqrt', 'ceil', 'atan'], interface=['torch'], is_inplace=True, @@ -1047,7 +1243,7 @@ 'pointwise_op_int_without_inplace': dict( # name=['abs', 'cos', 'erf', 'erfinv', 'exp', # 'neg', 'sin', 'asin', 'sqrt', 'logical_not', 'rsqrt', 'atan'], - name=['abs', 'cos', 'erf', 'exp', + name=['abs', 'cos', 'acos', 'tan', 'erf', 'exp', 'expm1', 'neg', 'sin', 'asin', 'sqrt', 'logical_not', 'rsqrt', 'atan'], interface=['torch'], dtype=[np.int16, np.int32, np.int64, np.int8], @@ -1068,7 +1264,7 @@ 'pointwise_op_uint8': dict( # name=['abs', 'cos', 'erf', 'erfinv', 'exp', # 'neg', 'sin', 'asin', 'sqrt', 'logical_not', 'rsqrt', 'atan'], - name=['abs', 'cos', 'erf', 'exp', + name=['abs', 'cos', 'acos', 'tan', 'erf', 'exp', 'expm1', 'neg', 'sin', 'asin', 'sqrt', 'logical_not', 'rsqrt', 'atan'], interface=['torch'], dtype=[np.uint8], @@ -1103,7 +1299,7 @@ # FIXME erfinv输入int或bool报错 'pointwise_op_bool': dict( # name=['abs', 'cos', 'erf', 'erfinv', 'exp', 'sin', 'asin', 'sqrt', 'rsqrt', 'atan', 'logical_not'], - name=['abs', 'cos', 'erf', 'exp', 'sin', 'asin', 'sqrt', 'rsqrt', 'atan', 'logical_not'], + name=['abs', 'cos', 'acos', 'tan', 'erf', 'exp', 'expm1', 'sin', 'asin', 'sqrt', 'rsqrt', 'atan', 'logical_not'], interface=['torch'], dtype=[np.bool_], tensor_para=dict( @@ -1192,6 +1388,43 @@ ), ), + 'hyperbolic_trigo_function_without_backward': dict( + name=['sinh', 'cosh', 'asinh', 'acosh', 'atanh'], + interface=['torch'], + is_inplace=True, + dtype=[np.float16, np.float32, np.float64], + tensor_para=dict( + gen_fn='Genfunc.randn', + args=[ + { 
+ "ins": ['input'], + "shape": ((), (1, ), (1024,), (364800, 4), (2, 128, 3072), + (256, 128, 3, 3), + (2, 31, 512, 6, 40), + (0,), (16, 0), (1, 0, 6)), + }, + ], + ), + ), + + 'hyperbolic_trigo_function_without_backward_not_float': dict( + name=['sinh', 'cosh', 'asinh', 'acosh', 'atanh'], + interface=['torch'], + dtype=[np.int16, np.int32, np.int64, np.uint8, np.int8, np.bool_], + tensor_para=dict( + gen_fn='Genfunc.randn', + args=[ + { + "ins": ['input'], + "shape": ((), (1, ), (1024,), (364800, 4), (2, 128, 3072), + (256, 128, 3, 3), + (2, 31, 512, 6, 40), + (0,), (16, 0), (1, 0, 6)), + }, + ], + ), + ), + 'tanh': dict( name=['tanh'], interface=['torch'], @@ -1250,7 +1483,7 @@ ), 'pointwise_op_zero': dict( - name=['abs', 'exp', 'floor', 'neg', 'sqrt', + name=['abs', 'exp', 'expm1', 'floor', 'neg', 'sqrt', 'logical_not', 'rsqrt', 'ceil'], interface=['torch'], is_inplace=True, @@ -1267,7 +1500,7 @@ ), 'pointwise_op_without_inplace_zero': dict( - name=['abs', 'sign', 'exp', 'sqrt', + name=['abs', 'sign', 'exp', 'expm1', 'sqrt', 'logical_not', 'rsqrt'], interface=['torch'], dtype=[np.float16, np.float32, np.float64, np.int16, @@ -4347,6 +4580,88 @@ ), ), + 'sort_stable_for_backward': dict( + name=["sort"], + interface=['CustomizedTest'], + saved_args=dict(indice=1), + requires_backward=[0], + para=dict( + dim=[-1, 0, 1], + descending=[True, False, False], + stable=[True, True, True], + ), + dtype=[np.float16, np.float32], + tensor_para=dict( + gen_fn='Genfunc.randn', + args=[ + { + "ins": ['input'], + "requires_grad": [True], + "shape": ((11400, ), + (4, 4, 16, 20), + (4, 4, 16, 2, 20)), + }, + ], + ), + ), + + 'complex': dict( + name=["complex"], + interface=['torch'], + dtype=[np.float32, np.float64], + tensor_para=dict( + gen_fn='Genfunc.randn', + args=[ + { + "ins": ['real'], + "shape": ((11400, ), + (4, 4, 16, 20), + (4, 4, 16, 2, 20)), + }, + { + "ins": ['imag'], + "shape": ((11400, ), + (4, 4, 16, 20), + (4, 4, 16, 2, 20)), + }, + ], + ), + ), + + 'conj': dict( + name=["conj"], + interface=['torch'], + dtype=[np.float32, np.float64, np.complex64, np.complex128], + tensor_para=dict( + gen_fn='Genfunc.randn', + args=[ + { + "ins": ['input'], + "shape": ((11400, ), + (4, 4, 16, 20), + (4, 4, 16, 2, 20)), + }, + ], + ), + ), + + 'real_imag': dict( + name=["real", "imag"], + interface=['torch'], + dtype=[np.complex64, np.complex128], + tensor_para=dict( + gen_fn='Genfunc.randn', + args=[ + { + "ins": ['input'], + "shape": ((11400, ), + (4, 4, 16, 20), + (4, 4, 16, 2, 20)), + }, + ], + ), + ), + # FIXME topk输入0-d张量,且k为0时,结果精度不一致 'topk_nonzero': dict( name=['topk'], @@ -5386,6 +5701,49 @@ ), ), + 'conv_transpose3d': dict( + name=["conv_transpose3d"], + atol=1e-3, + rtol=1e-3, + atol_half=1e2, + rtol_half=1e2, + para=dict( + stride=[1, 1, 2, 1, 2, (2, 2, 2), 1], + padding=[0, 0, 1, 0, 1, (1, 0, 1), 0], + output_padding=[0, 0, 1, 0, 1, (0, 1, 1), 0], + groups=[1, 1, 8, 1, 1, 1, 1], + dilation=[1, 1, 2, 1, 2, (1, 2, 2), 1], + ), + tensor_para=dict( + args=[ + { + "ins": ["input"], + "requires_grad": [True], + "shape": ((6, 16, 20, 8, 5), + (2, 256, 14, 14, 5), (2, 128, 32, 32, 4), + (2, 64, 160, 160, 5), (2, 64, 320, 320, 5), (2, 64, 320, 320, 5), + (0, 16, 20, 8, 5)), + "dtype": [np.float32, np.float64, np.float16], + }, + { + "ins": ["weight"], + "requires_grad": [True], + "shape": ((16, 2, 12, 2, 2), + (256, 256, 2, 2, 2), (128, 128, 4, 4, 4), + (64, 64, 2, 2, 2), (64, 1, 2, 2, 2), (64, 1, 2, 2, 2), + (16, 2, 12, 2, 2)), + "dtype": [np.float32, np.float64, np.float16], + }, + { + "ins": 
["bias"], + "requires_grad": [True], + "shape": (None, (256,), None, (64,), (1,), (1,), None), + "dtype": [np.float32, np.float64, np.float16], + }, + ] + ), + ), + 'unfold': dict( name=["unfold"], interface=['torch.Tensor'], @@ -5455,6 +5813,29 @@ ), ), + 'cumsum_float_for_backward': dict( + name=["cumsum"], + interface=['torch'], + atol=1e-6, + rtol=1e-5, + requires_backward=[0], + dtype=[np.float32], + para=dict( + dim=[0, -1, 1], + ), + tensor_para=dict( + args=[ + { + "ins": ['input'], + "requires_grad": [True], + "shape": ((), (12,), (2, 22, 33)), + "dtype": [np.float32, np.float64, np.float16], + "gen_fn": 'Genfunc.randn', + }, + ], + ), + ), + 'cdist': dict( name=['cdist'], interface=['torch'], @@ -5601,6 +5982,88 @@ ), ), + 'argmin': dict( + name=['argmin'], + interface=["torch"], + para=dict( + dim=[0, -1, 0, 1, None, -2, 2, 1], + keepdim=[True, False, True, False, False, True, True, False], + ), + tensor_para=dict( + args=[ + { + "ins": ['input'], + "shape": ((), (1,), (1024, 80), (2, 256, 256), (2, 1, 64, 64), + (12, 0), (2, 0, 9), (0, 9, 8, 7)), + "dtype": [np.float64, np.float16, np.float32, np.int32, np.int16, + np.int64, np.uint8, np.int8], + "gen_fn": 'Genfunc.randn', + }, + ], + ), + ), + + 'argmin_same_value': dict( + name=['argmin'], + interface=["torch"], + para=dict( + dim=[-1, 0, None, 1], + keepdim=[True, False, True, False], + ), + tensor_para=dict( + args=[ + { + "ins": ['input'], + "shape": ((1,), (1024, 80), (2, 256, 256), (2, 1, 64, 64)), + "dtype": [np.float32], + "gen_fn": 'Genfunc.zeros', + }, + ], + ), + ), + + 'argsort': dict( + name=['argsort'], + interface=["torch"], + para=dict( + dim=[0, -1, 0, 1, -1, 0, 2, 1], + stable=[True, False, True, False, False, True, True, False], + descending=[True, False, True, False, False, True, True, False], + ), + tensor_para=dict( + args=[ + { + "ins": ['input'], + "shape": ((), (1,), (1024, 80), (2, 256, 256), (2, 1, 64, 64), + (12, 0), (2, 0, 9), (0, 9, 8, 7)), + "dtype": [np.float64, np.float16, np.float32, np.int32, np.int16, + np.int64, np.uint8, np.int8], + "gen_fn": 'Genfunc.randn', + }, + ], + ), + ), + + 'argsort_same_value': dict( + name=['argsort'], + interface=["torch"], + para=dict( + dim=[-1, 0, -1, 1], + stable=[True, False, True, False], + descending=[True, False, True, False], + ), + tensor_para=dict( + args=[ + { + "ins": ['input'], + "shape": ((1,), (1024, 80), (2, 256, 256), (2, 1, 64, 64)), + "dtype": [np.float32], + "gen_fn": 'Genfunc.zeros', + }, + ], + ), + ), + 'adadelta': dict( name=["adadelta"], interface=["CustomizedTest"], @@ -5790,6 +6253,56 @@ ), ), + 'pool3d': dict( + name=['pool3d'], + interface=['CustomizedTest'], + para=dict( + kernel_size=[2, 2, 6, 2, 3, 6, 5, 0, 0, 0, 0], + stride=[None, None, 3, 1, 2, None, 2, 0, 0, 0, 0], + padding=[0, 0, 2, 1, 0, 0, 2, 0, 0, 0, 0], + dilation=[0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0], + ceil_mode=[False, True, False, True, False, False, True, False, False, False, False], + count_include_pad=[True, True, False, True, False, False, False, False, False, False, False], + mode=["avg", "avg", "avg", "avg", "avg", "max", "max", "avg", "avg", "max", "max"], + adaptive=[False, False, False, False, False, False, False, True, True, True, True], + output_size=[0, 0, 0, 0, 0, 0, 0, 5, 26, 3, 2] + ), + tensor_para=dict( + args=[ + { + "ins": ['input'], + "requires_grad": [True], + "shape": ((2, 16, 16, 16), (5, 2, 16, 16, 17), (3, 4, 16, 16, 18), + (2, 1024, 14, 16, 20), (256, 28, 28, 16, 17), (3, 12, 12, 18), (5, 4, 17, 17, 21), (3, 16, 16, 20), (4, 7, 27, 27, 26), 
(4, 16, 17, 23), (288, 33, 33, 35)), + "dtype": [np.float16, np.float32, np.float64], + }, + ] + ), + ), + + 'avg_pool3d': dict( + name=["avg_pool3d"], + para=dict( + kernel_size=[2, (2, 2, 2), (20, 13, 13), (2, 2, 2), 3], + stride=[None, None, 3, 1, (1, 2, 2)], + padding=[0, (0, 0, 0), (2, 3, 2), (1, 1, 1), 0], + ceil_mode=[False, True, False, True, False], + count_include_pad=[True, True, False, True, False], + divisor_override=[None, None, -3, None, 2], + ), + tensor_para=dict( + args=[ + { + "ins": ['input'], + "requires_grad": [True], + "shape": ((2, 16, 7, 7), (5, 2, 16, 7, 7), (3, 4, 23, 23, 23), + (2, 1024, 14, 14, 14), (256, 28, 28, 28)), + "dtype": [np.float16, np.float32, np.float64], + }, + ] + ), + ), + 'max_pool3d': dict( name=['max_pool3d'], para=dict( @@ -6420,6 +6933,7 @@ (0,), (0, 12), (13, 0, 4)), "dtype": [np.float32, np.float64, np.float16], "gen_fn": 'Genfunc.randn', + "requires_grad": [True], }, ], ), @@ -7472,6 +7986,7 @@ ), ), + #This test config is for the national standard GB operator version of diopiLayerNorm, which is different from the original interface definition. 'layer_norm': dict( name=["layer_norm"], dtype=[np.float32, np.float64, np.float16], @@ -7508,6 +8023,81 @@ ) ), + 'instance_norm': dict( + name=["instance_norm"], + dtype=[np.float32, np.float64], + atol=1e-5, + atol_half=1e-1, + rtol_half=1e-2, + para=dict( + eps=[1e-5, 1e-5, 1e-12, 0, -1e-5, 2], + ), + tensor_para=dict( + args=[ + { + "ins": ["input"], + "shape": ((3, 5, 3, 5), (2, 16, 128), (2, 64, 16), (2, 32, 16), + (4, 5, 3, 5), (5, 16, 128)), + 'gen_fn':'Genfunc.randn', + 'requires_grad': [True], + }, + { + "ins": ["weight"], + "shape": ((5,), (16,), (64,), (32,), + (5,), (16,)), + 'gen_fn':'Genfunc.randn', + 'requires_grad': [True], + }, + { + "ins": ["bias"], + "shape": ((5,), (16,), (64,), (32,), + (5,), (16,)), + 'gen_fn':'Genfunc.randn', + 'requires_grad': [True], + }, + ] + ) + ), + + 'layer_normGB': dict( + name=["layer_normGB"], + interface=['CustomizedTest'], + dtype=[np.float32, np.float64, np.float16], + atol=1e-5, + atol_half=1e-1, + rtol_half=1e-2, + para=dict( + eps=[1e-5, 1e-5, 1e-12, 0, -1e-5, 2], + normalized_shape=[(5, 3, 5), (128, ), (64, ), (32,), + (3, 5), (2, 16, 128)], + ), + tensor_para=dict( + gen_fn='Genfunc.randn', + args=[ + { + "ins": ["input"], + "requires_grad": [True], + "shape": ((2, 5, 3, 5), (2, 3136, 128), (2, 64), (32,), + (2, 5, 3, 5), (2, 16, 128)), + }, + { + "ins": ["weight"], + "requires_grad": [True], + "shape": (None, (128,), (64,), (32,), + (3, 5), (2, 16, 128)), + }, + { + "ins": ["bias"], + "requires_grad": [True], + "shape": (None, (128,), (64,), (32,), + (3, 5), (2, 16, 128)), + }, + ] + ) + ), + + + 'layer_norm_empty_tensor': dict( name=["layer_norm"], dtype=[np.float32, np.float64, np.float16], @@ -7535,6 +8125,28 @@ ) ), + 'normalize': dict( + name=["normalize"], + interface=['torch.nn.functional'], + dtype=[np.float32, np.float64, np.float16], + atol=1e-5, + para=dict( + eps=[1e-2, 1e-8, -3], + p=[1, 2, 3], + dim=[1, 1, 1], + ), + tensor_para=dict( + gen_fn='Genfunc.randn', + args=[ + { + "ins": ["input"], + "shape": ((3, 3), (3, 12), (6, 3, 9)), + "requires_grad": [True], + }, + ] + ) + ), + 'copy': dict( name=["copy_"], interface=['torch.Tensor'], @@ -8061,6 +8673,31 @@ ), ), + 'grid_sample': dict( + name=["grid_sample"], + interface=['torch.nn.functional'], + para=dict( + mode=["bilinear", "nearest", "bilinear", "nearest"], + ), + tensor_para=dict( + args=[ + { + "ins": ['input'], + "shape": ((2, 3, 15, 15), (3, 3, 20, 20, 20), (2, 3, 
25, 25), (3, 3, 30, 30, 30)), + "dtype": [np.float16, np.float32, np.float64], + "gen_fn": 'Genfunc.randn', + }, + { + "ins": ['grid'], + "shape": ((2, 5, 5, 2), (3, 10, 10, 10, 3), (2, 20, 20, 2), (3, 60, 60, 60, 3)), + "dtype": [np.float16, np.float32, np.float64], + "gen_fn": 'Genfunc.randn', + "gen_num_range": [1, 19], + }, + ], + ), + ), + 'multinomial': dict( name=["multinomial"], interface=['torch'], diff --git a/diopi_test/python/conformance/customized_test.py b/diopi_test/python/conformance/customized_test.py index 9cd10f905..3f351e27c 100644 --- a/diopi_test/python/conformance/customized_test.py +++ b/diopi_test/python/conformance/customized_test.py @@ -857,3 +857,37 @@ def nll_loss_v2(input, target, weight=None, ignore_index=-100, reduction="mean") input, target, weight, None, ignore_index, None, reduction ) return out + + def pool1d(input, kernel_size, stride, padding, dilation, ceil_mode, count_include_pad, mode, adaptive, output_size): + if mode == "avg" and adaptive == False: + return torch.nn.functional.avg_pool1d(input, kernel_size, stride, padding, ceil_mode, count_include_pad) + elif mode == "max" and adaptive == False: + return torch.nn.functional.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode, return_indices=False) + elif mode == "avg" and adaptive == True: + return torch.nn.functional.adaptive_avg_pool1d(input, output_size) + elif mode == "max" and adaptive == True: + return torch.nn.functional.adaptive_max_pool1d(input, output_size, return_indices=False) + + def pool2d(input, kernel_size, stride, padding, dilation, ceil_mode, count_include_pad, mode, adaptive, output_size): + if mode == "avg" and adaptive == False: + return torch.nn.functional.avg_pool2d(input, kernel_size, stride, padding, ceil_mode, count_include_pad) + elif mode == "max" and adaptive == False: + return torch.nn.functional.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode, return_indices=False) + elif mode == "avg" and adaptive == True: + return torch.nn.functional.adaptive_avg_pool2d(input, output_size) + elif mode == "max" and adaptive == True: + return torch.nn.functional.adaptive_max_pool2d(input, output_size, return_indices=False) + + def pool3d(input, kernel_size, stride, padding, dilation, ceil_mode, count_include_pad, mode, adaptive, output_size): + if mode == "avg" and adaptive == False: + return torch.nn.functional.avg_pool3d(input, kernel_size, stride, padding, ceil_mode, count_include_pad) + elif mode == "max" and adaptive == False: + return torch.nn.functional.max_pool3d(input, kernel_size, stride, padding, dilation, ceil_mode, return_indices=False) + elif mode == "avg" and adaptive == True: + return torch.nn.functional.adaptive_avg_pool3d(input, output_size) + elif mode == "max" and adaptive == True: + return torch.nn.functional.adaptive_max_pool3d(input, output_size, return_indices=False) + + def layer_normGB(input, weight, bias, eps, normalized_shape): + return torch.nn.functional.layer_norm(input=input, weight=weight, bias=bias, eps=eps, normalized_shape=normalized_shape) + diff --git a/diopi_test/python/conformance/diopi_functions.py b/diopi_test/python/conformance/diopi_functions.py index 297240cb0..3b35e6fc9 100644 --- a/diopi_test/python/conformance/diopi_functions.py +++ b/diopi_test/python/conformance/diopi_functions.py @@ -405,6 +405,22 @@ def cos(input, inplace=False) -> Tensor: return unary_op(input, inplace, "diopiCos", promote_type(input, Dtype.float32)) +def acos(input, inplace=False) -> Tensor: + return unary_op(input, inplace, 
"diopiAcos", promote_type(input, Dtype.float32)) + + +def tan(input, inplace=False) -> Tensor: + return unary_op(input, inplace, "diopiTan", promote_type(input, Dtype.float32)) + + +def sinh(input, inplace=False) -> Tensor: + return unary_op(input, inplace, "diopiSinh", promote_type(input, Dtype.float32)) + + +def cosh(input, inplace=False) -> Tensor: + return unary_op(input, inplace, "diopiCosh", promote_type(input, Dtype.float32)) + + def tanh(input, inplace=False) -> Tensor: return unary_op(input, inplace, "diopiTanh", promote_type(input, Dtype.float32)) @@ -413,10 +429,25 @@ def atan(input, inplace=False) -> Tensor: return unary_op(input, inplace, "diopiAtan", promote_type(input, Dtype.float32)) +def asinh(input, inplace=False) -> Tensor: + return unary_op(input, inplace, "diopiAsinh", promote_type(input, Dtype.float32)) + + +def acosh(input, inplace=False) -> Tensor: + return unary_op(input, inplace, "diopiAcosh", promote_type(input, Dtype.float32)) + +def atanh(input, inplace=False) -> Tensor: + return unary_op(input, inplace, "diopiAtanh", promote_type(input, Dtype.float32)) + + def exp(input, inplace=False) -> Tensor: return unary_op(input, inplace, "diopiExp", promote_type(input, Dtype.float32)) +def expm1(input, inplace=False) -> Tensor: + return unary_op(input, inplace, "diopiExpm1", promote_type(input, Dtype.float32)) + + def log(input, inplace=False) -> Tensor: return unary_op(input, inplace, "diopiLog", promote_type(input, Dtype.float32)) @@ -1044,7 +1075,915 @@ def conv2d( check_returncode(ret) return out +def pool1d(input, kernel_size=0, stride=None, padding=0, dilation=1, ceil_mode=False, count_include_pad=True, output_size=0, mode="avg", adaptive=False) -> Tensor: + + sizeI = input.size().data + assert len(sizeI) == 3 or len(sizeI) == 2, "input must be 2d or 3d tensors" + assert mode in ["avg", "max"] and adaptive in [True, False], "mode or adaptive has wrong type" + sizeO = [] + sizeO.append(sizeI[0]) + if len(sizeI) == 3: + sizeO.append(sizeI[1]) + + if isinstance(kernel_size, int): + kernel_size = (kernel_size,) + if stride is None: + stride = kernel_size + if isinstance(stride, int): + stride = (stride,) + if isinstance(padding, int): + padding = (padding,) + if isinstance(dilation, int): + dilation = (dilation,) + if isinstance(output_size, int): + output_size = (output_size,) + + if mode == "avg" and adaptive == False: + for i in range(-1, 0): + if ceil_mode: + sizeO.append( + math.ceil((sizeI[i] - kernel_size[i] + 2 * padding[i]) / stride[i]) + 1 + ) + else: + sizeO.append( + math.floor((sizeI[i] - kernel_size[i] + 2 * padding[i]) / stride[i]) + 1 + ) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + output_size = Sizes(list(output_size)) + nhwc_stride = compute_nhwc_stride_1d(sizeO) if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + + func = check_function("diopiPool1d") + ret = func( + input.context(), + out, + input, + "avg", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + False, + output_size, + ) + check_returncode(ret) + return out + elif mode == "max" and adaptive == False: + for i in range(-1, 0): + tmp_ker_size = kernel_size[i] + (kernel_size[i] - 1) * (dilation[i] - 1) + tmp_size = (sizeI[i] - tmp_ker_size + 2 * padding[i]) / stride[i] + 1 + tmp_size = tmp_size if tmp_size > 1 else 1 + if ceil_mode: + sizeO.append(math.ceil(tmp_size)) + else: + sizeO.append(math.floor(tmp_size)) + + 
stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + output_size = Sizes(list(output_size)) + nhwc_stride = compute_nhwc_stride_1d(sizeO) if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + + func = check_function("diopiPool1d") + ret = func( + input.context(), + out, + input, + "max", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + False, + output_size, + ) + check_returncode(ret) + return out + elif mode == "avg" and adaptive == True: + for i in range(-1, 0): + if output_size[i] is None: + sizeO.append(sizeI[i]) + else: + sizeO.append(output_size[i]) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + nhwc_stride = compute_nhwc_stride_1d(sizeO) if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + output_size = Sizes(list([sizeO[-1],])) + + func = check_function("diopiPool1d") + ret = func( + input.context(), + out, + input, + "avg", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + True, + output_size, + ) + + check_returncode(ret) + return out + else: + for i in range(-1, 0): + if output_size[i] is None: + sizeO.append(sizeI[i]) + else: + sizeO.append(output_size[i]) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + nhwc_stride = compute_nhwc_stride_1d(sizeO) if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + output_size = Sizes(list([sizeO[-1],])) + + func = check_function("diopiPool1d") + ret = func( + input.context(), + out, + input, + "max", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + True, + output_size, + ) + + check_returncode(ret) + return out + + +def pool1d_backward(input, grad_outputs, kernel_size=0, stride=0, padding=0, dilation=0, ceil_mode=0, count_include_pad=True, mode="avg", adaptive=False, output_size=0, **kwargs,) -> Tensor: + + assert len(grad_outputs) == 1, "only accept 1 gradient to do backward" + assert mode in ["avg", "max"] and adaptive in [True, False], "mode or adaptive has wrong type" + grad_input = raw_like(input) + + if isinstance(kernel_size, int): + kernel_size = (kernel_size,) + if stride is None: + stride = kernel_size + if isinstance(stride, int): + stride = (stride,) + if isinstance(padding, int): + padding = (padding,) + if isinstance(dilation, int): + dilation = (dilation,) + if isinstance(output_size, int): + output_size = (output_size, ) + + + if mode == "avg" and adaptive == False: + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + indices = raw_like(input) + + func = check_function("diopiPool1dBackward") + ret = ( + func( + input.context(), + grad_input, + grad_outputs[0], + input, + "avg", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + False, + indices, + ) + ) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + elif mode == "max" and adaptive == False: + + _, indices = max_pool1d( + input, kernel_size, stride, padding, dilation, ceil_mode, True + ) + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = 
Sizes(list(dilation)) + + func = check_function("diopiPool1dBackward") + ret = ( + func( + input.context(), + grad_input, + grad_outputs[0], + input, + "max", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + False, + indices, + ) + ) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + elif mode == "avg" and adaptive == True: + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + indices = raw_like(input) + + func = check_function("diopiPool1dBackward") + ret = ( + func( + input.context(), + grad_input, + grad_outputs[0], + input, + "avg", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + True, + indices, + ) + ) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + elif mode == "max" and adaptive == True: + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + _, indices = adaptive_max_pool1d(input, output_size, return_indices=True) + + func = check_function("diopiPool1dBackward") + ret = ( + func( + input.context(), + grad_input, + grad_outputs[0], + input, + "max", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + True, + indices, + ) + ) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + +def avg_pool1d( + input, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + count_include_pad=True, +) -> Tensor: + sizeI = input.size().data + assert len(sizeI) == 3 or len(sizeI) == 2, "input must be 2d or 3d tensors" + + sizeO = [] + sizeO.append(sizeI[0]) + if len(sizeI) == 3: + sizeO.append(sizeI[1]) + + if isinstance(kernel_size, int): + kernel_size = (kernel_size,) + if stride is None: + stride = kernel_size + if isinstance(stride, int): + stride = (stride,) + if isinstance(padding, int): + padding = (padding,) + + for i in range(-1, 0): + if ceil_mode: + sizeO.append( + math.ceil((sizeI[i] - kernel_size[i] + 2 * padding[i]) / stride[i]) + 1 + ) + else: + sizeO.append( + math.floor((sizeI[i] - kernel_size[i] + 2 * padding[i]) / stride[i]) + 1 + ) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + nhwc_stride = compute_nhwc_stride_1d(sizeO) if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + + func = check_function("diopiAvgPool1d") + ret = func( + input.context(), + out, + input, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + ) + check_returncode(ret) + return out + +def avg_pool1d_backward( + input, + grad_outputs, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + count_include_pad=True, + **kwargs, +) -> Tensor: + assert len(grad_outputs) == 1, "only accept 1 gradient to do backward" + grad_input = raw_like(input) + if isinstance(kernel_size, int): + kernel_size = (kernel_size,) + if stride is None: + stride = kernel_size + if isinstance(stride, int): + stride = (stride,) + if isinstance(padding, int): + padding = (padding,) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + + func = check_function("diopiAvgPool1dBackward") + ret = ( + func( + input.context(), + grad_input, + grad_outputs[0], + input, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + ) + ) + 
check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + +def max_pool1d( + input, + kernel_size, + stride=None, + padding=0, + dilation=1, + ceil_mode=False, + return_indices=False, +) -> Tensor: + sizeI = input.size().data + assert len(sizeI) == 3 or len(sizeI) == 2, "input must be 2d or 3d tensors" + + sizeO = [] + sizeO.append(sizeI[0]) + if len(sizeI) == 3: + sizeO.append(sizeI[1]) + + if isinstance(kernel_size, int): + kernel_size = (kernel_size,) + if stride is None: + stride = kernel_size + if isinstance(stride, int): + stride = (stride,) + if isinstance(padding, int): + padding = (padding,) + if isinstance(dilation, int): + dilation = (dilation,) + + for i in range(-1, 0): + tmp_ker_size = kernel_size[i] + (kernel_size[i] - 1) * (dilation[i] - 1) + tmp_size = (sizeI[i] - tmp_ker_size + 2 * padding[i]) / stride[i] + 1 + tmp_size = tmp_size if tmp_size > 1 else 1 + if ceil_mode: + sizeO.append(math.ceil(tmp_size)) + else: + sizeO.append(math.floor(tmp_size)) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + nhwc_stride = compute_nhwc_stride_1d(sizeO) if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + + if not return_indices: + func = check_function("diopiMaxPool1d") + ret = func( + input.context(), + out, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + ) + check_returncode(ret) + return out + else: + func = check_function("diopiMaxPool1dWithIndices") + nhwc_stride = compute_nhwc_stride_2d(sizeO) if glob_vars.nhwc else None + indices = Tensor( + sizeO, from_numpy_dtype(glob_vars.int_type), stride=nhwc_stride + ) + ret = func( + input.context(), + out, + indices, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + ) + check_returncode(ret) + return out, indices + +def max_pool1d_backward( + input, + grad_outputs, + kernel_size, + stride=None, + padding=0, + dilation=1, + ceil_mode=False, + **kwargs, +) -> Tensor: + assert len(grad_outputs) == 1, "only accept 1 gradient to do backward" + grad_input = raw_like(input) + sizeI = input.size().data + assert len(sizeI) == 3 or len(sizeI) == 2, "input must be 2d or 3d tensors" + + if isinstance(kernel_size, int): + kernel_size = (kernel_size,) + if stride is None: + stride = kernel_size + if isinstance(stride, int): + stride = (stride,) + if isinstance(padding, int): + padding = (padding,) + if isinstance(dilation, int): + dilation = (dilation,) + + _, indices = max_pool1d( + input, kernel_size, stride, padding, dilation, ceil_mode, True + ) + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + func = check_function("diopiMaxPool1dBackward") + ret = func( + input.context(), + grad_input, + grad_outputs[0], + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + indices, + ) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + +def adaptive_avg_pool1d(input, output_size): + sizeI = input.size().data + assert len(sizeI) == 3 or len(sizeI) == 2, "input must be 2d or 3d tensors" + + sizeO = [] + sizeO.append(sizeI[0]) + if len(sizeI) == 3: + sizeO.append(sizeI[1]) + + if isinstance(output_size, int): + output_size = (output_size,) + + for i in range(-1, 0): + if output_size[i] is None: + sizeO.append(sizeI[i]) + else: + sizeO.append(output_size[i]) + + nhwc_stride = compute_nhwc_stride_1d(sizeO) 
if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + output_size = Sizes(list([sizeO[-1],])) + + func = check_function("diopiAdaptiveAvgPool1d") + ret = func(input.context(), out, input, output_size) + check_returncode(ret) + return out + + +def adaptive_max_pool1d(input, output_size, return_indices=False): + sizeI = input.size().data + assert len(sizeI) == 3 or len(sizeI) == 2, "input must be 2d or 3d tensors" + + sizeO = [] + sizeO.append(sizeI[0]) + if len(sizeI) == 3: + sizeO.append(sizeI[1]) + + if isinstance(output_size, int): + output_size = (output_size,) + + for i in range(-1, 0): + if output_size[i] is None: + sizeO.append(sizeI[i]) + else: + sizeO.append(output_size[i]) + + nhwc_stride = compute_nhwc_stride_1d(sizeO) if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + output_size = Sizes(list([sizeO[-1],])) + + if return_indices: + func = check_function("diopiAdaptiveMaxPool1dWithIndices") + nhwc_stride = compute_nhwc_stride_1d(sizeO) if glob_vars.nhwc else None + indices = Tensor( + sizeO, from_numpy_dtype(glob_vars.int_type), stride=nhwc_stride + ) + ret = func(input.context(), out, indices, input, output_size) + check_returncode(ret) + return out, indices + else: + func = check_function("diopiAdaptiveMaxPool1d") + ret = func(input.context(), out, input, output_size) + check_returncode(ret) + return out + +def adaptive_avg_pool1d_backward(input, grad_outputs, **kwargs) -> Tensor: + assert len(grad_outputs) == 1, "only accept 1 gradient to do backward" + grad_input = raw_like(input) + func = check_function("diopiAdaptiveAvgPool1dBackward") + ret = func(input.context(), grad_input, grad_outputs[0], input) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + + +def adaptive_max_pool1d_backward(input, grad_outputs, output_size, **kwargs) -> Tensor: + assert len(grad_outputs) == 1, "only accept 1 gradient to do backward" + grad_input = raw_like(input) + _, indices = adaptive_max_pool1d(input, output_size, return_indices=True) + + func = check_function("diopiAdaptiveMaxPool1dBackward") + ret = func(input.context(), grad_input, grad_outputs[0], input, indices) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + +def pool2d(input, kernel_size=0, stride=None, padding=0, dilation=1, ceil_mode=False, count_include_pad=True, output_size=0, mode="avg", adaptive=False) -> Tensor: + + sizeI = input.size().data + assert len(sizeI) == 4 or len(sizeI) == 3, "input must be 3d or 4d tensors" + assert mode in ["avg", "max"] and adaptive in [True, False], "mode or adaptive has wrong type" + sizeO = [] + sizeO.append(sizeI[0]) + if len(sizeI) == 4: + sizeO.append(sizeI[1]) + + if isinstance(kernel_size, int): + kernel_size = (kernel_size, kernel_size) + if stride is None: + stride = kernel_size + if isinstance(stride, int): + stride = (stride, stride) + if isinstance(padding, int): + padding = (padding, padding) + if isinstance(dilation, int): + dilation = (dilation, dilation) + if isinstance(output_size, int): + output_size = (output_size, output_size) + + if mode == "avg" and adaptive == False: + for i in range(-2, 0): + if ceil_mode: + sizeO.append( + math.ceil((sizeI[i] - kernel_size[i] + 2 * padding[i]) / stride[i]) + 1 + ) + else: + sizeO.append( + math.floor((sizeI[i] - kernel_size[i] + 2 * padding[i]) / stride[i]) + 1 + ) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + 
dilation = Sizes(list(dilation)) + output_size = Sizes(list(output_size)) + nhwc_stride = compute_nhwc_stride_1d(sizeO) if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + + func = check_function("diopiPool2d") + ret = func( + input.context(), + out, + input, + "avg", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + False, + output_size, + ) + check_returncode(ret) + return out + elif mode == "max" and adaptive == False: + for i in range(-2, 0): + tmp_ker_size = kernel_size[i] + (kernel_size[i] - 1) * (dilation[i] - 1) + tmp_size = (sizeI[i] - tmp_ker_size + 2 * padding[i]) / stride[i] + 1 + tmp_size = tmp_size if tmp_size > 1 else 1 + if ceil_mode: + sizeO.append(math.ceil(tmp_size)) + else: + sizeO.append(math.floor(tmp_size)) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + output_size = Sizes(list(output_size)) + nhwc_stride = compute_nhwc_stride_1d(sizeO) if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + + func = check_function("diopiPool2d") + ret = func( + input.context(), + out, + input, + "max", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + False, + output_size, + ) + check_returncode(ret) + return out + elif mode == "avg" and adaptive == True: + for i in range(-2, 0): + if output_size[i] is None: + sizeO.append(sizeI[i]) + else: + sizeO.append(output_size[i]) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + nhwc_stride = compute_nhwc_stride_1d(sizeO) if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + output_size = Sizes(list([sizeO[-2], sizeO[-1]])) + + func = check_function("diopiPool2d") + ret = func( + input.context(), + out, + input, + "avg", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + True, + output_size, + ) + + check_returncode(ret) + return out + else: + for i in range(-2, 0): + if output_size[i] is None: + sizeO.append(sizeI[i]) + else: + sizeO.append(output_size[i]) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + nhwc_stride = compute_nhwc_stride_1d(sizeO) if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + output_size = Sizes(list([sizeO[-2], sizeO[-1]])) + + func = check_function("diopiPool2d") + ret = func( + input.context(), + out, + input, + "max", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + True, + output_size, + ) + + check_returncode(ret) + return out + + +def pool2d_backward(input, grad_outputs, kernel_size=0, stride=0, padding=0, dilation=0, ceil_mode=0, count_include_pad=True, mode="avg", adaptive=False, output_size=0, **kwargs,) -> Tensor: + + assert len(grad_outputs) == 1, "only accept 1 gradient to do backward" + assert mode in ["avg", "max"] and adaptive in [True, False], "mode or adaptive has wrong type" + grad_input = raw_like(input) + if isinstance(kernel_size, int): + kernel_size = (kernel_size, kernel_size) + if stride is None: + stride = kernel_size + if isinstance(stride, int): + stride = (stride, stride) + if isinstance(padding, int): + padding = (padding, padding) + if isinstance(dilation, int): + dilation = (dilation, dilation) + if 
isinstance(output_size, int): + output_size = (output_size, output_size) + + + if mode == "avg" and adaptive == False: + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + indices = raw_like(input) + + func = check_function("diopiPool2dBackward") + ret = ( + func( + input.context(), + grad_input, + grad_outputs[0], + input, + "avg", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + False, + indices, + ) + ) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + elif mode == "max" and adaptive == False: + + _, indices = max_pool2d( + input, kernel_size, stride, padding, dilation, ceil_mode, True + ) + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + func = check_function("diopiPool2dBackward") + ret = ( + func( + input.context(), + grad_input, + grad_outputs[0], + input, + "max", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + False, + indices, + ) + ) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + elif mode == "avg" and adaptive == True: + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + indices = raw_like(input) + + func = check_function("diopiPool2dBackward") + ret = ( + func( + input.context(), + grad_input, + grad_outputs[0], + input, + "avg", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + True, + indices, + ) + ) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + elif mode == "max" and adaptive == True: + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + _, indices = adaptive_max_pool2d(input, output_size, return_indices=True) + + func = check_function("diopiPool2dBackward") + ret = ( + func( + input.context(), + grad_input, + grad_outputs[0], + input, + "max", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + True, + indices, + ) + ) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + def avg_pool2d( input, kernel_size, @@ -1478,6 +2417,50 @@ def sort(input, dim=-1, descending=False, stable=None): return vals, indices +def sort_backward(input, grad_outputs, dim, indice, **kwargs): + grad_outputs = grad_outputs[0] + grad_input = raw_like(grad_outputs) + + func = check_function("diopiSortBackward") + ret = func(input.context(), grad_input, grad_outputs, dim, indice, input.size(), True) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + +def complex(real, imag): + out_shape = infer_size(real.size().data, imag.size().data) + if real.get_dtype() == Dtype.float64: + out = Tensor(out_shape, Dtype.complex128) + elif real.get_dtype() == Dtype.float32: + out = Tensor(out_shape, Dtype.complex64) + func = check_function("diopiComplex") + ret = func(real.context(), out, real, imag) + check_returncode(ret) + return out + +def conj(input): + out = raw_like(input) + func = check_function("diopiConj") + ret = func(input.context(), out, input) + + check_returncode(ret) + return out + +def imag(input): + out = raw_like(input) + func = check_function("diopiImag") + ret = 
func(input.context(), out, input) + + check_returncode(ret) + return out + +def real(input): + out = raw_like(input) + func = check_function("diopiReal") + ret = func(input.context(), out, input) + + check_returncode(ret) + return out + def topk(input, k, dim=-1, largest=True, sorted=True): sizeI = input.size().data if len(sizeI) > 0: @@ -2472,10 +3455,69 @@ def conv2d_backward( grad_input = raw_like(input) grad_weight = raw_like(weight) - - keys = ["input", "weight"] - grads = [grad_input, grad_weight] - out = {k: v for k, v in zip(keys, grads) if v.requires_grad} + + keys = ["input", "weight"] + grads = [grad_input, grad_weight] + out = {k: v for k, v in zip(keys, grads) if v.requires_grad} + + if bias is None: + grad_bias = None + sizeBias = None + else: + gradBias = raw_like(bias) + grad_bias = gradBias + sizeBias = bias.size() + out.update({"bias": grad_bias}) + + func = check_function("diopiConvolution2dBackward") + ret = func( + input.context(), + grad_input, + grad_weight, + grad_bias, + grad_outputs[0], + input, + weight, + sizeBias, + stride, + padding, + dilation, + groups, + ) + check_returncode(ret) + return out + + +def conv_transpose2d_backward( + input, + grad_outputs, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + **kwargs, +) -> Tensor: + assert len(grad_outputs) == 1, "only accept 1 gradient to do backward" + sizeI = input.size().data + sizeW = weight.size().data + assert len(sizeI) == 4 and len(sizeW) == 4, "input and weight must be 4d tensors" + + if isinstance(stride, int): + stride = (stride, stride) + if isinstance(padding, int): + padding = (padding, padding) + if isinstance(dilation, int): + dilation = (dilation, dilation) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + dilation = Sizes(list(dilation)) + + grad_input = raw_like(input) + grad_weight = raw_like(weight) + out = {"input": grad_input, "weight": grad_weight} if bias is None: grad_bias = None @@ -2486,7 +3528,9 @@ def conv2d_backward( sizeBias = bias.size() out.update({"bias": grad_bias}) - func = check_function("diopiConvolution2dBackward") + output_padding = Sizes(list([0, 0])) + + func = check_function("diopiConvTranspose2dBackward") ret = func( input.context(), grad_input, @@ -2499,13 +3543,13 @@ def conv2d_backward( stride, padding, dilation, + output_padding, groups, ) check_returncode(ret) - return out - + return {k: v for k, v in out.items() if v.requires_grad} -def conv_transpose2d_backward( +def conv_transpose3d_backward( input, grad_outputs, weight, @@ -2514,23 +3558,27 @@ def conv_transpose2d_backward( padding=0, dilation=1, groups=1, + output_padding=0, **kwargs, ) -> Tensor: assert len(grad_outputs) == 1, "only accept 1 gradient to do backward" sizeI = input.size().data sizeW = weight.size().data - assert len(sizeI) == 4 and len(sizeW) == 4, "input and weight must be 4d tensors" + assert len(sizeI) == 5 and len(sizeW) == 5, "input and weight must be 5d tensors" if isinstance(stride, int): - stride = (stride, stride) + stride = (stride, stride, stride) if isinstance(padding, int): - padding = (padding, padding) + padding = (padding, padding, padding) if isinstance(dilation, int): - dilation = (dilation, dilation) + dilation = (dilation, dilation, dilation) + if isinstance(output_padding, int): + output_padding = (output_padding, output_padding, output_padding) stride = Sizes(list(stride)) padding = Sizes(list(padding)) dilation = Sizes(list(dilation)) + output_padding = Sizes(list(output_padding)) grad_input = raw_like(input) grad_weight = 
raw_like(weight) @@ -2545,9 +3593,7 @@ def conv_transpose2d_backward( sizeBias = bias.size() out.update({"bias": grad_bias}) - output_padding = Sizes(list([0, 0])) - - func = check_function("diopiConvTranspose2dBackward") + func = check_function("diopiConvTranspose3dBackward") ret = func( input.context(), grad_input, @@ -2567,6 +3613,7 @@ def conv_transpose2d_backward( return {k: v for k, v in out.items() if v.requires_grad} + def hardtanh_backward( input, grad_outputs, min_val=-1.0, max_val=1.0, **kwargs ) -> Tensor: @@ -3385,6 +4432,70 @@ def conv_transpose2d( check_returncode(ret) return out +def conv_transpose3d( + input, + weight, + bias=None, + stride=1, + padding=0, + output_padding=0, + groups=1, + dilation=1, +) -> Tensor: + if bias is not None: + assert isinstance(bias, Tensor), "bias must be a Tensor" + + sizeI = input.size().data + sizeW = list(weight.size().data) + assert len(sizeI) == 5 and len(sizeW) == 5, "input and weight must be 5d tensors" + + sizeO = [] + sizeO.append(sizeI[0]) + sizeO.append(sizeW[1] * groups) + + if isinstance(stride, int): + stride = (stride, stride, stride) + if isinstance(padding, int): + padding = (padding, padding, padding) + if isinstance(output_padding, int): + output_padding = (output_padding, output_padding, output_padding) + if isinstance(dilation, int): + dilation = (dilation, dilation, dilation) + for i in range(-3, 0): + # equivalent kernel size + sizeW[i] = (sizeW[i] - 1) * dilation[i] + sizeO.append( + int( + (sizeI[i] - 1) * stride[i] + - 2 * padding[i] + + sizeW[i] + + output_padding[i] + ) + + 1 + ) + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + output_padding = Sizes(list(output_padding)) + dilation = Sizes(list(dilation)) + + out = Tensor(sizeO, input.get_dtype()) + func = check_function("diopiConvTranspose3d") + ret = func( + input.context(), + out, + input, + weight, + bias, + stride, + padding, + output_padding, + groups, + dilation, + ) + check_returncode(ret) + return out + + def cumsum(input, dim, dtype=None): assert isinstance(dim, int), "dim should be int" @@ -3406,6 +4517,15 @@ def cumsum(input, dim, dtype=None): return out +def cumsum_backward(input, grad_outputs, dim, **kwargs): + grad_output = grad_outputs[0] + grad_input = raw_like(input) + func = check_function("diopiCumsumBackward") + ret = func(input.context(), grad_input, grad_output, dim) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + + def infer_size(a, b): dimsA = len(a) dimsB = len(b) @@ -3597,6 +4717,37 @@ def argmax(input, dim=None, keepdim=False): return out +def argmin(input, dim=None, keepdim=False): + sizeO = list(input.size().data) + if len(sizeO) > 0 and dim is not None: + assert dim < len(sizeO), "dim out of index" + if keepdim: + sizeO[dim] = 1 + else: + sizeO = sizeO[:dim] + sizeO[dim + 1 :] + else: + sizeO = [1] + + out = Tensor(sizeO, from_numpy_dtype(glob_vars.int_type)) + func = check_function("diopiArgmin") + # todo: check the reason of using keepdim + ret = ( + func(input.context(), out, input, keepdim) + if dim is None + else func(input.context(), out, input, dim, keepdim) + ) + check_returncode(ret) + + return out + +def argsort(input, dim=-1, descending=False, stable=False): + out = Tensor(input.size().data, from_numpy_dtype(glob_vars.int_type)) + func = check_function("diopiArgsort") + ret = func(input.context(), out, input, stable, dim, descending) + check_returncode(ret) + + return out + def smooth_l1_loss(input, target, reduction="mean", beta=1.0): assert ( @@ -3932,8 
+5083,34 @@ def norm(input, p, dim=None, keepdim=False, dtype=None): func = check_function("diopiNorm") ret = func(input.context(), out, input, p, dim) check_returncode(ret) + + GLOBAL_STATE["norm"] = out return out +def norm_backward(grad_outputs, input, p, dim, keepdim=False, dtype=None): + if p == 0: + return {'input': None} + else: + grad_input = raw_like(input) + + p = Scalar(p) + + dim, _ = reduce_op_process(input, dim, keepdim, dtype) + + dim = Sizes(list(dim)) + + grad_output = grad_outputs[0] + + out = {"input": grad_input} + + func = check_function("diopiNormBackward") + + norm = GLOBAL_STATE.pop("norm") + ret = func(input.context(), grad_input, grad_output, input, norm, dim, p) + check_returncode(ret) + + return {k: v for k, v in out.items() if v.requires_grad} + def group_norm(input, num_groups, weight=None, bias=None, eps=1e-05): dim = list(input.size().data) @@ -3997,6 +5174,62 @@ def group_norm_backward( check_returncode(ret) return {k: v for k, v in out.items() if v.requires_grad} +def instance_norm(input, axis=2, weight=None, bias=None, eps=1e-05): + weight = None if weight is None else weight + bias = None if bias is None else bias + + out = raw_like(input) + func = check_function("diopiInstanceNorm") + ret = func( + input.context(), + out, + input, + axis, + weight, + bias, + eps, + ) + check_returncode(ret) + return out + +def instance_norm_backward(input, grad_outputs, axis=2, weight=None, bias=None, eps=1e-05, **kwargs): + grad_input = raw_like(input) + out = {"input": grad_input} + + if weight is None: + weight = None + grad_weight_capsule = None + else: + grad_weight = raw_like(weight) + weight = weight + grad_weight_capsule = grad_weight + out["weight"] = grad_weight + + if bias is None: + bias = None + grad_bias_capsule = None + else: + grad_bias = raw_like(bias) + bias = bias + grad_bias_capsule = grad_bias + out["bias"] = grad_bias + + func = check_function("diopiInstanceNormBackward") + ret = func( + input.context(), + grad_input, + grad_weight_capsule, + grad_bias_capsule, + grad_outputs[0], + input, + weight, + bias, + axis, + eps, + ) + check_returncode(ret) + return {k: v for k, v in out.items() if v.requires_grad} + def layer_norm(input, normalized_shape, weight=None, bias=None, eps=1e-05): sizeI = input.size().data @@ -4008,26 +5241,107 @@ def layer_norm(input, normalized_shape, weight=None, bias=None, eps=1e-05): weight = None if weight is None else weight bias = None if bias is None else bias - out = raw_like(input) - func = check_function("diopiLayerNorm") + out = raw_like(input) + func = check_function("diopiLayerNorm") + ret = func( + input.context(), + out, + save_mean, + save_invstd, + input, + weight, + bias, + Sizes(normalized_shape), + eps, + ) + check_returncode(ret) + GLOBAL_STATE["layer_norm_save_mean"] = save_mean + GLOBAL_STATE["layer_norm_save_invstd"] = save_invstd + return out + +def layer_normGB(input, normalized_shape, weight=None, bias=None, eps=1e-05): + sizeI = input.size().data + dims = len(sizeI) - len(normalized_shape) + size = [i for i in sizeI[0:dims]] + save_mean = Tensor(size, input.get_dtype()) + save_invstd = raw_like(save_mean) + + weight = None if weight is None else weight + bias = None if bias is None else bias + + out = raw_like(input) + + # Note that, this is the national standard GB operator version, which is different from the diopiLayerNorm interface definition, normalized_shape has changed to begin_norm_axis. 
+ func = check_function("diopiLayerNormGB") + ret = func( + input.context(), + out, + save_mean, + save_invstd, + input, + weight, + bias, + eps, + dims + ) + check_returncode(ret) + GLOBAL_STATE["layer_norm_save_mean"] = save_mean + GLOBAL_STATE["layer_norm_save_invstd"] = save_invstd + return out + + + +def layer_norm_backward( + input, + grad_outputs, + normalized_shape, + weight=None, + bias=None, + eps=1e-05, + **kwargs, +) -> Tensor: + assert len(grad_outputs) == 1, "only accept 1 gradient to do backward" + save_mean = GLOBAL_STATE.pop("layer_norm_save_mean") + save_invstd = GLOBAL_STATE.pop("layer_norm_save_invstd") + grad_input = raw_like(input) + out = {"input": grad_input} + + if weight is None: + weight = None + grad_weight_capsule = None + else: + grad_weight = raw_like(weight) + weight = weight + grad_weight_capsule = grad_weight + out["weight"] = grad_weight + + if bias is None: + bias = None + grad_bias_capsule = None + else: + grad_bias = raw_like(bias) + bias = bias + grad_bias_capsule = grad_bias + out["bias"] = grad_bias + + func = check_function("diopiLayerNormBackward") ret = func( input.context(), - out, - save_mean, - save_invstd, + grad_input, + grad_weight_capsule, + grad_bias_capsule, + grad_outputs[0], input, weight, bias, + save_mean, + save_invstd, Sizes(normalized_shape), - eps, ) check_returncode(ret) - GLOBAL_STATE["layer_norm_save_mean"] = save_mean - GLOBAL_STATE["layer_norm_save_invstd"] = save_invstd - return out - + return {k: v for k, v in out.items() if v.requires_grad} -def layer_norm_backward( +def layer_normGB_backward( input, grad_outputs, normalized_shape, @@ -4042,6 +5356,9 @@ def layer_norm_backward( grad_input = raw_like(input) out = {"input": grad_input} + sizeI = input.size().data + dim = len(sizeI) - len(normalized_shape) + if weight is None: weight = None grad_weight_capsule = None @@ -4060,7 +5377,7 @@ def layer_norm_backward( grad_bias_capsule = grad_bias out["bias"] = grad_bias - func = check_function("diopiLayerNormBackward") + func = check_function("diopiLayerNormGBBackward") ret = func( input.context(), grad_input, @@ -4072,11 +5389,477 @@ def layer_norm_backward( bias, save_mean, save_invstd, - Sizes(normalized_shape), + dim + ) + check_returncode(ret) + return {k: v for k, v in out.items() if v.requires_grad} + +def normalize(input, p, dim, eps): + output = raw_like(input) + + func = check_function("diopiNormalize") + + ret = func( + input.context(), + output, + input, + p, + dim, + eps ) + check_returncode(ret) + + return output + +def normalize_backward(grad_outputs, input, p, dim, eps): + + grad_output = grad_outputs[0] + + func = check_function("diopiNormalizeBackward") + + grad_input = raw_like(input) + + out = {'input': grad_input} + + ret = func( + input.context(), + grad_input, + grad_output, + input, + p, + dim, + eps + + ) return {k: v for k, v in out.items() if v.requires_grad} +def pool3d(input, kernel_size=0, stride=None, padding=0, dilation=1, ceil_mode=False, count_include_pad=True, output_size=0, mode="avg", adaptive=False) -> Tensor: + + sizeI = input.size().data + assert len(sizeI) == 5 or len(sizeI) == 4, "input must be 4d or 5d tensors" + assert mode in ["avg", "max"] and adaptive in [True, False], "mode or adaptive has wrong type" + sizeO = [] + sizeO.append(sizeI[0]) + if len(sizeI) == 5: + sizeO.append(sizeI[1]) + + if isinstance(kernel_size, int): + kernel_size = (kernel_size, kernel_size, kernel_size) + if stride is None: + stride = kernel_size + if isinstance(stride, int): + stride = (stride, stride, 
stride) + if isinstance(padding, int): + padding = (padding, padding, padding) + if isinstance(dilation, int): + dilation = (dilation, dilation, dilation) + if isinstance(output_size, int): + output_size = (output_size, output_size, output_size) + + if mode == "avg" and adaptive == False: + for i in range(-3, 0): + if ceil_mode: + sizeO.append( + math.ceil((sizeI[i] - kernel_size[i] + 2 * padding[i]) / stride[i]) + 1 + ) + else: + sizeO.append( + math.floor((sizeI[i] - kernel_size[i] + 2 * padding[i]) / stride[i]) + 1 + ) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + output_size = Sizes(list(output_size)) + nhwc_stride = compute_nhwc_stride_1d(sizeO) if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + + func = check_function("diopiPool3d") + ret = func( + input.context(), + out, + input, + "avg", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + False, + output_size, + ) + check_returncode(ret) + return out + elif mode == "max" and adaptive == False: + for i in range(-3, 0): + tmp_ker_size = kernel_size[i] + (kernel_size[i] - 1) * (dilation[i] - 1) + tmp_size = (sizeI[i] - tmp_ker_size + 2 * padding[i]) / stride[i] + 1 + tmp_size = tmp_size if tmp_size > 1 else 1 + if ceil_mode: + sizeO.append(math.ceil(tmp_size)) + else: + sizeO.append(math.floor(tmp_size)) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + output_size = Sizes(list(output_size)) + nhwc_stride = compute_nhwc_stride_1d(sizeO) if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + + func = check_function("diopiPool3d") + ret = func( + input.context(), + out, + input, + "max", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + False, + output_size, + ) + check_returncode(ret) + return out + elif mode == "avg" and adaptive == True: + for i in range(-3, 0): + if output_size[i] is None: + sizeO.append(sizeI[i]) + else: + sizeO.append(output_size[i]) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + nhwc_stride = compute_nhwc_stride_1d(sizeO) if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + output_size = Sizes(list([sizeO[-3], sizeO[-2], sizeO[-1]])) + + func = check_function("diopiPool3d") + ret = func( + input.context(), + out, + input, + "avg", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + True, + output_size, + ) + + check_returncode(ret) + return out + else: + for i in range(-3, 0): + if output_size[i] is None: + sizeO.append(sizeI[i]) + else: + sizeO.append(output_size[i]) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + nhwc_stride = compute_nhwc_stride_1d(sizeO) if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + output_size = Sizes(list([sizeO[-3], sizeO[-2], sizeO[-1]])) + + func = check_function("diopiPool3d") + ret = func( + input.context(), + out, + input, + "max", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + True, + output_size, + ) + + check_returncode(ret) + return out + + +def pool3d_backward(input, grad_outputs, 
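+                   # Note (sketch): for mode="max" this backward does not receive indices from the
+                   # caller; it re-runs the forward with return_indices=True, e.g.
+                   #     _, indices = max_pool3d(input, kernel_size, stride, padding, dilation, ceil_mode, True)
+                   # before calling diopiPool3dBackward.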
kernel_size=0, stride=0, padding=0, dilation=0, ceil_mode=0, count_include_pad=True, mode="avg", adaptive=False, output_size=0, **kwargs,) -> Tensor: + + assert len(grad_outputs) == 1, "only accept 1 gradient to do backward" + assert mode in ["avg", "max"] and adaptive in [True, False], "mode or adaptive has wrong type" + grad_input = raw_like(input) + + if isinstance(kernel_size, int): + kernel_size = (kernel_size, kernel_size, kernel_size) + if stride is None: + stride = kernel_size + if isinstance(stride, int): + stride = (stride, stride, stride) + if isinstance(padding, int): + padding = (padding, padding, padding) + if isinstance(dilation, int): + dilation = (dilation, dilation, dilation) + if isinstance(output_size, int): + output_size = (output_size, output_size, output_size) + + + if mode == "avg" and adaptive == False: + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + indices = raw_like(input) + + func = check_function("diopiPool3dBackward") + ret = ( + func( + input.context(), + grad_input, + grad_outputs[0], + input, + "avg", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + False, + indices, + ) + ) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + elif mode == "max" and adaptive == False: + + _, indices = max_pool3d( + input, kernel_size, stride, padding, dilation, ceil_mode, True + ) + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + func = check_function("diopiPool3dBackward") + ret = ( + func( + input.context(), + grad_input, + grad_outputs[0], + input, + "max", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + False, + indices, + ) + ) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + elif mode == "avg" and adaptive == True: + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + indices = raw_like(input) + + func = check_function("diopiPool3dBackward") + ret = ( + func( + input.context(), + grad_input, + grad_outputs[0], + input, + "avg", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + True, + indices, + ) + ) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + elif mode == "max" and adaptive == True: + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + dilation = Sizes(list(dilation)) + + _, indices = adaptive_max_pool3d(input, output_size, return_indices=True) + + func = check_function("diopiPool3dBackward") + ret = ( + func( + input.context(), + grad_input, + grad_outputs[0], + input, + "max", + kernel_size, + stride, + padding, + dilation, + ceil_mode, + not count_include_pad, + True, + indices, + ) + ) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + +def avg_pool3d( + input, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + count_include_pad=True, + divisor_override=None, +) -> Tensor: + sizeI = input.size().data + assert len(sizeI) == 5 or len(sizeI) == 4, "input must be 4d or 5d tensors" + + sizeO = [] + sizeO.append(sizeI[0]) + if len(sizeI) == 5: + sizeO.append(sizeI[1]) + + if isinstance(kernel_size, int): + kernel_size = (kernel_size, 
kernel_size, kernel_size) + if stride is None: + stride = kernel_size + if isinstance(stride, int): + stride = (stride, stride, stride) + if isinstance(padding, int): + padding = (padding, padding, padding) + + for i in range(-3, 0): + if ceil_mode: + sizeO.append( + math.ceil((sizeI[i] - kernel_size[i] + 2 * padding[i]) / stride[i]) + 1 + ) + else: + sizeO.append( + math.floor((sizeI[i] - kernel_size[i] + 2 * padding[i]) / stride[i]) + 1 + ) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + nhwc_stride = compute_nhwc_stride_3d(sizeO) if glob_vars.nhwc else None + out = Tensor(sizeO, input.get_dtype(), stride=nhwc_stride) + + func = check_function("diopiAvgPool3d") + if divisor_override: + ret = func( + input.context(), + out, + input, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override, + ) + else: + ret = func( + input.context(), + out, + input, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + ) + check_returncode(ret) + return out + +def avg_pool3d_backward( + input, + grad_outputs, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + count_include_pad=True, + divisor_override=None, + **kwargs, +) -> Tensor: + assert len(grad_outputs) == 1, "only accept 1 gradient to do backward" + grad_input = raw_like(input) + if isinstance(kernel_size, int): + kernel_size = (kernel_size, kernel_size, kernel_size) + if stride is None: + stride = kernel_size + if isinstance(stride, int): + stride = (stride, stride, stride) + if isinstance(padding, int): + padding = (padding, padding, padding) + + stride = Sizes(list(stride)) + padding = Sizes(list(padding)) + kernel_size = Sizes(list(kernel_size)) + + func = check_function("diopiAvgPool3dBackward") + ret = ( + func( + input.context(), + grad_input, + grad_outputs[0], + input, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override, + ) + if divisor_override + else func( + input.context(), + grad_input, + grad_outputs[0], + input, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + ) + ) + check_returncode(ret) + return {"input": grad_input} if grad_input.requires_grad else {} + + def adaptive_avg_pool3d(input, output_size): sizeI = input.size().data @@ -5047,6 +6830,16 @@ def meshgrid(tensors, shape=None): check_returncode(ret) return out +def grid_sample(input, grid, mode="bilinear"): + if len(input.size().data) == 4: + out = Tensor(size=(input.size().data[0], input.size().data[1], grid.size().data[1], grid.size().data[2],), dtype=input.dtype()) + else: + out = Tensor(size=(input.size().data[0], input.size().data[1], grid.size().data[1], grid.size().data[2], grid.size().data[3],), dtype=input.dtype()) + func = check_function("diopiGridSample") + ret = func(input.context(), out, input, grid, mode) + check_returncode(ret) + return out + def cast_dtype(input, out) -> Tensor: call = "diopiCastDtype" diff --git a/diopi_test/python/conformance/diopi_runtime.py b/diopi_test/python/conformance/diopi_runtime.py index 5b319ec7a..c5f5826b6 100644 --- a/diopi_test/python/conformance/diopi_runtime.py +++ b/diopi_test/python/conformance/diopi_runtime.py @@ -138,6 +138,19 @@ def to_numpy_dtype(dtype: Dtype) -> np.dtype: def is_dtype(dtype) -> bool: return isinstance(dtype, Dtype) +def compute_nhwc_stride_1d(sizes, itemsize=1): + dim = len(sizes) + strides = [itemsize for i in range(dim)] + assert dim == 2 or dim == 3, "not supported dim" + if dim == 2: + strides[0] = itemsize + 
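+        # Worked example (assuming itemsize=1): sizes=(4, 16) -> strides=[1, 4], i.e. the channel
+        # dimension stays contiguous for a (C, L) tensor; for dim == 3, sizes=(4, 7, 27) ->
+        # strides=[189, 1, 7], the channels-last (NLC) layout.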
strides[1] = strides[0] * sizes[0] + elif dim == 3: + strides[1] = itemsize + strides[2] = strides[0] * sizes[1] + strides[0] = strides[2] * sizes[2] + return strides + def compute_nhwc_stride_2d(sizes, itemsize=1): dim = len(sizes) diff --git a/impl/torch/functions/functions.cpp b/impl/torch/functions/functions.cpp index c348ff5d8..6ee8e104e 100644 --- a/impl/torch/functions/functions.cpp +++ b/impl/torch/functions/functions.cpp @@ -102,6 +102,203 @@ diopiError_t diopiLeakyReluInp(diopiContextHandle_t ctx, diopiTensorHandle_t inp return diopiSuccess; } +diopiError_t diopiMaxPool1d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, + diopiSize_t padding, diopiSize_t dilation, bool ceil_mode) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); + at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); + at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); + at::IntArrayRef atDilation = impl::aten::buildAtIntArray(dilation); + bool atCeilMode = ceil_mode; + auto atOut = CALL_ATEN_FUNC(max_pool1d, atInput, atKernelSize, atStride, atPadding, atDilation, atCeilMode); + impl::aten::updateATen2Tensor(ctx, atOut, out); + + return diopiSuccess; +} + +diopiError_t diopiMaxPool1dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, + diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, + bool ceil_mode, diopiConstTensorHandle_t indices) { + impl::aten::setCurStream(ctx); + auto atGradOutput = impl::aten::buildATen(grad_output); + auto atInput = impl::aten::buildATen(input); + at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); + at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); + at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); + at::IntArrayRef atDilation = impl::aten::buildAtIntArray(dilation); + auto atIndices = impl::aten::buildATen(indices); + + auto atGrad2d = CALL_ATEN_FUNC(max_pool2d_with_indices_backward, + atGradOutput.unsqueeze(-2), + atInput.unsqueeze(-2), + {1, atKernelSize[0]}, + {1, atStride[0]}, + {0, atPadding[0]}, + {1, atDilation[0]}, + ceil_mode, + atIndices.unsqueeze(-2)); + + auto atGradInput = atGrad2d.squeeze(-2); + + impl::aten::updateATen2Tensor(ctx, atGradInput, grad_input); + + return diopiSuccess; +} + +diopiError_t diopiMaxPool1dWithIndices(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t indices, diopiConstTensorHandle_t input, + diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, bool ceil_mode) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); + at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); + at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); + at::IntArrayRef atDilation = impl::aten::buildAtIntArray(dilation); + auto atOut = impl::aten::buildATen(out); + auto atIndices = impl::aten::buildATen(indices); + bool atCeilMode = ceil_mode; + std::tuple atRes = CALL_ATEN_FUNC(max_pool1d_with_indices, atInput, atKernelSize, atStride, atPadding, atDilation, atCeilMode); + + impl::aten::updateATen2Tensor(ctx, std::get<0>(atRes), out); + impl::aten::updateATen2Tensor(ctx, std::get<1>(atRes), indices); + + return 
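+    // Sketch of the 1d-as-2d trick used by diopiMaxPool1dBackward above (assumed NCL layout):
+    // input (N, C, L) is unsqueezed to (N, C, 1, L) and max_pool2d_with_indices_backward is called
+    // with kernel {1, k}, stride {1, s}, padding {0, p}, dilation {1, d}; the result is squeezed
+    // back to (N, C, L).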
diopiSuccess; +} + +diopiError_t diopiAvgPool1d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, + diopiSize_t padding, bool ceil_mode, bool count_include_pad) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); + at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); + at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); + auto atOut = CALL_ATEN_FUNC(avg_pool1d, atInput, atKernelSize, atStride, atPadding, ceil_mode, count_include_pad); + + impl::aten::updateATen2Tensor(ctx, atOut, out); + + return diopiSuccess; +} + +diopiError_t diopiAvgPool1dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, + diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, bool ceil_mode, + bool count_include_pad) { + impl::aten::setCurStream(ctx); + auto atGradOutput = impl::aten::buildATen(grad_output); + auto atInput = impl::aten::buildATen(input); + at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); + at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); + at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); + + auto atGrad2d = CALL_ATEN_FUNC(avg_pool2d_backward, + atGradOutput.unsqueeze(-2), + atInput.unsqueeze(-2), + {1, atKernelSize[0]}, + {1, atStride[0]}, + {0, atPadding[0]}, + ceil_mode, + count_include_pad, + c10::nullopt); + + auto atGradInput = atGrad2d.squeeze(-2); + + impl::aten::updateATen2Tensor(ctx, atGradInput, grad_input); + + return diopiSuccess; +} + +diopiError_t diopiAdaptiveMaxPool1d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t output_size) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atOutSize = impl::aten::buildAtIntArray(output_size); + auto atOuts = at::adaptive_max_pool1d(atInput, atOutSize); + impl::aten::updateATen2Tensor(ctx, std::get<0>(atOuts), out); + + return diopiSuccess; +} + +diopiError_t diopiAdaptiveMaxPool1dWithIndices(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t indices, diopiConstTensorHandle_t input, + diopiSize_t output_size) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atOutSize = impl::aten::buildAtIntArray(output_size); + auto [atOut, atIndices] = CALL_ATEN_FUNC(adaptive_max_pool1d, atInput, atOutSize); + + impl::aten::updateATen2Tensor(ctx, atOut, out); + impl::aten::updateATen2Tensor(ctx, atIndices, indices); + + return diopiSuccess; +} + +diopiError_t diopiAdaptiveMaxPool1dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, + diopiConstTensorHandle_t input, diopiConstTensorHandle_t indices) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atGradOutput = impl::aten::buildATen(grad_output); + auto atIndices = impl::aten::buildATen(indices); + auto atGrad2d = CALL_ATEN_FUNC(adaptive_max_pool2d_backward, atGradOutput.unsqueeze(-2), atInput.unsqueeze(-2), atIndices.squeeze(-2)); + auto atGradInput = atGrad2d.squeeze(-2); + + impl::aten::updateATen2Tensor(ctx, atGradInput, grad_input); + + return diopiSuccess; +} + +diopiError_t diopiAdaptiveAvgPool1dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, + 
diopiConstTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atGradOutput = impl::aten::buildATen(grad_output); + auto atInput = impl::aten::buildATen(input); + auto atGrad2d = CALL_ATEN_FUNC(_adaptive_avg_pool2d_backward, atGradOutput.unsqueeze(-2), atInput.unsqueeze(-2)); + auto atGradInput = atGrad2d.squeeze(-2); + + impl::aten::updateATen2Tensor(ctx, atGradInput, grad_input); + + return diopiSuccess; +} + +diopiError_t diopiAdaptiveAvgPool1d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t output_size) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atOutSize = impl::aten::buildAtIntArray(output_size); + auto atOut = CALL_ATEN_FUNC(adaptive_avg_pool1d, atInput, atOutSize); + + impl::aten::updateATen2Tensor(ctx, atOut, out); + + return diopiSuccess; +} + +diopiError_t diopiPool1d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const char* mode, diopiSize_t ksize, + diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, const bool ceil_mode, const bool exclusive, const bool adaptive, + diopiSize_t output_size) { + impl::aten::setCurStream(ctx); + + if (adaptive == false && strcmp(mode, "max") == 0) { + return impl::cuda::diopiMaxPool1d(ctx, out, input, ksize, stride, padding, dilation, ceil_mode); + } else if (adaptive == false && strcmp(mode, "avg") == 0) { + return impl::cuda::diopiAvgPool1d(ctx, out, input, ksize, stride, padding, ceil_mode, !exclusive); + } else if (adaptive == true && strcmp(mode, "max") == 0) { + return impl::cuda::diopiAdaptiveMaxPool1d(ctx, out, input, output_size); + } else { + return impl::cuda::diopiAdaptiveAvgPool1d(ctx, out, input, output_size); + } +} + +diopiError_t diopiPool1dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, + const char* mode, diopiSize_t ksize, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, const bool ceil_mode, + const bool exclusive, const bool adaptive, diopiConstTensorHandle_t indices) { + impl::aten::setCurStream(ctx); + + if (adaptive == false && strcmp(mode, "max") == 0) { + return impl::cuda::diopiMaxPool1dBackward(ctx, grad_input, grad_output, input, ksize, stride, padding, dilation, ceil_mode, indices); + } else if (adaptive == false && strcmp(mode, "avg") == 0) { + return impl::cuda::diopiAvgPool1dBackward(ctx, grad_input, grad_output, input, ksize, stride, padding, ceil_mode, !exclusive); + } else if (adaptive == true && strcmp(mode, "max") == 0) { + return impl::cuda::diopiAdaptiveMaxPool1dBackward(ctx, grad_input, grad_output, input, indices); + } else { + return impl::cuda::diopiAdaptiveAvgPool1dBackward(ctx, grad_input, grad_output, input); + } +} + diopiError_t diopiMaxPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, bool ceil_mode) { impl::aten::setCurStream(ctx); @@ -133,6 +330,29 @@ diopiError_t diopiMaxPool2dWithIndices(diopiContextHandle_t ctx, diopiTensorHand return diopiSuccess; } +// TODO +diopiError_t diopiPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const char* mode, diopiSize_t kernel_size, + diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, bool ceil_mode, bool exclusive, bool adaptive) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + 
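+    // One possible completion (sketch, mirroring diopiPool1d above): ("max", !adaptive) -> diopiMaxPool2d,
+    // ("avg", !adaptive) -> diopiAvgPool2d with count_include_pad = !exclusive,
+    // ("max", adaptive) -> diopiAdaptiveMaxPool2d, otherwise diopiAdaptiveAvgPool2d; the adaptive
+    // cases would also need an output size, as in the overload added later in this file.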
at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); + at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); + at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); + at::IntArrayRef atDilation = impl::aten::buildAtIntArray(dilation); + bool atCeilMode = ceil_mode; + at::Tensor atOut = {}; + if (strcmp(mode, "max") == 0 && adaptive) { + } + if (strcmp(mode, "max") == 0 && !adaptive) { + } + if (strcmp(mode, "avg") == 0 && adaptive) { + } + if (strcmp(mode, "avg") == 0 && !adaptive) { + } + + return diopiSuccess; +} + /** * @brief * @param rounding_mode supported in pytorch>=1.8 @@ -755,6 +975,60 @@ diopiError_t diopiSort(diopiContextHandle_t ctx, diopiTensorHandle_t values, dio return diopiSuccess; } +diopiError_t diopiSortBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, int64_t dim, + diopiConstTensorHandle_t indices, diopiSize_t sizes, bool keepdim = 1) { + impl::aten::setCurStream(ctx); + auto atGrad_input = impl::aten::buildATen(grad_input); + auto atGrad_output = impl::aten::buildATen(grad_output); + auto atIndices = impl::aten::buildATen(indices); + auto atSizes = impl::aten::buildAtIntArray(sizes); + atGrad_input = CALL_ATEN_FUNC(value_selecting_reduction_backward, atGrad_output, dim, atIndices, atSizes, keepdim); + impl::aten::updateATen2Tensor(ctx, atGrad_input, grad_input); + + return diopiSuccess; +} + +diopiError_t diopiComplex(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t real, diopiConstTensorHandle_t imag) { + impl::aten::setCurStream(ctx); + auto atReal = impl::aten::buildATen(real); + auto atImag = impl::aten::buildATen(imag); + auto atOut = impl::aten::buildATen(out); + atOut = torch::complex(atReal, atImag); + impl::aten::updateATen2Tensor(ctx, atOut, out); + + return diopiSuccess; +} + +diopiError_t diopiConj(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + atOut = torch::conj(atInput); + impl::aten::updateATen2Tensor(ctx, atOut, out); + + return diopiSuccess; +} + +diopiError_t diopiImag(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + atOut = torch::imag(atInput); + impl::aten::updateATen2Tensor(ctx, atOut, out); + + return diopiSuccess; +} + +diopiError_t diopiReal(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + atOut = torch::real(atInput); + impl::aten::updateATen2Tensor(ctx, atOut, out); + + return diopiSuccess; +} + diopiError_t diopiTopk(diopiContextHandle_t ctx, diopiTensorHandle_t values, diopiTensorHandle_t indices, diopiConstTensorHandle_t input, int64_t k, int64_t dim, bool largest, bool sorted) { impl::aten::setCurStream(ctx); @@ -830,6 +1104,40 @@ diopiError_t diopiCosInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { return diopiSuccess; } +diopiError_t diopiAcos(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(acos_out, atOut, atInput); + + return diopiSuccess; +} + 
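+// Worked example for diopiSortBackward above (illustrative): sorting x = [3, 1, 2] along dim 0
+// yields values [1, 2, 3] and indices [1, 2, 0]; value_selecting_reduction_backward then scatters
+// grad_output back through those indices, i.e. grad_input[indices[i]] = grad_output[i].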
+diopiError_t diopiAcosInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + CALL_ATEN_CUDA_FUNC(acos_, atInput); + + return diopiSuccess; +} + +diopiError_t diopiTan(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(tan_out, atOut, atInput); + + return diopiSuccess; +} + +diopiError_t diopiTanInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + CALL_ATEN_CUDA_FUNC(tan_, atInput); + + return diopiSuccess; +} + diopiError_t diopiAbs(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); @@ -924,6 +1232,40 @@ diopiError_t diopiSign(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC return diopiSuccess; } +diopiError_t diopiSinh(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(sinh_out, atOut, atInput); + + return diopiSuccess; +} + +diopiError_t diopiSinhInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + CALL_ATEN_CUDA_FUNC(sinh_, atInput); + + return diopiSuccess; +} + +diopiError_t diopiCosh(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(cosh_out, atOut, atInput); + + return diopiSuccess; +} + +diopiError_t diopiCoshInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + CALL_ATEN_CUDA_FUNC(cosh_, atInput); + + return diopiSuccess; +} + diopiError_t diopiTanh(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); @@ -958,6 +1300,57 @@ diopiError_t diopiAtanInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { return diopiSuccess; } +diopiError_t diopiAsinh(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(asinh_out, atOut, atInput); + + return diopiSuccess; +} + +diopiError_t diopiAsinhInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + CALL_ATEN_CUDA_FUNC(asinh_, atInput); + + return diopiSuccess; +} + +diopiError_t diopiAcosh(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(acosh_out, atOut, atInput); + + return diopiSuccess; +} + +diopiError_t diopiAcoshInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + CALL_ATEN_CUDA_FUNC(acosh_, atInput); + + return 
diopiSuccess; +} + +diopiError_t diopiAtanh(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(atanh_out, atOut, atInput); + + return diopiSuccess; +} + +diopiError_t diopiAtanhInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + CALL_ATEN_CUDA_FUNC(atanh_, atInput); + + return diopiSuccess; +} + diopiError_t diopiSigmoid(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); @@ -1018,6 +1411,23 @@ diopiError_t diopiExpInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { return diopiSuccess; } +diopiError_t diopiExpm1(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(expm1_out, atOut, atInput); + + return diopiSuccess; +} + +diopiError_t diopiExpm1Inp(diopiContextHandle_t ctx, diopiTensorHandle_t input) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + CALL_ATEN_CUDA_FUNC(expm1_, atInput); + + return diopiSuccess; +} + diopiError_t diopiLog(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) { impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); @@ -1900,6 +2310,38 @@ diopiError_t diopiAvgPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, d return diopiSuccess; } +diopiError_t diopiPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const char* mode, diopiSize_t ksize, + diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, const bool ceil_mode, const bool exclusive, const bool adaptive, + diopiSize_t output_size) { + impl::aten::setCurStream(ctx); + + if (adaptive == false && strcmp(mode, "max") == 0) { + return impl::cuda::diopiMaxPool2d(ctx, out, input, ksize, stride, padding, dilation, ceil_mode); + } else if (adaptive == false && strcmp(mode, "avg") == 0) { + return impl::cuda::diopiAvgPool2d(ctx, out, input, ksize, stride, padding, ceil_mode, !exclusive, nullptr); + } else if (adaptive == true && strcmp(mode, "max") == 0) { + return impl::cuda::diopiAdaptiveMaxPool2d(ctx, out, input, output_size); + } else { + return impl::cuda::diopiAdaptiveAvgPool2d(ctx, out, input, output_size); + } +} + +diopiError_t diopiPool2dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, + const char* mode, diopiSize_t ksize, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, const bool ceil_mode, + const bool exclusive, const bool adaptive, diopiConstTensorHandle_t indices) { + impl::aten::setCurStream(ctx); + + if (adaptive == false && strcmp(mode, "max") == 0) { + return impl::cuda::diopiMaxPool2dBackward(ctx, grad_input, grad_output, input, ksize, stride, padding, dilation, ceil_mode, indices); + } else if (adaptive == false && strcmp(mode, "avg") == 0) { + return impl::cuda::diopiAvgPool2dBackward(ctx, grad_input, grad_output, input, ksize, stride, padding, ceil_mode, !exclusive, nullptr); + } else if (adaptive == true && strcmp(mode, "max") == 0) { + return impl::cuda::diopiAdaptiveMaxPool2dBackward(ctx, grad_input, 
grad_output, input, indices); + } else { + return impl::cuda::diopiAdaptiveAvgPool2dBackward(ctx, grad_input, grad_output, input); + } +} + diopiError_t diopiDropout(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t mask, diopiConstTensorHandle_t input, double p, bool train, diopiGeneratorHandle_t generator) { impl::aten::setCurStream(ctx); @@ -2246,7 +2688,79 @@ diopiError_t diopiConvolution2dBackward(diopiContextHandle_t ctx, diopiTensorHan auto grad_input_mask = std::array{true, true, false}; auto atOut = CALL_ATEN_FUNC(miopen_convolution_backward, atInput, atGrad, atWeight, atPadding, atStride, atDilation, groups, false, false, grad_input_mask); updateATen2Tensor(ctx, atOut, vecOut); - if (bias_sizes && grad_bias) { + if (bias_sizes && grad_bias) { + auto atGradBias = impl::aten::buildATen(grad_bias); + at::Tensor atTmp = atGrad; + int64_t size = atGrad.dim() - 1; + while (atGradBias.dim() != size) { + atTmp = at::sum(atTmp, -1, false); + size -= 1; + } + CALL_ATEN_CUDA_FUNC(sum_out, atGradBias, atTmp, 0, false); + } +#else + std::vector outputPadding(padding.len, 0); + if (grad_input && grad_weight && grad_bias && bias_sizes) { + // TODO(ywt): when pytorch fix the bug of empty tensor, remove the + // check of grad_input && grad_weight + auto atBiasSizes = impl::aten::buildAtIntArray(bias_sizes); + auto atGradInput = impl::aten::buildATen(grad_input); + auto atGradWeight = impl::aten::buildATen(grad_weight); + auto atGradBias = impl::aten::buildATen(grad_bias); + auto tempOut = CALL_ATEN_CUDA_FUNC( + convolution_backward, atGrad, atInput, atWeight, atBiasSizes, atStride, atPadding, atDilation, false, outputPadding, groups, {true, true, true}); + at::native::copy_(atGradInput, std::get<0>(tempOut), true); + at::native::copy_(atGradWeight, std::get<1>(tempOut), true); + at::native::copy_(atGradBias, std::get<2>(tempOut), true); + } else { + auto results = at::convolution_backward( + atGrad, atInput, atWeight, c10::nullopt, atStride, atPadding, atDilation, false, outputPadding, groups, {true, true, false}); + impl::aten::updateATen2Tensor(ctx, std::get<0>(results), grad_input); + impl::aten::updateATen2Tensor(ctx, std::get<1>(results), grad_weight); + if (bias_sizes && grad_bias) { + auto atGradBias = impl::aten::buildATen(grad_bias); + at::Tensor atTmp = atGrad; + int64_t size = atGrad.dim() - 1; + while (atGradBias.dim() != size) { + atTmp = at::sum(atTmp, -1, false); + size -= 1; + } + CALL_ATEN_CUDA_FUNC(sum_out, atGradBias, atTmp, 0, false); + } + } +#endif + + return diopiSuccess; +} + +diopiError_t diopiConvTranspose2dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiTensorHandle_t grad_weight, + diopiTensorHandle_t grad_bias, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, + diopiConstTensorHandle_t weight, diopiSize_t* bias_sizes, diopiSize_t stride, diopiSize_t padding, + diopiSize_t dilation, diopiSize_t output_padding, int64_t groups) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atGrad = impl::aten::buildATen(grad_output); + auto atWeight = impl::aten::buildATen(weight); + auto atStride = impl::aten::buildAtIntArray(stride); + auto atPadding = impl::aten::buildAtIntArray(padding); + auto atOutputPadding = impl::aten::buildAtIntArray(output_padding); + auto atDilation = impl::aten::buildAtIntArray(dilation); +#ifdef USE_HIP + auto grad_input_mask = std::array{true, true, false}; + auto atOut = CALL_ATEN_FUNC(miopen_convolution_transpose_backward, + atInput, + atGrad, + 
atWeight, + atPadding, + atOutputPadding, + atStride, + atDilation, + groups, + false, + false, + grad_input_mask); + updateATen2Tensor(ctx, atOut, vecOut); + if (bias_sizes != nullptr && grad_bias != nullptr) { auto atGradBias = impl::aten::buildATen(grad_bias); at::Tensor atTmp = atGrad; int64_t size = atGrad.dim() - 1; @@ -2257,25 +2771,22 @@ diopiError_t diopiConvolution2dBackward(diopiContextHandle_t ctx, diopiTensorHan CALL_ATEN_CUDA_FUNC(sum_out, atGradBias, atTmp, 0, false); } #else - std::vector outputPadding(padding.len, 0); if (grad_input && grad_weight && grad_bias && bias_sizes) { - // TODO(ywt): when pytorch fix the bug of empty tensor, remove the - // check of grad_input && grad_weight auto atBiasSizes = impl::aten::buildAtIntArray(bias_sizes); auto atGradInput = impl::aten::buildATen(grad_input); auto atGradWeight = impl::aten::buildATen(grad_weight); auto atGradBias = impl::aten::buildATen(grad_bias); auto tempOut = CALL_ATEN_CUDA_FUNC( - convolution_backward, atGrad, atInput, atWeight, atBiasSizes, atStride, atPadding, atDilation, false, outputPadding, groups, {true, true, true}); + convolution_backward, atGrad, atInput, atWeight, atBiasSizes, atStride, atPadding, atDilation, true, atOutputPadding, groups, {true, true, true}); at::native::copy_(atGradInput, std::get<0>(tempOut), true); at::native::copy_(atGradWeight, std::get<1>(tempOut), true); at::native::copy_(atGradBias, std::get<2>(tempOut), true); } else { - auto results = at::convolution_backward( - atGrad, atInput, atWeight, c10::nullopt, atStride, atPadding, atDilation, false, outputPadding, groups, {true, true, false}); - impl::aten::updateATen2Tensor(ctx, std::get<0>(results), grad_input); - impl::aten::updateATen2Tensor(ctx, std::get<1>(results), grad_weight); - if (bias_sizes && grad_bias) { + auto grad_inputs = at::convolution_backward( + atGrad, atInput, atWeight, c10::nullopt, atStride, atPadding, atDilation, true, atOutputPadding, groups, {true, true, false}); + impl::aten::updateATen2Tensor(ctx, std::get<0>(grad_inputs), grad_input); + impl::aten::updateATen2Tensor(ctx, std::get<1>(grad_inputs), grad_weight); + if (bias_sizes != nullptr && grad_bias != nullptr) { auto atGradBias = impl::aten::buildATen(grad_bias); at::Tensor atTmp = atGrad; int64_t size = atGrad.dim() - 1; @@ -2291,7 +2802,7 @@ diopiError_t diopiConvolution2dBackward(diopiContextHandle_t ctx, diopiTensorHan return diopiSuccess; } -diopiError_t diopiConvTranspose2dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiTensorHandle_t grad_weight, +diopiError_t diopiConvTranspose3dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiTensorHandle_t grad_weight, diopiTensorHandle_t grad_bias, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiSize_t* bias_sizes, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, diopiSize_t output_padding, int64_t groups) { @@ -2832,6 +3343,21 @@ diopiError_t diopiMeshGrid(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, return diopiSuccess; } +diopiError_t diopiGridSample(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t grid, + const char* mode) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atGrid = impl::aten::buildATen(grid); + auto atOut = impl::aten::buildATen(out); + int interpolation_mode = 0; + if (strcmp(mode, "bilinear") != 0) { + interpolation_mode = 1; + } + atOut = 
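+    // interpolation_mode follows at::grid_sampler's convention: 0 = bilinear, 1 = nearest (any mode
+    // string other than "bilinear" falls back to nearest here); the trailing 0, 0 arguments pin
+    // padding_mode to zeros and align_corners to false.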
CALL_ATEN_FUNC(grid_sampler, atInput, atGrid, interpolation_mode, 0, 0); + impl::aten::updateATen2Tensor(ctx, atOut, out); + return diopiSuccess; +} + diopiError_t diopiFusedAdamW(diopiContextHandle_t ctx, diopiTensorHandle_t* params, diopiConstTensorHandle_t* grads, diopiTensorHandle_t* exp_avgs, diopiTensorHandle_t* exp_avg_sqs, diopiTensorHandle_t* max_exp_avg_sqs, diopiConstTensorHandle_t* state_steps, int64_t nums, float lr, float beta1, float beta2, float eps, float weight_decay, bool amsgrad, bool maximize) { @@ -2903,7 +3429,8 @@ diopiError_t diopiAdam(diopiContextHandle_t ctx, diopiTensorHandle_t param, diop if (weight_decay != 0) { grad_d = grad_d.add(atParam, weight_decay); } - atExpAvg.mul_(beta1).add_(grad_d, 1 - beta1); + // atExpAvg.mul_(beta1).add_(grad_d, 1 - beta1); + atExpAvg.lerp_(grad_d, 1 - beta1); atExpAvgSq.mul_(beta2).addcmul_(grad_d, grad_d.conj(), 1 - beta2); at::Tensor denom; @@ -2997,6 +3524,23 @@ diopiError_t diopiConvTranspose2d(diopiContextHandle_t ctx, diopiTensorHandle_t return diopiSuccess; } +diopiError_t diopiConvTranspose3d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, + diopiConstTensorHandle_t bias, diopiSize_t stride, diopiSize_t padding, diopiSize_t output_padding, int64_t groups, + diopiSize_t dilation) { + impl::aten::setCurStream(ctx); + auto atInput = impl::aten::buildATen(input); + auto atWeight = impl::aten::buildATen(weight); + auto atBias = impl::aten::buildATen(bias); + auto atStride = impl::aten::buildAtIntArray(stride); + auto atPadding = impl::aten::buildAtIntArray(padding); + auto atOutputPadding = impl::aten::buildAtIntArray(output_padding); + auto atDilation = impl::aten::buildAtIntArray(dilation); + auto atOut = CALL_ATEN_FUNC(conv_transpose3d, atInput, atWeight, atBias, atStride, atPadding, atOutputPadding, groups, atDilation); + impl::aten::updateATen2Tensor(ctx, atOut, out); + + return diopiSuccess; +} + diopiError_t diopiCumsum(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t dim) { impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); @@ -3006,6 +3550,15 @@ diopiError_t diopiCumsum(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop return diopiSuccess; } +diopiError_t diopiCumsumBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, int64_t dim) { + impl::aten::setCurStream(ctx); + auto atGradOutput = impl::aten::buildATen(grad_output); + auto atGradInput = atGradOutput; + impl::aten::updateATen2Tensor(ctx, atGradInput.flip(dim).cumsum(dim).flip(dim), grad_input); + + return diopiSuccess; +} + diopiError_t diopiCdist(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input1, diopiConstTensorHandle_t input2, double p, const int64_t* compute_mode) { impl::aten::setCurStream(ctx); @@ -3092,6 +3645,26 @@ diopiError_t diopiArgmax(diopiContextHandle_t ctx, diopiTensorHandle_t out, diop return diopiSuccess; } +diopiError_t diopiArgmin(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const int64_t* dim, bool keepdim) { + impl::aten::setCurStream(ctx); + auto atOut = impl::aten::buildATen(out); + auto atInput = impl::aten::buildATen(input); + c10::optional atDim = dim ? 
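+    // Worked example for diopiCumsumBackward above: since y_i = sum_{j<=i} x_j, the gradient is
+    // dL/dx_j = sum_{i>=j} g_i, which flip(dim).cumsum(dim).flip(dim) computes,
+    // e.g. g = [g0, g1, g2] -> [g0+g1+g2, g1+g2, g2].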
c10::optional(*dim) : c10::nullopt; + CALL_ATEN_CUDA_FUNC(argmin_out, atOut, atInput, atDim, keepdim); + + return diopiSuccess; +} + +diopiError_t diopiArgsort(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, bool stable, const int64_t* dim, bool descending) { + impl::aten::setCurStream(ctx); + auto atOut = impl::aten::buildATen(out); + auto atInput = impl::aten::buildATen(input); + atOut = CALL_ATEN_CUDA_FUNC(argsort, atInput, stable, (dim ? *dim : -1), descending); + impl::aten::updateATen2Tensor(ctx, atOut, out); + + return diopiSuccess; +} + diopiError_t diopiSmoothL1Loss(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t target, diopiReduction_t reduction, double beta) { impl::aten::setCurStream(ctx); @@ -3372,6 +3945,98 @@ diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiC return diopiSuccess; } +at::Tensor unsqueeze_multiple(const at::Tensor& t, at::OptionalIntArrayRef opt_dim, size_t n_dims) { + if (opt_dim.has_value()) { + at::IntArrayRef dim = opt_dim.value(); + auto dim_size = dim.size(); + // Optimisation for two common cases + if (dim_size == 0) { + return t; + } else if (dim_size == 1) { + return t.unsqueeze(dim[0]); + } + } + auto dims_to_unsqueeze = at::dim_list_to_bitset(opt_dim, n_dims); + at::Tensor res = t; + for (const auto i : c10::irange(n_dims)) { + if (dims_to_unsqueeze[i]) { + res = res.unsqueeze(i); + } + } + return res; +} + +at::Tensor norm_backward(at::Tensor grad, const at::Tensor& self, const std::optional& p_, at::Tensor norm, at::IntArrayRef dim, bool keepdim) { + // NB: We mask fill the NaNs in the output to be zero but still do float + // division + // by zero, which ASAN complains about. One way to appease ASAN is to fill + // the problematic values with something arbitrary before the division, + // but we decide not to due to the perf hit. 
Instead we just silence ASAN + // where necessary + size_t ndim = self.dim(); + double p = p_.value_or(2.0).toDouble(); + at::Tensor self_scaled; + at::Tensor scale_v; + + if (!keepdim && self.dim() != 0) { + grad = unsqueeze_multiple(grad, dim, ndim); + norm = unsqueeze_multiple(norm, dim, ndim); + } + + if (p == 0.0) { + return {}; + } else if (p == 1.0) { + return self.sgn() * grad; + } else if (p == 2.0) { + return grad * (self / norm).masked_fill_(norm == 0, 0); + } else if (std::isinf(p)) { + // Derivative of amax(abs(self), dim, keepdim) but respecting nans + // We create a mask of `argmax`: it's argmax if self.abs() == norm or it's + // NaN + auto self_abs = self.abs(); + auto mask = self_abs.eq(norm).logical_or(self_abs.isnan()); + return self.sgn() * ((grad / mask.sum(dim, true)) * mask); + } else if (p < 1.0) { + self_scaled = self.sgn() * self.abs().pow_(p - 1).masked_fill_(self == 0, 0); + return self_scaled * grad * norm.pow(1 - p); + } else if (p < 2.0) { + self_scaled = self.sgn() * self.abs().pow_(p - 1); + scale_v = grad / norm.pow(p - 1); + scale_v.masked_fill_(norm == 0, 0); + return self_scaled * scale_v; + } else { + self_scaled = self * self.abs().pow_(p - 2); + scale_v = grad / norm.pow(p - 1); + scale_v.masked_fill_(norm == 0, 0); + return self_scaled * scale_v; + } +} + +diopiError_t diopiNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t self, + diopiConstTensorHandle_t norm, diopiSize_t dim, const diopiScalar_t* p) { + impl::aten::setCurStream(ctx); + auto atGradOutput = impl::aten::buildATen(grad_output); + auto atSelf = impl::aten::buildATen(self); + auto atP = impl::aten::buildAtScalar(p); + auto atNorm = impl::aten::buildATen(norm); + + at::IntArrayRef atDim = impl::aten::buildAtIntArray(dim); + bool keepdim = false; + + if (atSelf.dim() == atNorm.dim()) { + keepdim = true; + } + + auto atGradInput = norm_backward(atGradOutput, atSelf, atP, atNorm, atDim, keepdim); + + if (!atGradInput.defined()) { + return diopiSuccess; + } + + impl::aten::updateATen2Tensor(ctx, atGradInput, grad_input); + return diopiSuccess; +} + diopiError_t diopiForeachnormScalar(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputSize, const diopiScalar_t* ord) { DIOPI_CHECK_PTR(outs); @@ -3489,6 +4154,31 @@ diopiError_t diopiLayerNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, d return diopiSuccess; } +diopiError_t diopiLayerNormGB(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t running_mean, diopiTensorHandle_t running_var, + diopiConstTensorHandle_t input, diopiConstTensorHandle_t scale, diopiConstTensorHandle_t bias, const double eps, + const int64_t begin_norm_axis) { + impl::aten::setCurStream(ctx); + + auto atOut = impl::aten::buildATen(out); + auto atMean = impl::aten::buildATen(running_mean); + auto atVar = impl::aten::buildATen(running_var); + + auto atInput = impl::aten::buildATen(input); + + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atBias, bias); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atScale, scale); + + at::IntArrayRef atNormalizedShape(atInput.sizes().begin() + begin_norm_axis, atInput.sizes().end()); + + diopi_tensor_list vecOut = {out, running_mean, running_var}; + + auto Out = CALL_ATEN_CUDA_FUNC(native_layer_norm, atInput, atNormalizedShape, atScale, atBias, eps); + + impl::aten::updateATen2Tensor(ctx, Out, vecOut); + + return diopiSuccess; +} + diopiError_t diopiLayerNormBackward(diopiContextHandle_t ctx, 
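+// Gradient sketch for norm_backward above: for p = 2, d||x||/dx = x / ||x|| (zeroed where the norm
+// is 0); for general p it is sign(x) * |x|^(p-1) / ||x||^(p-1), which is what the p < 2 and p >= 2
+// branches compute; for p = inf the gradient is split evenly over the elements attaining the max.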
diopiTensorHandle_t grad_input, diopiTensorHandle_t grad_weight, diopiTensorHandle_t grad_bias, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, diopiConstTensorHandle_t mean, diopiConstTensorHandle_t rstd, diopiSize_t normalized_shape) { @@ -3537,6 +4227,182 @@ diopiError_t diopiLayerNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_ return diopiSuccess; } +diopiError_t diopiLayerNormGBBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiTensorHandle_t grad_weight, diopiTensorHandle_t grad_bias, + diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, + diopiConstTensorHandle_t bias, diopiConstTensorHandle_t running_mean, diopiConstTensorHandle_t running_std, + const int64_t begin_norm_axis) { + impl::aten::setCurStream(ctx); + diopiDtype_t mDtype, rDtype; + if (running_std) { + diopiGetTensorDtype(running_std, &rDtype); + } + auto atGradOutput = impl::aten::buildATen(grad_output); + auto atInput = impl::aten::buildATen(input); + + at::IntArrayRef atNormalizedShape(atInput.sizes().begin() + begin_norm_axis, atInput.sizes().end()); + + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atWeight, weight); + DIOPI_IMPL_BUILD_ATEN_OPTIONAL(atBias, bias); + auto grad_input_mask = std::array{true, atWeight.has_value(), atBias.has_value()}; + + auto atSaveMean = impl::aten::buildATen(running_mean); + diopiGetTensorDtype(running_mean, &mDtype); + if (diopiDtype_t::diopi_dtype_float16 == mDtype) { + atSaveMean = at::native::to(atSaveMean, impl::aten::getATenType(diopiDtype_t::diopi_dtype_float32).toScalarType(), false, true, c10::nullopt); + } + auto atSaveVar = impl::aten::buildATen(running_std); + diopiGetTensorDtype(running_std, &rDtype); + if (diopiDtype_t::diopi_dtype_float16 == rDtype) { + atSaveVar = at::native::to(atSaveVar, impl::aten::getATenType(diopiDtype_t::diopi_dtype_float32).toScalarType(), false, true, c10::nullopt); + } + + if (grad_input && grad_weight && grad_bias) { + auto atGradInput = impl::aten::buildATen(grad_input); + auto atGradWeight = impl::aten::buildATen(grad_weight); + auto atGradBias = impl::aten::buildATen(grad_bias); + at::native_layer_norm_backward_out( + atGradInput, atGradWeight, atGradBias, atGradOutput, atInput, atNormalizedShape, atSaveMean, atSaveVar, atWeight, atBias, grad_input_mask); + } else { + auto atOut = at::native_layer_norm_backward(atGradOutput, atInput, atNormalizedShape, atSaveMean, atSaveVar, atWeight, atBias, grad_input_mask); + if (grad_input) { + impl::aten::updateATen2Tensor(ctx, std::get<0>(atOut), grad_input); + } + if (grad_weight) { + impl::aten::updateATen2Tensor(ctx, std::get<1>(atOut), grad_weight); + } + if (grad_bias) { + impl::aten::updateATen2Tensor(ctx, std::get<2>(atOut), grad_bias); + } + } + + return diopiSuccess; +} + +diopiError_t diopiInstanceNorm(diopiContextHandle_t ctx, diopiTensorHandle_t output, diopiConstTensorHandle_t input, const int64_t axis, + diopiConstTensorHandle_t scale, diopiConstTensorHandle_t bias, const double eps) { + impl::aten::setCurStream(ctx); + + auto atInput = impl::aten::buildATen(input); + auto atScale = impl::aten::buildATen(scale); + auto atBias = impl::aten::buildATen(bias); + auto input_size = atInput.sizes().vec(); + + std::vector reshaped_size = {1}; + int64_t shape = 1; + for (int i = 0; i < std::min(axis, (int64_t)input_size.size()); i++) { + shape = shape * input_size[i]; + } + reshaped_size.push_back(shape); + for (int i = axis; i < 
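+    // Reshape sketch (assuming axis = 2 and NCHW input): (N, C, H, W) is viewed as (1, N*C, H, W)
+    // and scale/bias are tiled N times, so batch_norm in training mode normalizes every (n, c)
+    // feature map independently, which is exactly instance norm.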
input_size.size(); i++) { + reshaped_size.push_back(input_size[i]); + } + + auto atInputReshaped = atInput.contiguous().view(reshaped_size); + auto atScale_ = atScale.repeat(input_size[0]); + auto atBias_ = atBias.repeat(input_size[0]); + + // auto atRunningMean = torch::empty({reshaped_size[1]}, torch::TensorOptions().device(torch::kCUDA)); + // auto atRunningStd = torch::empty({reshaped_size[1]}, torch::TensorOptions().device(torch::kCUDA)); + + auto atOutput = CALL_ATEN_FUNC(batch_norm, atInputReshaped, atScale_, atBias_, c10::nullopt, c10::nullopt, true, 0.0, eps, false); + impl::aten::updateATen2Tensor(ctx, atOutput, output); + return diopiSuccess; +} + +diopiError_t diopiInstanceNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiTensorHandle_t grad_scale, diopiTensorHandle_t grad_bias, + diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiConstTensorHandle_t scale, + diopiConstTensorHandle_t bias, const int64_t axis, const double eps) { + impl::aten::setCurStream(ctx); + + auto atGradOutput = impl::aten::buildATen(grad_output); + auto atInput = impl::aten::buildATen(input); + auto atScale = impl::aten::buildATen(scale); + auto atBias = impl::aten::buildATen(bias); + auto input_size = atInput.sizes().vec(); + + auto atScale_ = atScale.repeat(input_size[0]); + + std::vector reshaped_size = {1}; + int64_t shape = 1; + for (int i = 0; i < std::min(axis, (int64_t)input_size.size()); i++) { + shape = shape * input_size[i]; + } + reshaped_size.push_back(shape); + for (int i = axis; i < input_size.size(); i++) { + reshaped_size.push_back(input_size[i]); + } + + std::vector mean_dim = {0}; + for (int i = 2; i < reshaped_size.size(); i++) { + mean_dim.push_back(i); + } + + auto atInputReshaped = atInput.contiguous().view(reshaped_size); + auto atGradOutputReshaped = atGradOutput.contiguous().view(reshaped_size); + + auto atMean = torch::mean(atInputReshaped, mean_dim); + auto atStd = torch::std(atInputReshaped, mean_dim); + + auto grad_input_mask = std::array{grad_input != nullptr, grad_scale != nullptr, grad_bias != nullptr}; + + auto atOut = + at::native_batch_norm_backward(atGradOutputReshaped, atInputReshaped, atScale_, c10::nullopt, c10::nullopt, atMean, atStd, true, eps, grad_input_mask); + + if (grad_input) { + impl::aten::updateATen2Tensor(ctx, std::get<0>(atOut), grad_input); + } + if (grad_scale) { + auto atGradScaleOrigin = torch::zeros_like(atScale); + for (int i = 0; i < std::get<1>(atOut).size(0); i++) { + atGradScaleOrigin[i % (atScale_.size(0) / input_size[0])] += std::get<1>(atOut)[i]; + } + impl::aten::updateATen2Tensor(ctx, atGradScaleOrigin, grad_scale); + } + if (grad_bias) { + auto atGradBiasOrigin = torch::zeros_like(atBias); + for (int i = 0; i < std::get<2>(atOut).size(0); i++) { + atGradBiasOrigin[i % (atScale_.size(0) / input_size[0])] += std::get<2>(atOut)[i]; + } + impl::aten::updateATen2Tensor(ctx, atGradBiasOrigin, grad_bias); + } + + return diopiSuccess; +} + +diopiError_t diopiNormalize(diopiContextHandle_t ctx, diopiTensorHandle_t output, diopiConstTensorHandle_t input, const float p, const int64_t axis, + const double eps) { + impl::aten::setCurStream(ctx); + + auto atInput = impl::aten::buildATen(input); + + auto atOut = impl::aten::buildATen(output); + + auto atDenom = atInput.norm(p, axis, true).clamp_min(eps).expand_as(atInput); + + CALL_ATEN_FUNC(div_out, atOut, atInput, atDenom); + + return diopiSuccess; +} + +diopiError_t diopiNormalizeBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, 
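+    // Derivation sketch for the body below: with d = max(||x||_p over `axis`, eps) and y = x / d,
+    // dL/dx = g / d + (dL/dd) * dd/dx, where dL/dd = sum(-x * g / d^2) over `axis` and dd/dx is the
+    // p-norm gradient, gated to 0 wherever the clamp at eps is active.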
diopiConstTensorHandle_t grad_output, + diopiConstTensorHandle_t input, const float p, const int64_t axis, const double eps) { + impl::aten::setCurStream(ctx); + + auto atGradOutput = impl::aten::buildATen(grad_output); + auto atInput = impl::aten::buildATen(input); + auto atNorm = atInput.norm(p, axis, true); + auto atClamp = atNorm.clamp_min(eps); + auto atDenom = atClamp.expand_as(atInput); + auto atGradDenom = atInput * (-1 / atDenom / atDenom) * atGradOutput; + auto atGradClamp = atGradDenom.sum(axis, true); + auto atGradNorm = atGradClamp.masked_fill_(atNorm < eps, 0); + auto atGradOriginInput = norm_backward(atGradNorm, atInput, p, atNorm, axis, true); + auto atGradInput = (1 / atDenom) * atGradOutput + atGradOriginInput; + + impl::aten::updateATen2Tensor(ctx, atGradInput, grad_input); + return diopiSuccess; +} + diopiError_t diopiAdaptiveAvgPool3d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t output_size) { impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); @@ -3641,6 +4507,69 @@ diopiError_t diopiMaxPool3dBackward(diopiContextHandle_t ctx, diopiTensorHandle_ return diopiSuccess; } +diopiError_t diopiAvgPool3d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, + diopiSize_t padding, bool ceil_mode, bool count_include_pad, const int64_t* divisor_override) { + impl::aten::setCurStream(ctx); + + auto atInput = impl::aten::buildATen(input); + at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); + at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); + at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); + c10::optional atDivisorOverride = divisor_override ? c10::optional(*divisor_override) : c10::nullopt; + auto atOut = impl::aten::buildATen(out); + CALL_ATEN_CUDA_FUNC(avg_pool3d_out, atOut, atInput, atKernelSize, atStride, atPadding, ceil_mode, count_include_pad, atDivisorOverride); + + return diopiSuccess; +} + +diopiError_t diopiAvgPool3dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, + diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, bool ceil_mode, + bool count_include_pad, const int64_t* divisor_override) { + impl::aten::setCurStream(ctx); + auto atGradOutput = impl::aten::buildATen(grad_output); + auto atInput = impl::aten::buildATen(input); + at::IntArrayRef atKernelSize = impl::aten::buildAtIntArray(kernel_size); + at::IntArrayRef atStride = impl::aten::buildAtIntArray(stride); + at::IntArrayRef atPadding = impl::aten::buildAtIntArray(padding); + c10::optional atDivisorOverride = divisor_override ? 
c10::optional(*divisor_override) : c10::nullopt; + auto atGradInput = impl::aten::buildATen(grad_input); + CALL_ATEN_CUDA_FUNC( + avg_pool3d_backward_out, atGradInput, atGradOutput, atInput, atKernelSize, atStride, atPadding, ceil_mode, count_include_pad, atDivisorOverride); + return diopiSuccess; +} + +diopiError_t diopiPool3d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const char* mode, diopiSize_t ksize, + diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, const bool ceil_mode, const bool exclusive, const bool adaptive, + diopiSize_t output_size) { + impl::aten::setCurStream(ctx); + + if (adaptive == false && strcmp(mode, "max") == 0) { + return impl::cuda::diopiMaxPool3d(ctx, out, input, ksize, stride, padding, dilation, ceil_mode); + } else if (adaptive == false && strcmp(mode, "avg") == 0) { + return impl::cuda::diopiAvgPool3d(ctx, out, input, ksize, stride, padding, ceil_mode, !exclusive, nullptr); + } else if (adaptive == true && strcmp(mode, "max") == 0) { + return impl::cuda::diopiAdaptiveMaxPool3d(ctx, out, input, output_size); + } else { + return impl::cuda::diopiAdaptiveAvgPool3d(ctx, out, input, output_size); + } +} + +diopiError_t diopiPool3dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, + const char* mode, diopiSize_t ksize, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, const bool ceil_mode, + const bool exclusive, const bool adaptive, diopiConstTensorHandle_t indices) { + impl::aten::setCurStream(ctx); + + if (adaptive == false && strcmp(mode, "max") == 0) { + return impl::cuda::diopiMaxPool3dBackward(ctx, grad_input, grad_output, input, ksize, stride, padding, dilation, ceil_mode, indices); + } else if (adaptive == false && strcmp(mode, "avg") == 0) { + return impl::cuda::diopiAvgPool3dBackward(ctx, grad_input, grad_output, input, ksize, stride, padding, ceil_mode, !exclusive, nullptr); + } else if (adaptive == true && strcmp(mode, "max") == 0) { + return impl::cuda::diopiAdaptiveMaxPool3dBackward(ctx, grad_input, grad_output, input, indices); + } else { + return impl::cuda::diopiAdaptiveAvgPool3dBackward(ctx, grad_input, grad_output, input); + } +} + diopiError_t diopiPermute(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t dims) { impl::aten::setCurStream(ctx); auto atInput = impl::aten::buildATen(input); diff --git a/proto/include/diopi/functions.h b/proto/include/diopi/functions.h index bbedfa525..4f7dfcecb 100644 --- a/proto/include/diopi/functions.h +++ b/proto/include/diopi/functions.h @@ -302,6 +302,172 @@ DIOPI_API diopiError_t diopiLeakyReluInp(diopiContextHandle_t ctx, diopiTensorHa DIOPI_API diopiError_t diopiLeakyReluBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, const diopiScalar_t* negative_slope, bool input_is_result); +/** + * @brief Performs 1D max pooling on the input tensor. + * @param[in] ctx Context environment. + * @param[out] out the output tensor after max pooling. + * @param[in] input the input tensor. + * @param[in] kernel_size the size of the pooling window. + * @param[in] stride the stride of the pooling window. + * @param[in] padding implicit padding added to the input. + * @param[in] dilation the spacing between elements in the pooling window. + * @param[in] ceil_mode whether to use ceil instead of floor for output shape calculation. 
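+ *
+ * Illustrative usage sketch (not normative): the tensor handles come from the caller, and
+ * diopiSize_t is assumed here to be a {data pointer, length} pair.
+ * @code{.cpp}
+ * int64_t k[1] = {3}, s[1] = {2}, p[1] = {1}, d[1] = {1};
+ * diopiSize_t kernel_size{k, 1}, stride{s, 1}, padding{p, 1}, dilation{d, 1};
+ * // `out` must be pre-allocated with length floor((L + 2*padding - dilation*(kernel-1) - 1) / stride) + 1.
+ * diopiMaxPool1d(ctx, out, input, kernel_size, stride, padding, dilation, /*ceil_mode=*/false);
+ * @endcode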
+ */ +DIOPI_API diopiError_t diopiMaxPool1d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t kernel_size, + diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, bool ceil_mode); + +/** + * @brief Performs the backward pass for diopiMaxPool1d(). Computes gradients for input. + * @param[in] ctx Context environment. + * @param[out] grad_input the gradient tensor of input. + * @param[in] grad_output the gradient tensor of the output. + * @param[in] input the input tensor. + * @param[in] kernel_size the size of the pooling window. + * @param[in] stride the stride of the pooling window. + * @param[in] padding implicit padding added to the input. + * @param[in] dilation the spacing between elements in the pooling window. + * @param[in] ceil_mode whether to use ceil instead of floor for output shape calculation. + * @param[in] indices the tensor storing the indices of max elements from the forward pass. + */ +DIOPI_API diopiError_t diopiMaxPool1dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, + diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, + diopiSize_t dilation, bool ceil_mode, diopiConstTensorHandle_t indices); + +/** + * @brief Performs 1D max pooling on the input tensor and returns the indices of max elements. + * @param[in] ctx Context environment. + * @param[out] out the output tensor after max pooling. + * @param[out] indices the tensor storing the indices of max elements. + * @param[in] input the input tensor. + * @param[in] kernel_size the size of the pooling window. + * @param[in] stride the stride of the pooling window. + * @param[in] padding implicit padding added to the input. + * @param[in] dilation the spacing between elements in the pooling window. + * @param[in] ceil_mode whether to use ceil instead of floor for output shape calculation. + */ +DIOPI_API diopiError_t diopiMaxPool1dWithIndices(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t indices, diopiConstTensorHandle_t input, + diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, bool ceil_mode); + +/** + * @brief Performs 1D average pooling on the input tensor. + * @param[in] ctx Context environment. + * @param[out] out the output tensor after average pooling. + * @param[in] input the input tensor. + * @param[in] kernel_size the size of the pooling window. + * @param[in] stride the stride of the pooling window. + * @param[in] padding implicit padding added to the input. + * @param[in] ceil_mode whether to use ceil instead of floor for output shape calculation. + * @param[in] count_include_pad whether to include padding in the count for averaging. + */ +DIOPI_API diopiError_t diopiAvgPool1d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t kernel_size, + diopiSize_t stride, diopiSize_t padding, bool ceil_mode, bool count_include_pad); + +/** + * @brief Performs the backward pass for diopiAvgPool1d(). Computes gradients for input. + * @param[in] ctx Context environment. + * @param[out] grad_input the gradient tensor of input. + * @param[in] grad_output the gradient tensor of the output. + * @param[in] input the input tensor. + * @param[in] kernel_size the size of the pooling window. + * @param[in] stride the stride of the pooling window. + * @param[in] padding implicit padding added to the input. 
+ * @param[in] ceil_mode whether to use ceil instead of floor for output shape calculation. + * @param[in] count_include_pad whether to include padding in the count for averaging. + */ +DIOPI_API diopiError_t diopiAvgPool1dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, + diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, bool ceil_mode, + bool count_include_pad); + +/** + * @brief Performs 1D adaptive max pooling on the input tensor. + * @param[in] ctx Context environment. + * @param[out] out the output tensor after adaptive max pooling. + * @param[in] input the input tensor. + * @param[in] output_size the size of the output after pooling. + */ +DIOPI_API diopiError_t diopiAdaptiveMaxPool1d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t output_size); + +/** + * @brief Performs 1D adaptive max pooling on the input tensor and returns the indices of max elements. + * @param[in] ctx Context environment. + * @param[out] out the output tensor after adaptive max pooling. + * @param[out] indices the tensor storing the indices of max elements. + * @param[in] input the input tensor. + * @param[in] output_size the size of the output after pooling. + */ +DIOPI_API diopiError_t diopiAdaptiveMaxPool1dWithIndices(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t indices, + diopiConstTensorHandle_t input, diopiSize_t output_size); + +/** + * @brief Performs the backward pass for diopiAdaptiveMaxPool1d(). Computes gradients for input. + * @param[in] ctx Context environment. + * @param[out] grad_input the gradient tensor of input. + * @param[in] grad_output the gradient tensor of the output. + * @param[in] input the input tensor. + * @param[in] indices the tensor storing the indices of max elements from the forward pass. + */ +DIOPI_API diopiError_t diopiAdaptiveMaxPool1dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, + diopiConstTensorHandle_t input, diopiConstTensorHandle_t indices); + +/** + * @brief Performs the backward pass for diopiAdaptiveAvgPool1d(). Computes gradients for input. + * @param[in] ctx Context environment. + * @param[out] grad_input the gradient tensor of input. + * @param[in] grad_output the gradient tensor of the output. + * @param[in] input the input tensor. + */ +DIOPI_API diopiError_t diopiAdaptiveAvgPool1dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, + diopiConstTensorHandle_t input); + +/** + * @brief Performs 1D adaptive average pooling on the input tensor. + * @param[in] ctx Context environment. + * @param[out] out the output tensor after adaptive average pooling. + * @param[in] input the input tensor. + * @param[in] output_size the size of the output after pooling. + */ +DIOPI_API diopiError_t diopiAdaptiveAvgPool1d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t output_size); + +/** + * @brief General 1D pooling operation with support for multiple pooling modes. + * @param[in] ctx Context environment. + * @param[out] out the output tensor after pooling. + * @param[in] input the input tensor. + * @param[in] mode the pooling mode, such as "max" or "avg". + * @param[in] ksize the size of the pooling window. + * @param[in] stride the stride of the pooling window. + * @param[in] padding implicit padding added to the input. 
+ * @param[in] dilation the spacing between elements in the pooling window. + * @param[in] ceil_mode whether to use ceil instead of floor for output shape calculation. + * @param[in] exclusive whether to exclude padding when averaging. + * @param[in] adaptive whether to use adaptive pooling. + * @param[in] output_size the size of the output after pooling (if adaptive pooling is used). + */ +DIOPI_API diopiError_t diopiPool1d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const char* mode, diopiSize_t ksize, + diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, const bool ceil_mode, const bool exclusive, + const bool adaptive, diopiSize_t output_size); + +/** + * @brief Performs the backward pass for diopiPool1d(). Computes gradients for input. + * @param[in] ctx Context environment. + * @param[out] grad_input the gradient tensor of input. + * @param[in] grad_output the gradient tensor of the output. + * @param[in] input the input tensor. + * @param[in] mode the pooling mode, such as "max" or "avg". + * @param[in] ksize the size of the pooling window. + * @param[in] stride the stride of the pooling window. + * @param[in] padding implicit padding added to the input. + * @param[in] dilation the spacing between elements in the pooling window. + * @param[in] ceil_mode whether to use ceil instead of floor for output shape calculation. + * @param[in] exclusive whether to exclude padding when averaging. + * @param[in] adaptive whether to use adaptive pooling. + * @param[in] indices the tensor storing the indices of max elements from the forward pass (if max pooling is used). + */ +DIOPI_API diopiError_t diopiPool1dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, + diopiConstTensorHandle_t input, const char* mode, diopiSize_t ksize, diopiSize_t stride, diopiSize_t padding, + diopiSize_t dilation, const bool ceil_mode, const bool exclusive, const bool adaptive, + diopiConstTensorHandle_t indices); + /** * @brief Applies 2D average-pooling operation in kH×kW regions by step size sH×sW steps. * @param[in] ctx Context environment. @@ -387,6 +553,45 @@ DIOPI_API diopiError_t diopiMaxPool2dWithIndices(diopiContextHandle_t ctx, diopi DIOPI_API diopiError_t diopiMaxPool2dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, bool ceil_mode, diopiConstTensorHandle_t indices); +/** + * @brief General 2D pooling operation with support for multiple pooling modes. + * @param[in] ctx Context environment. + * @param[out] out the output tensor after pooling. + * @param[in] input the input tensor. + * @param[in] mode the pooling mode, such as "max" or "avg". + * @param[in] ksize the size of the pooling window. + * @param[in] stride the stride of the pooling window. + * @param[in] padding implicit padding added to the input. + * @param[in] dilation the spacing between elements in the pooling window. + * @param[in] ceil_mode whether to use ceil instead of floor for output shape calculation. + * @param[in] exclusive whether to exclude padding when averaging. + * @param[in] adaptive whether to use adaptive pooling. + * @param[in] output_size the size of the output after pooling (if adaptive pooling is used). 
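+ *
+ * Illustrative call sketch (not normative): handles are created by the caller, diopiSize_t is
+ * assumed here to be a {data pointer, length} pair, and output_size is only consulted when
+ * adaptive pooling is requested.
+ * @code{.cpp}
+ * int64_t k[2] = {3, 3}, s[2] = {2, 2}, p[2] = {1, 1}, d[2] = {1, 1};
+ * diopiSize_t ksize{k, 2}, stride{s, 2}, padding{p, 2}, dilation{d, 2}, output_size{nullptr, 0};
+ * // Plain (non-adaptive) 2D max pooling; `exclusive` only affects "avg" mode.
+ * diopiPool2d(ctx, out, input, "max", ksize, stride, padding, dilation,
+ *             /*ceil_mode=*/false, /*exclusive=*/true, /*adaptive=*/false, output_size);
+ * @endcode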
+ */ +DIOPI_API diopiError_t diopiPool2d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const char* mode, diopiSize_t ksize, + diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, const bool ceil_mode, const bool exclusive, + const bool adaptive, diopiSize_t output_size); + +/** + * @brief Performs the backward pass for diopiPool2d(). Computes gradients for input. + * @param[in] ctx Context environment. + * @param[out] grad_input the gradient tensor of input. + * @param[in] grad_output the gradient tensor of the output. + * @param[in] input the input tensor. + * @param[in] mode the pooling mode, such as "max" or "avg". + * @param[in] ksize the size of the pooling window. + * @param[in] stride the stride of the pooling window. + * @param[in] padding implicit padding added to the input. + * @param[in] dilation the spacing between elements in the pooling window. + * @param[in] ceil_mode whether to use ceil instead of floor for output shape calculation. + * @param[in] exclusive whether to exclude padding when averaging. + * @param[in] adaptive whether to use adaptive pooling. + * @param[in] indices the tensor storing the indices of max elements from the forward pass (if max pooling is used). + */ +DIOPI_API diopiError_t diopiPool2dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, + diopiConstTensorHandle_t input, const char* mode, diopiSize_t ksize, diopiSize_t stride, diopiSize_t padding, + diopiSize_t dilation, const bool ceil_mode, const bool exclusive, const bool adaptive, + diopiConstTensorHandle_t indices); /** * @brief Applies a 2D adaptive average pooling over an input signal composed of several input planes. @@ -878,6 +1083,68 @@ DIOPI_API diopiError_t diopiCosInp(diopiContextHandle_t ctx, diopiTensorHandle_t */ DIOPI_API diopiError_t diopiCos(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input); +/** + * @brief The in-place version of diopiAcos(). + * @param[in] ctx Context environment. + * @param[in] input the input and output tensor and will be stored result tensor, + * type = [float16, float32, float64, int16, int32, int64, uint8, int8]. + */ +DIOPI_API diopiError_t diopiAcosInp(diopiContextHandle_t ctx, diopiTensorHandle_t input); + +/** + * @brief Compute the element-wise arccosine values of the input tensor input. + * @param[in] ctx Context environment. + * @param[in] input Input tensor, type = [float16, float32, float64, int16, int32, int64, uint8, int8]. + * @param[out] out the output tensor. type = [float16, float32, float64]. + */ +DIOPI_API diopiError_t diopiAcos(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input); + +/** + * @brief The in-place version of diopiTan(). + * @param[in] ctx Context environment. + * @param[in] input the input and output tensor and will be stored result tensor, + * type = [float16, float32, float64, int16, int32, int64, uint8, int8]. + */ +DIOPI_API diopiError_t diopiTanInp(diopiContextHandle_t ctx, diopiTensorHandle_t input); + +/** + * @brief Compute the element-wise tangent values of the input tensor input. + * @param[in] ctx Context environment. + * @param[in] input Input tensor, type = [float16, float32, float64, int16, int32, int64, uint8, int8]. + * @param[out] out the output tensor. type = [float16, float32, float64]. 
+ */
+DIOPI_API diopiError_t diopiTan(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input);
+
+/**
+ * @brief The in-place version of diopiSinh().
+ * @param[in] ctx Context environment.
+ * @param[in] input the input tensor. type = [float16, float32, float64].
+ */
+DIOPI_API diopiError_t diopiSinhInp(diopiContextHandle_t ctx, diopiTensorHandle_t input);
+
+/**
+ * @brief Returns a new tensor with the hyperbolic sine of the elements of input.
+ * @param[in] ctx Context environment.
+ * @param[in] input the input tensor. type = [float16, float32, float64].
+ * @param[out] out the output tensor. type = [float16, float32, float64].
+ */
+DIOPI_API diopiError_t diopiSinh(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input);
+
+/**
+ * @brief The in-place version of diopiCosh().
+ * @param[in] ctx Context environment.
+ * @param[in] input the input tensor. type = [float16, float32, float64].
+ */
+DIOPI_API diopiError_t diopiCoshInp(diopiContextHandle_t ctx, diopiTensorHandle_t input);
+
+/**
+ * @brief Returns a new tensor with the hyperbolic cosine of the elements of input.
+ * @param[in] ctx Context environment.
+ * @param[in] input the input tensor. type = [float16, float32, float64].
+ * @param[out] out the output tensor. type = [float16, float32, float64].
+ */
+DIOPI_API diopiError_t diopiCosh(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input);
+
 /**
  * @brief The in-place version of diopiTanh().
  * @param[in] ctx Context environment.
@@ -915,6 +1182,51 @@ DIOPI_API diopiError_t diopiAtan(diopiContextHandle_t ctx, diopiTensorHandle_t o
  */
 DIOPI_API diopiError_t diopiAtanInp(diopiContextHandle_t ctx, diopiTensorHandle_t input);
 
+/**
+ * @brief The in-place version of diopiAsinh().
+ * @param[in] ctx Context environment.
+ * @param[inout] input the input tensor; the result is stored in it. type = [float16, float32, float64].
+ */
+DIOPI_API diopiError_t diopiAsinhInp(diopiContextHandle_t ctx, diopiTensorHandle_t input);
+
+/**
+ * @brief Returns a new tensor with the arc hyperbolic sine of the elements of input.
+ * @param[in] ctx Context environment.
+ * @param[in] input the input tensor. type = [float16, float32, float64].
+ * @param[out] out the output tensor. type = [float16, float32, float64].
+ */
+DIOPI_API diopiError_t diopiAsinh(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input);
+
+/**
+ * @brief The in-place version of diopiAcosh().
+ * @param[in] ctx Context environment.
+ * @param[inout] input the input tensor; the result is stored in it. type = [float16, float32, float64].
+ */
+DIOPI_API diopiError_t diopiAcoshInp(diopiContextHandle_t ctx, diopiTensorHandle_t input);
+
+/**
+ * @brief Returns a new tensor with the arc hyperbolic cosine of the elements of input.
+ * @param[in] ctx Context environment.
+ * @param[in] input the input tensor. type = [float16, float32, float64].
+ * @param[out] out the output tensor. type = [float16, float32, float64].
+ */
+DIOPI_API diopiError_t diopiAcosh(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input);
+
+/**
+ * @brief The in-place version of diopiAtanh().
+ * @param[in] ctx Context environment.
+ * @param[inout] input the input tensor; the result is stored in it. type = [float16, float32, float64].
+ */
+DIOPI_API diopiError_t diopiAtanhInp(diopiContextHandle_t ctx, diopiTensorHandle_t input);
+
+/**
+ * @brief Returns a new tensor with the arc hyperbolic tangent of the elements of input.
+ * @param[in] ctx Context environment.
+ * @param[in] input the input tensor. type = [float16, float32, float64].
+ * @param[out] out the output tensor. type = [float16, float32, float64].
+ */
+DIOPI_API diopiError_t diopiAtanh(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input);
+
 /**
  * @brief The in-place version of diopiSigmoid().
  * @param[in] ctx Context environment.
@@ -978,6 +1290,22 @@ DIOPI_API diopiError_t diopiExpInp(diopiContextHandle_t ctx, diopiTensorHandle_t
  */
 DIOPI_API diopiError_t diopiExp(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input);
 
+/**
+ * @brief The in-place version of diopiExpm1().
+ * @param[in] input the input tensor; the result is stored in it. type = [float16, float32, float64].
+ */
+DIOPI_API diopiError_t diopiExpm1Inp(diopiContextHandle_t ctx, diopiTensorHandle_t input);
+
+/**
+ * @brief Returns a new tensor with the exponential of the elements of input, minus 1.
+ * @param[in] ctx Context environment.
+ * @param[in] input the input tensor. type = [float16, float32, float64, int16, int32,
+ * int64, uint8, int8, bool].
+ * @param[out] out the output tensor. type = [float16, float32, float64].
+ */
+DIOPI_API diopiError_t diopiExpm1(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input);
+
 /**
  * @brief The in-place version of diopiLog().
  * @param[in] ctx Context environment.
@@ -2363,6 +2691,20 @@ DIOPI_API diopiError_t diopiStack(diopiContextHandle_t ctx, diopiTensorHandle_t
 DIOPI_API diopiError_t diopiSort(diopiContextHandle_t ctx, diopiTensorHandle_t values, diopiTensorHandle_t indices, diopiConstTensorHandle_t input, int64_t dim,
                                  bool descending, const bool* pStable);
 
+/**
+ * @brief Computes the gradient of the loss with respect to the input of diopiSort(), given the gradient with respect to the sorted output.
+ * @param[in] ctx Context environment.
+ * @param[in] grad_output Tensor containing the gradient of the loss with respect to the sorted output.
+ * @param[in] dim The dimension along which the sorting was performed; used to align the gradients with the corresponding dimension.
+ * @param[in] indices Tensor containing the indices that were used to sort the input tensor.
+ * @param[in] sizes The size of the input tensor, needed to restore the shape during gradient computation.
+ * @param[in] keepdim Boolean flag indicating whether the reduced dimensions were retained. If `true`, the reduced dimensions are kept with size one, which
+ * affects how gradients are accumulated.
+ * @param[out] grad_input Tensor to store the gradient with respect to the input tensor. This tensor will be updated with the computed gradient.
+ */
+DIOPI_API diopiError_t diopiSortBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, int64_t dim,
+                                         diopiConstTensorHandle_t indices, diopiSize_t sizes, bool keepdim);
+
 /**
  * @brief Returns the k largest elements of the given input tensor along a given dimension.
  * @param[in] ctx Context environment.
@@ -2551,6 +2893,44 @@ DIOPI_API diopiError_t diopiConvTranspose2dBackward(diopiContextHandle_t ctx, di diopiConstTensorHandle_t weight, diopiSize_t* bias_sizes, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, diopiSize_t output_padding, int64_t groups); +/** + * @brief Applies a 3D transposed convolution operator over an input image composed of several input planes, sometimes also called “deconvolution”. + * @param[in] ctx Context environment. + * @param[in] input the input tensor. type = [float32, float16, float64]. + * @param[in] weight the weight tensor; dimension of kernel_size must match the number of input spatial dimensions. + * type = [float32, float16, float64]. + * @param[in] bias bias tensor. type = [float32, float16, float64]. + * @param[in] stride an array with dimension matching the number of input spatial dimensions. type = [int32, int64]. + * @param[in] padding an array with dimension matching the number of input spatial dimensions. type = [int32, int64]. + * @param[in] output_padding an array, dimension == number of input spatial dimensions; only supported when transposed is true. type = [int32, int64]. + * @param[in] dilation an array with dimension matching the number of input spatial dimensions. type = [int32, int64]. + * @param[in] groups number of groups for grouped convolution. type = [int64]. + * @param[out] out the result tensor. type = [float32, float16, float64]. + */ +DIOPI_API diopiError_t diopiConvTranspose3d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, + diopiConstTensorHandle_t bias, diopiSize_t stride, diopiSize_t padding, diopiSize_t output_padding, int64_t groups, + diopiSize_t dilation); + +/** + * @brief Backward pass for ConvTranspose3dBackward. Computes gradients for input, weight, and bias. + * @param[in] ctx Context environment. + * @param[in] grad_output the grad tensor of output. type = [float32, float16, float64]. + * @param[in] bias_sizes an array, indicates that a bias was used in the forward pass and contains the shape of the bias. type = [int32, int64]. + * @param[in] input the input tensor. type = [float32, float16, float64]. + * @param[in] weight the weight tensor; dimension of kernel_size must match the number of input spatial dimensions. + * @param[in] stride an array with dimension matching the number of input spatial dimensions. type = [int32, int64]. + * @param[in] padding an array with dimension matching the number of input spatial dimensions. type = [int32, int64]. + * @param[in] output_padding an array, dimension == number of input spatial dimensions; only supported when transposed is true. type = [int32, int64]. + * @param[in] dilation an array with dimension matching the number of input spatial dimensions. type = [int32, int64]. + * @param[in] groups number of groups for grouped convolution. type = [int64]. + * @param[out] grad_input the grad of input. type = [float32, float16, float64]. + * @param[out] grad_weight the grad of weight. type = [float32, float16, float64]. + * @param[out] grad_bias the grad of bias. type = [float32, float16, float64]. 
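+ *
+ * Illustrative call sketch (not normative): all handles are created by the caller; grad_bias and
+ * bias_sizes are assumed to be nullable when the forward pass used no bias.
+ * @code{.cpp}
+ * diopiConvTranspose3dBackward(ctx, grad_input, grad_weight, /*grad_bias=*/nullptr,
+ *                              grad_output, input, weight, /*bias_sizes=*/nullptr,
+ *                              stride, padding, dilation, output_padding, groups);
+ * @endcode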
+ */ +DIOPI_API diopiError_t diopiConvTranspose3dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiTensorHandle_t grad_weight, + diopiTensorHandle_t grad_bias, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, + diopiConstTensorHandle_t weight, diopiSize_t* bias_sizes, diopiSize_t stride, diopiSize_t padding, + diopiSize_t dilation, diopiSize_t output_padding, int64_t groups); /** * @brief Extracts sliding local blocks from a batched input tensor. * @param[in] ctx Context environment. @@ -2583,6 +2963,15 @@ DIOPI_API diopiError_t diopiUnfoldBackward(diopiContextHandle_t ctx, diopiTensor */ DIOPI_API diopiError_t diopiCumsum(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, int64_t dim); +/** + * @brief Computes the backward pass for diopiCumsum() + * @param[in] ctx Context environment. + * @param[in] grad_output the grad tensor of output, with the same shape as the forward pass output. type=[float16, float32, float64]. + * @param[in] dim the dimension to do the operation over. type = [int64]. + * @param[out] grad_input the grad tensor of input, with the same shape as the forward pass input. type=[float16, float32, float64]. + */ +DIOPI_API diopiError_t diopiCumsumBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, int64_t dim); + /** * @brief Computes batched the p-norm distance between each pair of the two collections of row vectors. * @param[in] ctx Context environment. @@ -2616,6 +3005,28 @@ DIOPI_API diopiError_t diopiCdistBackward(diopiContextHandle_t ctx, diopiTensorH */ DIOPI_API diopiError_t diopiArgmax(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const int64_t* dim, bool keepdim); +/** + * @brief Returns the indices of the minimum values of a tensor across a dimension. + * @param[in] ctx Context environment. + * @param[in] input the input tensor. type=[float32, float64, float16, int16, int32, int64, uint8, int8, bool]. + * @param[in] dim the dimension to do the operation over. type=[int32, int64]. + * @param[in] keepdim whether the output tensor has dim retained or not. + * @param[out] out the output tensor. type=[int32, int64]. + */ +DIOPI_API diopiError_t diopiArgmin(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const int64_t* dim, bool keepdim); + +/** + * @brief Returns the indices that sort a tensor along a given dimension in ascending order by value. + * @param[in] ctx Context environment. + * @param[in] input the input tensor. type=[float32, float64, float16, int16, int32, int64, uint8, int8, bool]. + * @param[in] dim the dimension to do the operation over. type=[int32, int64]. + * @param[in] descending controls the sorting order (ascending or descending). + * @param[in] stable controls the relative order of equivalent elements. + * @param[out] out the output tensor. type=[int32, int64]. + */ +DIOPI_API diopiError_t diopiArgsort(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, bool stable, const int64_t* dim, + bool descending); + /** * @brief The function is used to implement the Adadelta optimizer. Its functionality is to perform a single parameter update. * @param[in] ctx Context environment. 
@@ -2730,6 +3141,37 @@ DIOPI_API diopiError_t diopiConvolution3dBackward(diopiContextHandle_t ctx, diop diopiTensorHandle_t grad_bias, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiSize_t* bias_sizes, diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, int64_t groups); +/** + * @brief Performs 3D average pooling operation. + * @param[in] ctx Context environment. + * @param[out] out the output tensor after average pooling. + * @param[in] input the input tensor. + * @param[in] kernel_size the size of the pooling window. + * @param[in] stride the stride of the pooling window. + * @param[in] padding implicit padding added to the input. + * @param[in] ceil_mode whether to use ceil instead of floor for output shape calculation. + * @param[in] count_include_pad whether to include the zero-padding in the averaging calculation. + * @param[in] divisor_override if provided, it will be used as the divisor for averaging. + */ +DIOPI_API diopiError_t diopiAvgPool3d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiSize_t kernel_size, + diopiSize_t stride, diopiSize_t padding, bool ceil_mode, bool count_include_pad, const int64_t* divisor_override); + +/** + * @brief Performs the backward pass for diopiAvgPool3d(). Computes gradients for input. + * @param[in] ctx Context environment. + * @param[out] grad_input the gradient tensor of input. + * @param[in] grad_output the gradient tensor of the output. + * @param[in] input the input tensor. + * @param[in] kernel_size the size of the pooling window. + * @param[in] stride the stride of the pooling window. + * @param[in] padding implicit padding added to the input. + * @param[in] ceil_mode whether to use ceil instead of floor for output shape calculation. + * @param[in] count_include_pad whether to include the zero-padding in the averaging calculation. + * @param[in] divisor_override if provided, it will be used as the divisor for averaging. + */ +DIOPI_API diopiError_t diopiAvgPool3dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, + diopiConstTensorHandle_t input, diopiSize_t kernel_size, diopiSize_t stride, diopiSize_t padding, bool ceil_mode, + bool count_include_pad, const int64_t* divisor_override); /** * \brief Applies a 3D max pooling over an input signal composed of several input planes. @@ -2827,6 +3269,46 @@ DIOPI_API diopiError_t diopiAdaptiveMaxPool3dWithIndices(diopiContextHandle_t ct DIOPI_API diopiError_t diopiAdaptiveMaxPool3dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, diopiConstTensorHandle_t indices); +/** + * @brief General 3D pooling operation with support for multiple pooling modes. + * @param[in] ctx Context environment. + * @param[out] out the output tensor after pooling. + * @param[in] input the input tensor. + * @param[in] mode the pooling mode, such as "max" or "avg". + * @param[in] ksize the size of the pooling window. + * @param[in] stride the stride of the pooling window. + * @param[in] padding implicit padding added to the input. + * @param[in] dilation the spacing between elements in the pooling window. + * @param[in] ceil_mode whether to use ceil instead of floor for output shape calculation. + * @param[in] exclusive whether to exclude padding when averaging. + * @param[in] adaptive whether to use adaptive pooling. 
+ * @param[in] output_size the size of the output after pooling (if adaptive pooling is used). + */ +DIOPI_API diopiError_t diopiPool3d(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const char* mode, diopiSize_t ksize, + diopiSize_t stride, diopiSize_t padding, diopiSize_t dilation, const bool ceil_mode, const bool exclusive, + const bool adaptive, diopiSize_t output_size); + +/** + * @brief Performs the backward pass for diopiPool3d(). Computes gradients for input. + * @param[in] ctx Context environment. + * @param[out] grad_input the gradient tensor of input. + * @param[in] grad_output the gradient tensor of the output. + * @param[in] input the input tensor. + * @param[in] mode the pooling mode, such as "max" or "avg". + * @param[in] ksize the size of the pooling window. + * @param[in] stride the stride of the pooling window. + * @param[in] padding implicit padding added to the input. + * @param[in] dilation the spacing between elements in the pooling window. + * @param[in] ceil_mode whether to use ceil instead of floor for output shape calculation. + * @param[in] exclusive whether to exclude padding when averaging. + * @param[in] adaptive whether to use adaptive pooling. + * @param[in] indices the tensor storing the indices of max elements from the forward pass (if max pooling is used). + */ +DIOPI_API diopiError_t diopiPool3dBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, + diopiConstTensorHandle_t input, const char* mode, diopiSize_t ksize, diopiSize_t stride, diopiSize_t padding, + diopiSize_t dilation, const bool ceil_mode, const bool exclusive, const bool adaptive, + diopiConstTensorHandle_t indices); + /** * \brief Returns a new 1-D tensor which indexes the input tensor according to the boolean mask. * @param[in] ctx Context environment. @@ -2991,6 +3473,37 @@ DIOPI_API diopiError_t diopiFlip(diopiContextHandle_t ctx, diopiTensorHandle_t o */ DIOPI_API diopiError_t diopiNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, const diopiScalar_t* p, diopiSize_t dim); +/** + * @brief Compute the backward pass for diopiNorm(). Computes gradients for input. + * @param[in] ctx Context environment. + * @param[in] grad_output the grad tensor of output. type=[float32, float64, float16]. + * @param[in] self input tensor. type=[float32, float64, float16]. + * @param[in] norm norm tensor. type=[float32, float64, float16]. + * @param[in] dim Specifies which dimension or dimensions of input to calculate the norm across. + * @param[in] p an array, the order of norm. + * @param[out] grad_input the grad of input. type=[float32, float64, float16]. + */ +DIOPI_API diopiError_t diopiNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, + diopiConstTensorHandle_t self, diopiConstTensorHandle_t norm, diopiSize_t dim, const diopiScalar_t* p); + +/** + * @brief Applies Layer Normalization over a mini-batch of inputs. + * Note that, this is the national standard GB operator version, which is different from the diopiLayerNorm interface definition, normalized_shape has changed + * to begin_norm_axis. type=[float32, float64, float16]. + * @param[in] ctx Context environment. + * @param[in] save_mean Mean tensor,the mean value for each feature channel of the input tensor. type=[float32, float64, float16]. + * @param[in] save_invstd Backup of inverse standard deviation computed during training. type=[float32, float64, float16]. 
+ * @param[in] input input tensor. type=[float32, float64, float16]. + * @param[in] weight weight tensor. type=[float32, float64, float16]. + * @param[in] bias bias tensor. type=[float32, float64, float16]. + * @param[in] begin_norm_axis int64, Indicates which dimension to start normalization. + * @param[in] eps float64 a value added to the denominator for numerical stability. + * @param[out] out normalized result. type=[float32, float64, float16]. + */ +DIOPI_API diopiError_t diopiLayerNormGB(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t running_mean, diopiTensorHandle_t running_var, + diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, const double eps, + const int64_t begin_norm_axis); + /** * @brief Returns the matrix norm or vector norm of a given tensor list. * @param[in] ctx Context environment. @@ -3361,6 +3874,17 @@ DIOPI_API diopiError_t diopiNormalInp(diopiContextHandle_t ctx, diopiTensorHandl */ DIOPI_API diopiError_t diopiMeshGrid(diopiContextHandle_t ctx, diopiTensorHandle_t* outs, diopiConstTensorHandle_t* inputs, int64_t inputsNum); +/** + * @brief Compute grid sample. + * @param[in] ctx Context environment. + * @param[in] input the original tensor to be sampled. + * @param[in] grid the pixel locations of sampling. + * @param[in] mode the sampling mode. [bilinear, nearest]. + * @param[out] out the result sampling tensor. + */ +DIOPI_API diopiError_t diopiGridSample(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t grid, + const char* mode); + /** * @brief Returns a tensor where each row contains num_samples indices sampled from the * multinomial probability distribution located in the corresponding row of tensor input. @@ -3408,6 +3932,79 @@ DIOPI_API diopiError_t diopiLayerNormBackward(diopiContextHandle_t ctx, diopiTen diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, diopiConstTensorHandle_t mean, diopiConstTensorHandle_t rstd, diopiSize_t normalized_shape); +/** + * @brief Compute the backward pass for diopiLayerNormGB(). Computes gradients for input, weight, and bias. + * Note that, this is the national standard GB operator version, which is different from the diopiLayerNormBackward interface definition, normalized_shape has + * changed to begin_norm_axis + * @param[in] ctx Context environment. + * @param[in] grad_output the grad tensor of output. type=[float32, float64, float16]. + * @param[in] grad_bias the grad of bias. type=[float32, float64, float16]. + * @param[in] grad_weight the grad of weight. type=[float32, float64, float16]. + * @param[in] mean Mean tensor,the mean value for each feature channel of the input tensor. type=[float32, float64, float16]. + * @param[in] rstd Backup of inverse standard deviation computed during training. type=[float32, float64, float16]. + * @param[in] input input tensor. type=[float32, float64, float16]. + * @param[in] weight weight tensor. type=[float32, float64, float16]. + * @param[in] bias bias tensor. type=[float32, float64, float16]. + * @param[in] begin_norm_axis int64, Indicates which dimension to start normalization. + * @param[out] grad_input the grad of input. type=[float32, float64, float16]. 
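+ *
+ * Relation to diopiLayerNormBackward() (illustrative sketch; input_shape stands for the input's
+ * dimension vector): begin_norm_axis selects the trailing dimensions that play the role of
+ * normalized_shape in the standard interface.
+ * @code{.cpp}
+ * std::vector<int64_t> normalized_shape(input_shape.begin() + begin_norm_axis, input_shape.end());
+ * @endcode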
+ */ +DIOPI_API diopiError_t diopiLayerNormGBBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiTensorHandle_t grad_weight, + diopiTensorHandle_t grad_bias, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, + diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, diopiConstTensorHandle_t running_mean, + diopiConstTensorHandle_t running_std, const int64_t begin_norm_axis); + +/** + * @brief Performs instance normalization on the input tensor. + * @param[in] ctx Context environment. + * @param[out] output the output tensor after instance normalization. + * @param[in] input the input tensor to be normalized. + * @param[in] axis the axis along which normalization is applied. + * @param[in] scale the scale tensor. + * @param[in] bias the bias tensor. + * @param[in] eps small value to avoid division by zero during normalization. + */ +DIOPI_API diopiError_t diopiInstanceNorm(diopiContextHandle_t ctx, diopiTensorHandle_t output, diopiConstTensorHandle_t input, const int64_t axis, + diopiConstTensorHandle_t scale, diopiConstTensorHandle_t bias, const double eps); +/** + * @brief Performs the backward pass for diopiInstanceNorm(). Computes gradients for input, scale, and bias. + * @param[in] ctx Context environment. + * @param[out] grad_input the gradient tensor of input. + * @param[out] grad_scale the gradient tensor of scale. + * @param[out] grad_bias the gradient tensor of bias. + * @param[in] grad_output the gradient tensor of the output. + * @param[in] input the input tensor. + * @param[in] scale the scale tensor. + * @param[in] bias the bias tensor. + * @param[in] axis the axis along which normalization is applied. + * @param[in] eps small value to avoid division by zero during normalization. + */ +DIOPI_API diopiError_t diopiInstanceNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiTensorHandle_t grad_scale, + diopiTensorHandle_t grad_bias, diopiConstTensorHandle_t grad_output, diopiConstTensorHandle_t input, + diopiConstTensorHandle_t scale, diopiConstTensorHandle_t bias, const int64_t axis, const double eps); +/** + * @brief Normalizes the input tensor based on the p-norm along the given axis. + * @param[in] ctx Context environment. + * @param[out] output the normalized output tensor. + * @param[in] input the input tensor to be normalized. + * @param[in] p the p-norm to use for normalization. + * @param[in] axis the axis along which to normalize. + * @param[in] eps small value to avoid division by zero during normalization. + */ +DIOPI_API diopiError_t diopiNormalize(diopiContextHandle_t ctx, diopiTensorHandle_t output, diopiConstTensorHandle_t input, const float p, const int64_t axis, + const double eps); +/** + * @brief Performs the backward pass for diopiNormalize(). Computes gradients for input. + * @param[in] ctx Context environment. + * @param[out] grad_input the gradient tensor of input. + * @param[in] grad_output the gradient tensor of the output. + * @param[in] input the input tensor. + * @param[in] p the p-norm used during normalization. + * @param[in] axis the axis along which normalization was applied. + * @param[in] eps small value to avoid division by zero during normalization. + */ +DIOPI_API diopiError_t diopiNormalizeBackward(diopiContextHandle_t ctx, diopiTensorHandle_t grad_input, diopiConstTensorHandle_t grad_output, + diopiConstTensorHandle_t input, const float p, const int64_t axis, const double eps); + /** * @brief Copies the elements from src into dest tensor. 
 * @param[in] ctx Context environment.
@@ -3548,6 +4145,38 @@ DIOPI_API diopiError_t diopiTriu(diopiContextHandle_t ctx, diopiTensorHandle_t o
 */
 DIOPI_API diopiError_t diopiTriuInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, int64_t diagonal);
 
+/**
+ * @brief Creates a complex tensor from its real and imaginary parts.
+ * @param[in] ctx Context environment.
+ * @param[in] real the real part of the tensor.
+ * @param[in] imag the imaginary part of the tensor.
+ * @param[out] out the constructed complex tensor.
+ */
+DIOPI_API diopiError_t diopiComplex(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t real, diopiConstTensorHandle_t imag);
+
+/**
+ * @brief Returns the complex conjugate of the input tensor.
+ * @param[in] ctx Context environment.
+ * @param[in] input the input tensor.
+ * @param[out] out the complex conjugate of the input tensor.
+ */
+DIOPI_API diopiError_t diopiConj(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input);
+
+/**
+ * @brief Returns the imaginary part of the input tensor.
+ * @param[in] ctx Context environment.
+ * @param[in] input the input tensor.
+ * @param[out] out the imaginary part of the input tensor.
+ */
+DIOPI_API diopiError_t diopiImag(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input);
+
+/**
+ * @brief Returns the real part of the input tensor.
+ * @param[in] ctx Context environment.
+ * @param[in] input the input tensor.
+ * @param[out] out the real part of the input tensor.
+ */
+DIOPI_API diopiError_t diopiReal(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input);
+
 /**
  * @brief Create a tensor filled with one.
  * @param[in] ctx Context environment.