
Commit 8f578a1

Author: AMD
Fix merge conflicts
1 parent 35615a5

15 files changed: 6 additions, 1,517 deletions


.ci/docker/build.sh

Lines changed: 1 addition & 13 deletions
@@ -288,7 +288,7 @@ case "$tag" in
     ;;
   *)
     # Catch-all for builds that are not hardcoded.
-    PROTOBUF=yes
+    PROTOBUF=yes
     VISION=yes
     echo "image '$image' did not match an existing build configuration"
     if [[ "$image" == *py* ]]; then
@@ -460,15 +460,3 @@ elif [ "$HAS_TRITON" = "yes" ]; then
   echo "expecting triton to not be installed, but it is"
   exit 0
 fi
-<<<<<<< HEAD
-
-# Sanity check cmake version. Executorch reinstalls cmake and I'm not sure if
-# they support 4.0.0 yet, so exclude them from this check.
-CMAKE_VERSION=$(drun cmake --version)
-if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then
-  echo "CMake version is not 4.0.0:"
-  drun cmake --version
-  exit 0
-fi
-=======
->>>>>>> upstream/main
Lines changed: 0 additions & 4 deletions
@@ -1,5 +1 @@
-<<<<<<< HEAD
 d704bc6e69c1a588c8edd3cbb67505d554ed65f6
-=======
-7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd
->>>>>>> upstream/main

.ci/docker/libtorch/build.sh

Lines changed: 0 additions & 4 deletions
@@ -50,15 +50,11 @@ case ${DOCKER_TAG_PREFIX} in
     BASE_TARGET=rocm
     GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
     PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-<<<<<<< HEAD
-    DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
-=======
     # add gfx950, gfx115x conditionally starting in ROCm 7.0
     if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
       PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
     fi
     DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
->>>>>>> upstream/main
     ;;
   *)
     echo "ERROR: Unrecognized DOCKER_TAG_PREFIX: ${DOCKER_TAG_PREFIX}"

CMakeLists.txt

Lines changed: 0 additions & 5 deletions
@@ -903,13 +903,8 @@ cmake_dependent_option(
   USE_FBGEMM_GENAI
   "Whether to build FBGEMM GenAI quantized GEMM kernels.\
   Will be disabled if not supported by the platform"
-<<<<<<< HEAD
-  OFF
-  "USE_CUDA OR USE_ROCM"
-=======
   ${USE_FBGEMM_GENAI_DEFAULT}
   "(USE_CUDA AND NOT MSVC) OR USE_ROCM"
->>>>>>> upstream/main
   OFF)

aten/src/ATen/native/sparse/cuda/SparseMatMul.cu

Lines changed: 0 additions & 22 deletions
@@ -40,28 +40,6 @@
 #include <thrust/iterator/discard_iterator.h>


-<<<<<<< HEAD
-#if defined(__CUDACC__) && ((CUSPARSE_VERSION >= 11000) || (defined(USE_ROCM) && ROCM_VERSION >= 60300))
-#define IS_CUSPARSE11_AVAILABLE() 1
-#else
-#define IS_CUSPARSE11_AVAILABLE() 0
-#endif
-
-#if defined(USE_ROCM) && (ROCM_VERSION >= 70000)
-#define HIPSPARSE_FP16_SUPPORT 1
-#else
-#define HIPSPARSE_FP16_SUPPORT 0
-#endif
-
-#if defined(USE_ROCM) && (ROCM_VERSION >= 70100)
-#define HIPSPARSE_FP16_BF16_SUPPORT 1
-#else
-#define HIPSPARSE_FP16_BF16_SUPPORT 0
-#endif
-
-#if IS_CUSPARSE11_AVAILABLE()
-=======
->>>>>>> upstream/main
 #include <library_types.h>

 namespace at::native {

requirements-build.txt

Lines changed: 0 additions & 12 deletions
@@ -1,5 +1,4 @@
 # Build System requirements
-<<<<<<< HEAD
 setuptools>=70.1.0,<80.0 # setuptools develop deprecated on 80.0
 cmake>=3.31.4
 ninja==1.11.1.3
@@ -10,15 +9,4 @@ pyyaml==6.0.2
 requests==2.32.4
 six==1.17.0 # dependency chain: NNPACK -> PeachPy -> six
 typing-extensions==4.14.1
-=======
-setuptools>=70.1.0
-cmake>=3.27
-ninja
-numpy
-packaging
-pyyaml
-requests
-six # dependency chain: NNPACK -> PeachPy -> six
-typing-extensions>=4.10.0
->>>>>>> upstream/main
 pip # not technically needed, but this makes setup.py invocation work

test/dynamo/test_structured_trace.py

Lines changed: 0 additions & 4 deletions
@@ -21,11 +21,7 @@
 from torch._inductor.test_case import TestCase
 from torch._logging._internal import TorchLogsFormatter
 from torch.nn.parallel import DistributedDataParallel as DDP
-<<<<<<< HEAD
-from torch.testing._internal.common_utils import find_free_port, skipIfRocm
-=======
 from torch.testing._internal.common_utils import find_free_port, xfailIfS390X
->>>>>>> upstream/main
 from torch.testing._internal.triton_utils import requires_cuda_and_triton


test/inductor/test_cuda_repro.py

Lines changed: 0 additions & 4 deletions
@@ -39,11 +39,7 @@
     DeterministicGuard,
     freeze_rng_state,
     IS_FBCODE,
-<<<<<<< HEAD
-    skipIfRocm,
-=======
     MI350_ARCH,
->>>>>>> upstream/main
     skipIfRocmArch,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,

test/inductor/test_decompose_mem_bound_mm.py

Lines changed: 0 additions & 40 deletions
@@ -12,13 +12,7 @@
 from torch.testing import FileCheck
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
-<<<<<<< HEAD
-    patch_test_members,
-    NAVI3_ARCH,
-    is_arch,
-=======
     is_navi3_arch,
->>>>>>> upstream/main
     parametrize,
     patch_test_members,
     TEST_XPU,
@@ -79,11 +73,7 @@ def forward(
 )
 @instantiate_parametrized_tests
 class TestDecomposeMemMM(TestCase):
-<<<<<<< HEAD
-    def __init__(self, method_name='runTest', methodName='runTest'):
-=======
     def __init__(self, method_name="runTest", methodName="runTest"):
->>>>>>> upstream/main
         super().__init__(method_name, methodName)
         self.atol = 1e-3
         self.rtol = 1e-3
@@ -92,13 +82,9 @@ def setup_tolerance(self, rtol=None, atol=None):
         if rtol is None:
             rtol = self.rtol
         if atol is None:
-<<<<<<< HEAD
-            atol = self.rtol
-=======
             atol = self.atol
         self.rtol = rtol
         self.atol = atol
->>>>>>> upstream/main

     def compare_dict_tensors(self, ref_dict, res_dict, rtol=None, atol=None):
         self.setup_tolerance(rtol, atol)
@@ -107,13 +93,9 @@ def compare_dict_tensors(self, ref_dict, res_dict, rtol=None, atol=None):
         for key1 in ref_dict.keys():
             key2 = "_orig_mod." + key1
             assert key2 in res_dict, f"{key1} does not exist in traced module"
-<<<<<<< HEAD
-            if not torch.allclose(ref_dict[key1], res_dict[key2], rtol=self.rtol, atol=self.atol):
-=======
             if not torch.allclose(
                 ref_dict[key1], res_dict[key2], rtol=self.rtol, atol=self.atol
             ):
->>>>>>> upstream/main
                 return False
         return True

@@ -127,28 +109,20 @@ def compare_parameters(self, module, traced, rtol=None, atol=None):
         self.setup_tolerance(rtol, atol)
         ref_params = dict(module.named_parameters())
         res_params = dict(traced.named_parameters())
-<<<<<<< HEAD
-        self.assertTrue(self.compare_dict_tensors(ref_params, res_params, rtol=self.rtol, atol=self.atol))
-=======
         self.assertTrue(
             self.compare_dict_tensors(
                 ref_params, res_params, rtol=self.rtol, atol=self.atol
             )
         )
->>>>>>> upstream/main

     def compare_gradients(self, module, traced, rtol=None, atol=None):
         self.setup_tolerance(rtol, atol)
         ref_grad = {key: param.grad for key, param in module.named_parameters()}
         res_grad = {key: param.grad for key, param in traced.named_parameters()}
         self.assertTrue(
-<<<<<<< HEAD
-            self.compare_dict_tensors(ref_grad, res_grad, rtol=self.rtol, atol=self.atol)
-=======
             self.compare_dict_tensors(
                 ref_grad, res_grad, rtol=self.rtol, atol=self.atol
            )
->>>>>>> upstream/main
         )

     @parametrize(
@@ -257,19 +231,12 @@ def test_decompose_linear(self, m, n, k, has_bias, should_decompose):

     # We have to increase tolerance for navi3 because all fp16, bf16
     # GEMMs operations have an accuracy issue caused by hardware limitation
-<<<<<<< HEAD
-    @patch_test_members({
-        "atol": 2e-3 if is_arch(NAVI3_ARCH) else 1e-3,
-        "rtol": 2e-3 if is_arch(NAVI3_ARCH) else 1e-3
-    })
-=======
     @patch_test_members(
         {
             "atol": 2e-3 if is_navi3_arch() else 1e-3,
             "rtol": 2e-3 if is_navi3_arch() else 1e-3,
         }
     )
->>>>>>> upstream/main
     @parametrize(
         "m,k,n, should_decompose",
         [(20480, 5, 2, True), (20480, 32, 2, False), (2048, 2, 2, False)],
@@ -380,19 +347,12 @@ def test_decompose_mm_cpu(self, m, n, k, should_decompose):

     # We have to increase tolerance for navi3 because all fp16, bf16
     # GEMMs operations have an accuracy issue caused by hardware limitation
-<<<<<<< HEAD
-    @patch_test_members({
-        "atol": 3e-3 if is_arch(NAVI3_ARCH) else 1e-3,
-        "rtol": 4e-3 if is_arch(NAVI3_ARCH) else 1e-3
-    })
-=======
     @patch_test_members(
         {
             "atol": 3e-3 if is_navi3_arch() else 1e-3,
             "rtol": 4e-3 if is_navi3_arch() else 1e-3,
         }
     )
->>>>>>> upstream/main
     @parametrize(
         "m,k,n, should_decompose",
         [(20480, 5, 2, True), (20480, 32, 2, False), (2048, 2, 2, False)],
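
For readers skimming the resolution above: the upstream-side helpers that were kept compare dicts of tensors with torch.allclose under per-test rtol/atol, loosening the tolerances on Navi3 hardware. Below is a minimal, self-contained Python sketch of that comparison pattern, for illustration only; the standalone function and the example values are hypothetical, not the test file's actual code.

# Minimal illustrative sketch (assumption: not the PyTorch test code itself) of the
# tolerance-aware dict-of-tensors comparison kept from upstream/main above.
import torch


def compare_dict_tensors(ref_dict, res_dict, rtol=1e-3, atol=1e-3):
    # torch.compile-traced modules prefix parameter names with "_orig_mod.",
    # so map each reference key onto its traced counterpart before comparing.
    for key, ref in ref_dict.items():
        traced_key = "_orig_mod." + key
        if traced_key not in res_dict:
            return False
        if not torch.allclose(ref, res_dict[traced_key], rtol=rtol, atol=atol):
            return False
    return True


if __name__ == "__main__":
    ref = {"weight": torch.randn(4, 4)}
    res = {"_orig_mod.weight": ref["weight"] + 5e-4}
    # Looser tolerances (e.g. 2e-3, as the diff uses on Navi3) absorb the small drift.
    print(compare_dict_tensors(ref, res, rtol=2e-3, atol=2e-3))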

test/inductor/test_max_autotune.py

Lines changed: 0 additions & 10 deletions
@@ -1302,12 +1302,7 @@ def test_conv_backend(self):

         self.assertIn("NoValidChoicesError", str(context.exception))

-<<<<<<< HEAD
-    # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
-    @skipIfRocmNotEnoughMemory(30)
-=======
     @skipIfRocmArch(NAVI_ARCH)
->>>>>>> upstream/main
     def test_non_contiguous_input_mm(self):
         """
         Make sure the triton template can work with non-contiguous inputs without crash.
@@ -1362,15 +1357,10 @@ def f(x, y):
     # TODO: fix accuracy failure of the triton template on XPU.
     # and enable this test case.
     @skipIfXpu
-<<<<<<< HEAD
-    # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
-    @skipIfRocmNotEnoughMemory(30)
-=======
     @unittest.skipIf(
         config.triton.native_matmul,
         "native matmul and Triton template both have accuracy fail (2.2%)",
     )
->>>>>>> upstream/main
     def test_non_contiguous_input_mm_plus_mm(self):
         x1 = rand_strided((50257, 2048), (1, 50304), device=GPU_TYPE)
         y1 = rand_strided((2048, 768), (768, 1), device=GPU_TYPE)
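
Both hunks above drop the HEAD-side @skipIfRocmNotEnoughMemory(30) guard, whose implementation is not part of this diff. Purely as a hypothetical sketch of how such a VRAM-threshold skip could look, here is a standalone decorator; the name, the GiB reading of the argument, and the use of torch.cuda.mem_get_info are assumptions, not the repository's actual helper.

# Hypothetical sketch only: the real skipIfRocmNotEnoughMemory helper is not shown
# in this commit. Assumption: its argument is a minimum amount of free VRAM in GiB.
import functools
import unittest

import torch


def skip_if_not_enough_gpu_memory(min_free_gib):
    """Skip a test unless the visible GPU reports at least min_free_gib GiB free."""

    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            if torch.cuda.is_available():
                free_bytes, _total_bytes = torch.cuda.mem_get_info()
                if free_bytes < min_free_gib * (1 << 30):
                    raise unittest.SkipTest(
                        f"requires >= {min_free_gib} GiB of free GPU memory"
                    )
            return fn(*args, **kwargs)

        return wrapper

    return decorator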
