Skip to content

Commit 65eb86b

Browse files
committed
fix CUDA architectures cmake logic
1 parent 9d77dcb commit 65eb86b

File tree

1 file changed

+24
-3
lines changed

1 file changed

+24
-3
lines changed

transformer_engine/common/CMakeLists.txt

Lines changed: 24 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,11 @@ if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
3636
endif()
3737
endif()
3838

39-
# Process CMAKE_CUDA_ARCHITECTURES to separate generic and specific architectures
39+
# Process CMAKE_CUDA_ARCHITECTURES to separate standard, generic, and specific architectures.
40+
# - NVTE_STANDARD_ARCHS: pre-Blackwell archs (e.g. 75, 80, 89, 90). Applied to all CUDA sources.
41+
# - NVTE_GENERIC_ARCHS: Blackwell family heads (e.g. 100, 120). Applied to non-arch-specific sources only.
42+
# - NVTE_SPECIFIC_ARCHS: Blackwell specific targets (e.g. 100a, 120f). Applied to arch-specific sources only.
43+
set(NVTE_STANDARD_ARCHS)
4044
set(NVTE_GENERIC_ARCHS)
4145
set(NVTE_SPECIFIC_ARCHS)
4246

@@ -79,6 +83,10 @@ if(NOT arch_120_index EQUAL -1)
7983
endif()
8084
endif()
8185

86+
# Copy the architectures still left in CMAKE_CUDA_ARCHITECTURES (the standard, pre-Blackwell ones) into NVTE_STANDARD_ARCHS.
87+
# These are applied to all CUDA sources (both generic and arch-specific).
88+
set(NVTE_STANDARD_ARCHS ${CMAKE_CUDA_ARCHITECTURES})
89+
8290
# cuDNN frontend API
8391
set(CUDNN_FRONTEND_INCLUDE_DIR
8492
"${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/cudnn-frontend/include")
@@ -192,9 +200,13 @@ list(APPEND transformer_engine_SOURCES ${transformer_engine_cuda_arch_specific_s
192200
${transformer_engine_cuda_sources}
193201
${transformer_engine_cpp_sources})
194202

195-
# Set compile options for CUDA sources with generic architectures
203+
# Set compile options for CUDA sources with generic architectures.
204+
# These get standard archs (pre-Blackwell) + generic Blackwell family heads.
196205
foreach(cuda_source IN LISTS transformer_engine_cuda_sources)
197206
set(arch_compile_options)
207+
foreach(arch IN LISTS NVTE_STANDARD_ARCHS)
208+
list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}")
209+
endforeach()
198210
foreach(arch IN LISTS NVTE_GENERIC_ARCHS)
199211
list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}")
200212
endforeach()
@@ -209,9 +221,14 @@ foreach(cuda_source IN LISTS transformer_engine_cuda_sources)
209221
endif()
210222
endforeach()
211223

212-
# Set compile options for CUDA sources with specific architectures
224+
# Set compile options for CUDA sources with arch-specific features.
225+
# These get standard archs (pre-Blackwell) + Blackwell specific targets (a/f suffix).
226+
# They must NOT get generic Blackwell archs, as they use family/arch-specific PTX features.
213227
foreach(cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources)
214228
set(arch_compile_options)
229+
foreach(arch IN LISTS NVTE_STANDARD_ARCHS)
230+
list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}")
231+
endforeach()
215232
foreach(arch IN LISTS NVTE_SPECIFIC_ARCHS)
216233
list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}")
217234
endforeach()
@@ -232,6 +249,10 @@ list(APPEND transformer_engine_SOURCES
232249
endif()
233250

234251
add_library(transformer_engine SHARED ${transformer_engine_SOURCES})
252+
# Disable CMake's automatic architecture flag injection.
253+
# All architectures are handled explicitly via per-source COMPILE_OPTIONS
254+
# using NVTE_STANDARD_ARCHS, NVTE_GENERIC_ARCHS, and NVTE_SPECIFIC_ARCHS above.
255+
set_target_properties(transformer_engine PROPERTIES CUDA_ARCHITECTURES OFF)
235256
target_include_directories(transformer_engine PUBLIC
236257
"${CMAKE_CURRENT_SOURCE_DIR}/include")
237258

0 commit comments

Comments
 (0)