@@ -36,7 +36,11 @@ if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
3636 endif ()
3737endif ()
3838
39- # Process CMAKE_CUDA_ARCHITECTURES to separate generic and specific architectures
39+ # Process CMAKE_CUDA_ARCHITECTURES to separate standard, generic, and specific architectures.
40+ # - NVTE_STANDARD_ARCHS: pre-Blackwell archs (e.g. 75, 80, 89, 90). Applied to all CUDA sources.
41+ # - NVTE_GENERIC_ARCHS: Blackwell family heads (e.g. 100, 120). Applied to non-arch-specific sources only.
42+ # - NVTE_SPECIFIC_ARCHS: Blackwell specific targets (e.g. 100a, 120f). Applied to arch-specific sources only.
# A value-less set() unsets the variable, clearing any earlier value before the
# architecture lists are built.
43+ set (NVTE_STANDARD_ARCHS)
4044set (NVTE_GENERIC_ARCHS)
4145set (NVTE_SPECIFIC_ARCHS)
4246
@@ -79,6 +83,10 @@ if(NOT arch_120_index EQUAL -1)
7983 endif ()
8084endif ()
8185
86+ # Copy the architectures still listed in CMAKE_CUDA_ARCHITECTURES (expected to be the standard,
87+ # pre-Blackwell ones) into NVTE_STANDARD_ARCHS; CMAKE_CUDA_ARCHITECTURES itself is left unchanged.
# NVTE_STANDARD_ARCHS is applied to all CUDA sources (both generic and arch-specific).
# NOTE(review): each entry is later interpolated verbatim into compute_<arch>/sm_<arch> flags, so
# values such as 80-real, native or all would yield invalid compiler options -- confirm they are
# rejected or normalized before this point.
88+ set (NVTE_STANDARD_ARCHS ${CMAKE_CUDA_ARCHITECTURES} )
89+
8290# cuDNN frontend API: include directory of its headers, addressed relative to CMAKE_CURRENT_SOURCE_DIR.
8391set (CUDNN_FRONTEND_INCLUDE_DIR
8492 "${CMAKE_CURRENT_SOURCE_DIR} /../../3rdparty/cudnn-frontend/include" )
@@ -192,9 +200,13 @@ list(APPEND transformer_engine_SOURCES ${transformer_engine_cuda_arch_specific_s
192200 ${transformer_engine_cuda_sources}
193201 ${transformer_engine_cpp_sources} )
194202
195- # Set compile options for CUDA sources with generic architectures
203+ # Set compile options for CUDA sources with generic architectures.
204+ # These get standard archs (pre-Blackwell) + generic Blackwell family heads.
196205foreach (cuda_source IN LISTS transformer_engine_cuda_sources)
197206 set (arch_compile_options)
207+ foreach (arch IN LISTS NVTE_STANDARD_ARCHS)
208+ list (APPEND arch_compile_options "--generate-code=arch=compute_${arch} ,code=sm_${arch} " )
209+ endforeach ()
198210 foreach (arch IN LISTS NVTE_GENERIC_ARCHS)
199211 list (APPEND arch_compile_options "--generate-code=arch=compute_${arch} ,code=sm_${arch} " )
200212 endforeach ()
@@ -209,9 +221,14 @@ foreach(cuda_source IN LISTS transformer_engine_cuda_sources)
209221 endif ()
210222endforeach ()
211223
212- # Set compile options for CUDA sources with specific architectures
224+ # Set compile options for CUDA sources with arch-specific features.
225+ # These get standard archs (pre-Blackwell) + Blackwell specific targets (a/f suffix).
226+ # They must NOT get generic Blackwell archs, as they use family/arch-specific PTX features.
213227foreach (cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources)
214228 set (arch_compile_options)
229+ foreach (arch IN LISTS NVTE_STANDARD_ARCHS)
230+ list (APPEND arch_compile_options "--generate-code=arch=compute_${arch} ,code=sm_${arch} " )
231+ endforeach ()
215232 foreach (arch IN LISTS NVTE_SPECIFIC_ARCHS)
216233 list (APPEND arch_compile_options "--generate-code=arch=compute_${arch} ,code=sm_${arch} " )
217234 endforeach ()
@@ -232,6 +249,10 @@ list(APPEND transformer_engine_SOURCES
232249endif ()
233250
# Build the transformer_engine shared library from the collected source list.
234251add_library (transformer_engine SHARED ${transformer_engine_SOURCES} )
252+ # Disable CMake's automatic architecture flag injection.
253+ # All architectures are handled explicitly via per-source COMPILE_OPTIONS
254+ # using NVTE_STANDARD_ARCHS, NVTE_GENERIC_ARCHS, and NVTE_SPECIFIC_ARCHS above.
# A false value for CUDA_ARCHITECTURES (see policy CMP0104) makes CMake pass no architecture
# flags of its own for this target.
# NOTE(review): the hand-written flags use code=sm_<arch> only, whereas CMake's handling of a
# suffix-less entry embeds both SASS and PTX -- confirm that dropping PTX for the standard
# architectures is intended.
255+ set_target_properties (transformer_engine PROPERTIES CUDA_ARCHITECTURES OFF )
# PUBLIC: the include directory is used to build this target and is propagated to its consumers.
235256target_include_directories (transformer_engine PUBLIC
236257 "${CMAKE_CURRENT_SOURCE_DIR} /include" )
237258
0 commit comments