diff --git a/.github/workflows/_runs-on-nv-step1.yml b/.github/workflows/_runs-on-nv-step1.yml index 79b7ebc840..8943736fa1 100644 --- a/.github/workflows/_runs-on-nv-step1.yml +++ b/.github/workflows/_runs-on-nv-step1.yml @@ -55,6 +55,37 @@ jobs: """ fi + # open job after dynamic torch ready (with out unique + gen diopi suffix lib) + Build-torch-dynamic: + if: false + name: Build-torch-dynamic + runs-on: ${{ inputs.runner }} + env: + GETRUNNER: ${{ inputs.runner }} + DEEPLINK_PATH: ${{ inputs.deeplink_path }} + ENV_PATH: ${{ inputs.env_path }} + SLURM_PAR_V100: "pat_dev" + BUILD_TEST2: "build_test_dyn" + steps: + - name: build + run: | + if [[ "${GETRUNNER}" == *sco* ]];then + set -e + cd ${DEEPLINK_PATH}/ && ls -al && find ${DEEPLINK_PATH}/ -maxdepth 1 -mmin +300 -type d |xargs rm -rf + cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER} && rm -rf ${BUILD_TEST2} && cp -R source ${BUILD_TEST2} && cd ${BUILD_TEST2} + srun --job-name=${GITHUB_JOB} bash -c "export USE_COVERAGE=ON && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/${BUILD_TEST2} \ + && source ${ENV_PATH}/pt2.0_diopi \ + && cd impl && bash scripts/build_impl.sh torch_dyload" || ( cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf ${BUILD_TEST2} && exit 1 ) + else + ssh SH1424 """ + set -e + cd ${DEEPLINK_PATH}/ && ls -al && find ${DEEPLINK_PATH}/ -maxdepth 1 -mmin +300 -type d |xargs rm -rf + source ${ENV_PATH}/pt2.0_diopi + cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER} && rm -rf ${BUILD_TEST2} && cp -R source ${BUILD_TEST2} && cd ${BUILD_TEST2} + srun --job-name=${GITHUB_JOB} --partition=${SLURM_PAR_V100} --time=20 bash -c 'cd impl && bash scripts/build_impl.sh torch_dyload' || ( cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf ${BUILD_TEST2} && exit 1 ) + """ + fi + Gen-Data: name: Gen-Data needs: [Build-Nvidia] @@ -82,7 +113,7 @@ jobs: set -e source ${ENV_PATH}/pt2.0_diopi cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER} && cd ${BUILD_TEST1} && cd diopi_test/python && - srun --job-name=${GITHUB_JOB} 
--partition=${SLURM_PAR_V100} --time=20 --gres=gpu:1 bash -c 'python main.py --mode gen_data' \ + srun --job-name=${GITHUB_JOB} --partition=${SLURM_PAR_V100} --time=30 --gres=gpu:1 bash -c 'python main.py --mode gen_data' \ || ( cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/${BUILD_TEST1} && git clean -xdf ${GEN_DATA} && exit 1 ) """ else diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6aee01f4ac..60addbeab9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -205,7 +205,7 @@ jobs: set -e source /mnt/cache/share/platform/env/${ENV_NAME} cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER} && cd ${BUILD_TEST1}_A100 && cd diopi_test/python && ls && - srun --job-name=${GITHUB_JOB} --partition=${SLURM_PAR_SH1424} --time=20 --gres=gpu:${GPU_REQUESTS} bash -c 'python main.py --mode gen_data' \ + srun --job-name=${GITHUB_JOB} --partition=${SLURM_PAR_SH1424} --time=30 --gres=gpu:${GPU_REQUESTS} bash -c 'python main.py --mode gen_data' \ || ( cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/${BUILD_TEST1}_A100 && git clean -xdf ${GEN_DATA} && exit 1 ) """ - name: test-op @@ -215,7 +215,7 @@ jobs: source /mnt/cache/share/platform/env/${ENV_NAME} && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER} && cd ${BUILD_TEST1}_A100 export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/${BUILD_TEST1}_A100/impl/lib echo \$LD_LIBRARY_PATH - srun --job-name=${GITHUB_JOB} --partition=${SLURM_PAR_SH1424} --time=20 --gres=gpu:${GPU_REQUESTS} bash -c 'cd diopi_test/python && python main.py --mode gen_case && python main.py --mode run_test' \ + srun --job-name=${GITHUB_JOB} --partition=${SLURM_PAR_SH1424} --time=30 --gres=gpu:${GPU_REQUESTS} bash -c 'cd diopi_test/python && python main.py --mode gen_case && python main.py --mode run_test' \ && git clean -xdf ${GEN_DATA} || ( cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/${BUILD_TEST1}_A100 && git clean -xdf ${GEN_DATA} && exit 1 ) """ diff --git a/adaptor/codegen/gen.py b/adaptor/codegen/gen.py index 
a3cd520853..af8727d8b4 100644 --- a/adaptor/codegen/gen.py +++ b/adaptor/codegen/gen.py @@ -196,7 +196,9 @@ def prepare() -> Tuple[dict, str]: impl_plugin = options.impl_plugin base_device = options.base_device - + assert base_device is None or base_device == "" or base_device == "torch", f"invalid base_device:{base_device}" + if base_device == "": + base_device = None def create_if_not_exist(name): if not os.path.exists(name): os.makedirs(name) @@ -758,7 +760,7 @@ def gen_base_device_impl_funcs(device: str, base_device: str, dirs: dict, impl_f impl_basedev_functions = get_all_impl_functions(base_device_impl_dir) # remove ops already exist in device impl. impl_basedev_functions = {op: args for op, args in impl_basedev_functions.items() if op not in impl_functions} - + funcs_info, funcs_decl_raw = get_functions_support(dirs.get("source")) func_base_decl = get_impl_funcs_declaration( funcs_decl_raw, funcs_info, impl_basedev_functions.keys(), True, diff --git a/impl/cmake/ImplHelper.cmake b/impl/cmake/ImplHelper.cmake new file mode 100644 index 0000000000..467801fbb0 --- /dev/null +++ b/impl/cmake/ImplHelper.cmake @@ -0,0 +1,48 @@ +function(diopi_use_adapter adaptor_dir diopi_impl_dir config_device base_device + out_src_files) + # NB: all arguments passed by func parameters instead of global variables. 
+ file(GLOB ADAPTOR_TEMPLATE_CODE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${adaptor_dir}/codegen/*.py) + add_custom_target(adaptor_gen_dependency DEPENDS ${ADAPTOR_TEMPLATE_CODE}) + set(ADAPTOR_CSRC_PATH "${adaptor_dir}/csrc") + + set(ADAPTER_GEN_FILES ${ADAPTOR_CSRC_PATH}/diopi_adaptor.cpp ${ADAPTOR_CSRC_PATH}/impl_functions.hpp) + add_custom_target(adaptor_code_gen + COMMAND python3 ${adaptor_dir}/codegen/gen.py --diopi_dir=${diopi_impl_dir}/../ --output_dir=${ADAPTOR_CSRC_PATH} --config_device=${config_device} --base_device=${base_device} + BYPRODUCTS ${ADAPTER_GEN_FILES} + DEPENDS adaptor_gen_dependency + VERBATIM + ) + list(APPEND ${out_src_files} ${ADAPTOR_CSRC_PATH}/convert.cpp ${ADAPTOR_CSRC_PATH}/diopi_adaptor.cpp ${ADAPTOR_CSRC_PATH}/composite_ops.cpp) + set(${out_src_files} ${${out_src_files}} PARENT_SCOPE) +endfunction() + + +function(prep_dyn_load diopi_impl_dir device_impl) + set(DYN_GEN_FILE ${CMAKE_BINARY_DIR}/src/impl/wrap_function.cpp) + set(DYN_HELP_DIR ${diopi_impl_dir}/scripts/dyn_load_helper) + file(GLOB DYN_GEN_DEPS ${DYN_HELP_DIR}/dyn_wrap_gen.py) + + add_custom_target(dyn_wrap_gen ALL + COMMAND python ${DYN_HELP_DIR}/dyn_wrap_gen.py -o ${DYN_GEN_FILE} + DEPENDS ${DYN_GEN_DEPS} + BYPRODUCTS ${DYN_GEN_FILE} + WORKING_DIRECTORY ${DYN_HELP_DIR}) + set(DYN_SRC ${DYN_GEN_FILE} ${DYN_HELP_DIR}/dyn_helper.cpp) + + add_library(${device_impl} SHARED ${DYN_SRC}) + target_link_libraries(${device_impl} ${CMAKE_DL_LIBS}) + target_include_directories(${device_impl} PRIVATE ${DYN_HELP_DIR}) + add_dependencies(${device_impl} dyn_wrap_gen) +endfunction() + +function(handle_dyn_torch diopi_impl_dir real_impl torch_dir device_impl) + add_custom_target(dyn_torch + COMMAND ${diopi_impl_dir}/scripts/dyn_load_helper/dyn_torch_handler.sh patch_diopi + ${LIBRARY_OUTPUT_PATH} ${torch_dir}/lib + DEPENDS ${real_impl} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + message(STATUS "handle_dyn_torch with torch: ${torch_dir}") + add_dependencies(${device_impl} dyn_torch) 
+endfunction() + diff --git a/impl/cmake/TorchBaseFunc.cmake b/impl/cmake/TorchBaseFunc.cmake new file mode 100644 index 0000000000..b340e8c079 --- /dev/null +++ b/impl/cmake/TorchBaseFunc.cmake @@ -0,0 +1,28 @@ + +macro(diopi_find_torch) + execute_process( + COMMAND sh -c "python -c 'import torch;print(torch.utils.cmake_prefix_path)'" + OUTPUT_VARIABLE DIOPI_TORCH_CMAKE_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(DIOPI_TORCH_CMAKE_PREFIX) + # this config is appened to existing CMAKE_PREFIX_PATH and not overwrite + # user provided CMAKE_PREFIX_PATH. + list(APPEND CMAKE_PREFIX_PATH ${DIOPI_TORCH_CMAKE_PREFIX}) + endif() + message(STATUS "diopi CMAKE_PREFIX_PATH:${CMAKE_PREFIX_PATH}") + + + find_package(Torch REQUIRED) + if (Torch_FOUND) + message(STATUS "TORCH_CXX_FLAGS: ${TORCH_CXX_FLAGS}") + message(STATUS "TORCH_LIBRARIES: ${TORCH_LIBRARIES}") + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") + add_definitions(-DTORCH_VERSION_MAJOR=${Torch_VERSION_MAJOR}) + add_definitions(-DTORCH_VERSION_MINOR=${Torch_VERSION_MINOR}) + add_definitions(-DTORCH_VERSION_PATCH=${Torch_VERSION_PATCH}) + add_definitions(-DTORCH_VERSION=${Torch_VERSION}) + message(STATUS "Found Torch Version: ${Torch_VERSION}") + endif() + +endmacro() diff --git a/impl/muxi/CMakeLists.txt b/impl/muxi/CMakeLists.txt index ea7aee9f35..fe247f36f2 100644 --- a/impl/muxi/CMakeLists.txt +++ b/impl/muxi/CMakeLists.txt @@ -4,41 +4,30 @@ project(muxi_impl) add_compile_definitions(USE_MACA=1) set(USE_MACA ON) -set(BASE_TORCH_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../torch") -include(${BASE_TORCH_DIR}/cmake/TorchBaseFunc.cmake) -InitFindTorch() +include(../cmake/ImplHelper.cmake) +include(../cmake/TorchBaseFunc.cmake) +diopi_find_torch() -find_package(Torch REQUIRED) -if (Torch_FOUND) - message(STATUS "TORCH_CXX_FLAGS: ${TORCH_CXX_FLAGS}") - message(STATUS "TORCH_LIBRARIES: ${TORCH_LIBRARIES}") - - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") - 
add_definitions(-DTORCH_VERSION_MAJOR=${Torch_VERSION_MAJOR}) - add_definitions(-DTORCH_VERSION_MINOR=${Torch_VERSION_MINOR}) - add_definitions(-DTORCH_VERSION_PATCH=${Torch_VERSION_PATCH}) - add_definitions(-DTORCH_VERSION=${Torch_VERSION}) - message(STATUS "Found Torch Version: ${Torch_VERSION}") -endif() +# TODO: Report bugs to muxi +# There has conflict when muxi runtime used together with pip installed torch_cpu. +# so to use dipu with torch_cpu in muxi, maunual compile torch cpu with export BLAS=OpenBLAS. + +set(BASE_TORCH_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../torch") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") file(GLOB REAL_IMPL_SRC - ${BASE_TORCH_DIR}/functions/error.cpp - ${BASE_TORCH_DIR}/functions/functions.cpp - - ${BASE_TORCH_DIR}/functions/functions_lightllm.cpp - ${BASE_TORCH_DIR}/functions/functions_mmcv.cpp - ${BASE_TORCH_DIR}/helper.cpp - ${BASE_TORCH_DIR}/functions/functions_mmcv/*.cu - - ${BASE_TORCH_DIR}/functions/functions_ext.cpp - ${BASE_TORCH_DIR}/functions/functions_ext/*.cu - - ${BASE_TORCH_DIR}/build_aten.cpp - - # mx cpp - functions/functions.cpp + ${BASE_TORCH_DIR}/functions/error.cpp + ${BASE_TORCH_DIR}/functions/functions.cpp + ${BASE_TORCH_DIR}/functions/functions_lightllm.cpp + ${BASE_TORCH_DIR}/functions/functions_mmcv.cpp + ${BASE_TORCH_DIR}/helper.cpp + ${BASE_TORCH_DIR}/functions/functions_mmcv/*.cu + ${BASE_TORCH_DIR}/functions/functions_ext.cpp + ${BASE_TORCH_DIR}/functions/functions_ext/*.cu + ${BASE_TORCH_DIR}/build_aten.cpp + # mx cpp + functions/functions.cpp ) # adaptor @@ -48,28 +37,27 @@ if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/convert_config.yaml") endif() if(USE_ADAPTOR) - # dependency - file(GLOB ADAPTOR_TEMPLATE_CODE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${ADAPTOR_DIR}/codegen/*.py) - add_custom_target(adaptor_gen_dependency DEPENDS ${ADAPTOR_TEMPLATE_CODE}) + diopi_use_adapter(${ADAPTOR_DIR} ${DIOPI_IMPL_DIR} "torch" "" REAL_IMPL_SRC) +endif() - set(ADAPTOR_CSRC_PATH "${ADAPTOR_DIR}/csrc") - set(GEN_FILES 
${ADAPTOR_CSRC_PATH}/diopi_adaptor.cpp ${ADAPTOR_CSRC_PATH}/impl_functions.hpp) - add_custom_target(adaptor_code_gen - COMMAND python3 ${ADAPTOR_DIR}/codegen/gen.py --diopi_dir=${DIOPI_IMPL_DIR}/../ --output_dir=${ADAPTOR_CSRC_PATH} - --config_device=muxi --base_device=torch - BYPRODUCTS ${GEN_FILES} - DEPENDS adaptor_gen_dependency) - list(APPEND REAL_IMPL_SRC ${ADAPTOR_CSRC_PATH}/convert.cpp ${ADAPTOR_CSRC_PATH}/diopi_adaptor.cpp ${ADAPTOR_CSRC_PATH}/composite_ops.cpp) +if(${DYLOAD}) + prep_dyn_load(${DIOPI_IMPL_DIR} ${DEVICEIMPL}) + set(REAL_IMPL diopi_real_impl) +else() + set(REAL_IMPL ${DEVICEIMPL}) endif() -cuda_add_library(${DEVICEIMPL} SHARED ${REAL_IMPL_SRC}) -target_link_libraries(${DEVICEIMPL} ${TORCH_LIBRARIES}) +cuda_add_library(${REAL_IMPL} SHARED ${REAL_IMPL_SRC}) +target_link_libraries(${REAL_IMPL} ${TORCH_LIBRARIES}) add_subdirectory(functions/functions_ext/flash-attention) -target_link_libraries(${DEVICEIMPL} diopi_torch_ext_flash_attn) -target_include_directories(${DEVICEIMPL} PRIVATE ${BASE_TORCH_DIR}) +target_link_libraries(${REAL_IMPL} diopi_torch_ext_flash_attn) +target_include_directories(${REAL_IMPL} PRIVATE ${BASE_TORCH_DIR}) if(USE_ADAPTOR) - add_dependencies(${DEVICEIMPL} adaptor_code_gen) + add_dependencies(${REAL_IMPL} adaptor_code_gen) +endif() +if(${DYLOAD}) + handle_dyn_torch(${DIOPI_IMPL_DIR} ${REAL_IMPL} ${TORCH_INSTALL_PREFIX} ${DEVICEIMPL}) endif() if (TEST) diff --git a/impl/muxi/test/CMakeLists.txt b/impl/muxi/test/CMakeLists.txt index 4b12592047..5c4526b6d6 100644 --- a/impl/muxi/test/CMakeLists.txt +++ b/impl/muxi/test/CMakeLists.txt @@ -15,7 +15,7 @@ set(RUNTIME_SRC # use torch cuda runtime ${BASE_TORCH_DIR}/test/conform_test.cpp ) -add_library(diopirt SHARED ${RUNTIME_SRC}) +cuda_add_library(diopirt SHARED ${RUNTIME_SRC}) message(STATUS "test diopirt CUDA_LIBRARIES is:" ${CUDA_LIBRARIES}) target_link_libraries(diopirt ${CUDA_LIBRARIES}) diff --git a/impl/scripts/build_impl.sh b/impl/scripts/build_impl.sh index 
2c91e9a08b..3fc82cff94 100644 --- a/impl/scripts/build_impl.sh +++ b/impl/scripts/build_impl.sh @@ -41,6 +41,12 @@ case $1 in -DENABLE_COVERAGE=${USE_COVERAGE} make_maca -j8 ;; + muxi_dyload) + mkdir -p build && cd build + cmake_maca .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DIMPL_OPT=muxi -DCMAKE_BUILD_TYPE=Release -DDYLOAD=ON -DTEST=ON \ + && make_maca -j8 + mkdir -p ${DIOPI_TEST_PATH}/lib && ln -sf ${CURRENT_DIR}/../lib/libdiopi_real_impl.so ${DIOPI_TEST_PATH}/lib + ;; camb_pytorch) mkdir -p build && cd build cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DIMPL_OPT=camb_pytorch -DCMAKE_BUILD_TYPE=Release -DTEST=ON \ diff --git a/impl/scripts/dyn_load_helper/dyn_helper.cpp b/impl/scripts/dyn_load_helper/dyn_helper.cpp new file mode 100644 index 0000000000..d16717c68c --- /dev/null +++ b/impl/scripts/dyn_load_helper/dyn_helper.cpp @@ -0,0 +1,32 @@ +#include <dlfcn.h> + +#include <cstdio> +#include <filesystem> +#include <stdexcept> + +void* dynLoadFile(const char* diopiRealName) { + namespace fs = std::filesystem; + void* handle = dlopen(diopiRealName, RTLD_LAZY | RTLD_LOCAL | RTLD_DEEPBIND); + if (!handle) { + Dl_info info; + if (dladdr(reinterpret_cast<void*>(dynLoadFile), &info) != 0 && info.dli_fname != nullptr) { + fs::path fpath(info.dli_fname); + auto diopiInLoader = fpath.parent_path().append(diopiRealName).string(); + printf( + "diopi dyload fail, seems LD_LIBRARY_PATH not contains %s, try to load " + "from loader current dir's %s \n", + diopiRealName, + diopiInLoader.c_str()); + + handle = dlopen(diopiInLoader.c_str(), RTLD_LAZY | RTLD_LOCAL | RTLD_DEEPBIND); + } + } + if (!handle) { + fprintf(stderr, + "! 
please note that dynamic loaded diopi_impl.so need explictly link to it's \ + diopi_rt (now is torch_dipu), so it cannot be used for diopi-test now \n"); + fprintf(stderr, "%s \n", dlerror()); + throw std::runtime_error("diopi_init err"); + } + return handle; +} diff --git a/impl/scripts/dyn_load_helper/dyn_helper.hpp b/impl/scripts/dyn_load_helper/dyn_helper.hpp new file mode 100644 index 0000000000..2612dca40c --- /dev/null +++ b/impl/scripts/dyn_load_helper/dyn_helper.hpp @@ -0,0 +1,3 @@ +#pragma once + +void* dynLoadFile(const char* diopiRealName); diff --git a/impl/scripts/dyn_load_helper/dyn_torch_handler.sh b/impl/scripts/dyn_load_helper/dyn_torch_handler.sh new file mode 100755 index 0000000000..5fd10e6b35 --- /dev/null +++ b/impl/scripts/dyn_load_helper/dyn_torch_handler.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash +set -eo pipefail + +# pip install patchelf + +diopi_suffix=".diopi" +torch_raws=("libtorch.so" "libtorch_cuda.so" "libtorch_cpu.so" "libc10_cuda.so" "libc10.so") +torch_4diopi=() +for ((i=0; i<${#torch_raws[@]}; i++)); do + torch_4diopi[i]=${torch_raws[$i]}${diopi_suffix} +done + +# even using RTLD_DEEPBIND load, same name lib still has only one instance in the address space. +# RTLD_DEEPBIND just make same symbol name in different lib names be loaded as 2 instance but not +# symbol in same lib name. 
+function gen_versioned_torch() { + for ((i=0; i<${#torch_raws[@]}; i++)); do + cp ${torch_raws[$i]} ${torch_4diopi[$i]} + done + + for ((i=0; i<${#torch_4diopi[@]}; i++)); do + libi=${torch_4diopi[$i]} + replace_items="" + for ((depIdx=i+1; depIdx<${#torch_4diopi[@]}; depIdx++)); do + dep_raw=${torch_raws[$depIdx]} + dep_4diopi=${torch_4diopi[$depIdx]} + replace_items=${replace_items}" --replace-needed ${dep_raw} ${dep_4diopi}" + + done + patchelf ${replace_items} --set-soname ${libi} ${libi} + done +} + +function check_correct_torch() { + echo "check diopi torch: $1" + + # TODO: use an elf lib to remove unique flag in *so.diopi + # remove unique symbols of both cpu torch (dipu use) and device torch (diopi use). + # if you device torch is compiled by clang, which not supporting -fno-gnu-unique, + # just test if it works (eg: muxi torch with unique can coexist with no-uniqued cpu torch) + + echo "please check if you torch builded with -fno-gnu-unique to support multi version torch coexist" + set +e + chk_ret=`cd "$1" && ls -alh |grep '\.so\.diopi' | wc -l` + set -e + if [[ ${chk_ret} -ne ${#torch_4diopi[@]} ]]; then + echo "ret value: ${chk_ret}, in device-torch dir, not find dyn-load needed XX.so.diopi libs!" + echo "!! please manual run dyn_torch_handler.sh patch_torch {device_torch_dir} to gen dyn-load needed multi-version torch" + exit 1 + fi + + echo "diopi torch version check ok" +} + +function patch_diopi_torch() { + removed_items="" + added_items="" + for ((i=0; i<${#torch_4diopi[@]}; i++)); do + dep_raw=${torch_raws[$i]} + dep_4diopi=${torch_4diopi[$i]} + removed_items=${removed_items}" --remove-needed ${dep_raw}" + added_items=${added_items}" --add-needed ${dep_4diopi}" + done + patchelf ${removed_items} libdiopi_real_impl.so + patchelf ${added_items} libdiopi_real_impl.so +} + +# 1.because dipu libs are loaded by python using dlload. so relative to python main, real_diopi_libs are +# 2-hop dynamic loaded. 
it cannot see first-hop loaded libs like torch_dipu (unless the lib is loaded +# using RTLD_GLOBAL, but it's not used when directly loading python lib). so diopi need maunal link +# torch_dipu.so lib. +# 2.although both the 1st hop dynamic-loaded lib and the 2ed's link to torch_dipu.so, they still share +# the same lib instance in addr space. +function patch_diopi_dipu() { + patchelf --remove-needed libtorch_dipu.so libdiopi_real_impl.so + patchelf --add-needed libtorch_dipu.so libdiopi_real_impl.so +} + +function remove_unique_symbol() { + group="torch_$1" # raws or 4diopi + place="${2:-.}" + array="$group[@]" + for name in "${!array}"; do + echo "PATCH $name" + python elffile_remove_unique.py -i "${place%%/}/$name" + done +} + +WORK_DIR=$2 +cd ${WORK_DIR} +if [[ "$1" == "patch_torch" ]]; then + gen_versioned_torch +elif [[ "$1" == "patch_diopi" ]]; then + check_correct_torch $3 + # in dipoi link lib list, torch_dipu.so must be placed behind torch_XX libs. + # because both dipu and inner 'DEEPBIND' torch_cpu call Library.fallback() which is + # a template function instantiated when parameter types CppFunction is first called + # (!!! not directly link to the template definition in external torch_cpu.so !!!). + # if torch_dipu.so is linked in front, fallback() symbol is bind to the symbol + # in torch_dipu.so which use external torch template class that cannot work with inner torch CppFunction. + patch_diopi_dipu + patch_diopi_torch +elif [[ "$1" == "rm_unique_raw" ]]; then + remove_unique_symbol raws +elif [[ "$1" == "rm_unique_diopi" ]]; then + remove_unique_symbol 4diopi +fi diff --git a/impl/torch/code_gen.py b/impl/scripts/dyn_load_helper/dyn_wrap_gen.py similarity index 69% rename from impl/torch/code_gen.py rename to impl/scripts/dyn_load_helper/dyn_wrap_gen.py index dbe3df60cf..be8dbf9924 100644 --- a/impl/torch/code_gen.py +++ b/impl/scripts/dyn_load_helper/dyn_wrap_gen.py @@ -1,5 +1,6 @@ # Copyright (c) 2023, DeepLink. 
import os +import argparse # CONVERT # # DIOPI_API diopiError_t diopiBmm(diopiContextHandle_t ctx, diopiTensorHandle_t out, @@ -14,36 +15,26 @@ # } new_content = [] -new_content.append('/**\n\ - * @file\n\ - * @author DeepLink\n\ - * @copyright (c) 2023, DeepLink.\n\ - */\n\ -#include \n\ -#include \n\ -#include \n\ -#include \n\ -#include \n\ -\n\ -static void* handle;\n\ -\n\ -static void\n\ -__attribute__ ((constructor))\n\ -diopi_init(void) {\n\ - handle = dlopen("libdiopi_real_impl.so", RTLD_LAZY | RTLD_LOCAL | RTLD_DEEPBIND);\n\ - printf("diopi dyload init\\n");\n\ - if (!handle) {\n\ - fprintf (stderr, "%s ", dlerror());\n\ - }\n\ -}\n\ -\n\ -static void\n\ -__attribute__ ((destructor))\n\ -diopi_fini(void)\n\ -{\n\ -dlclose(handle);\n\ -}\n\ -\n') +new_content.append(''' +/** + * @file + * @author DeepLink + * @copyright (c) 2023, DeepLink. + */ +#include +#include +#include +#include +#include + +#include + +static void* handle; +const static char* diopiFile = "libdiopi_real_impl.so"; +static void __attribute__((constructor)) diopi_init() { handle = dynLoadFile(diopiFile); } +static void __attribute__((destructor)) diopi_fini() { dlclose(handle); } + +''') def get_func_arg(content): @@ -106,7 +97,7 @@ def gen_wrapper_func(content): new_content.append(args) new_content.append(" " + 'func = reinterpret_cast(dlsym(handle, "' + func_name + '"));\n') - new_content.append(" " + "if (func != NULL) {\n") + new_content.append(" " + "if (func != nullptr) {\n") new_content.append(" " + " return (*func)" + arg + ";\n") new_content.append(" " + "} else {\n") new_content.append(" " + " printf(\"[wrap_func] %s not implemented!\\n\", \"" + func_name + "\");\n") @@ -118,25 +109,38 @@ def gen_wrapper_func(content): new_content.append("}\n") new_content.append("\n") + +def parse_args(): + parser = argparse.ArgumentParser( + description="Generate DIOPI adaptor source files" + ) + parser.add_argument( + "-o", + "--output_file", + help="output generated dynamic call file", + ) 
+ args = parser.parse_args() + return args + +op_header_files = ['functions.h', 'functions_mmcv.h', 'functions_ext.h'] + if __name__ == '__main__': - print("open functions.h") - _cur_dir = os.path.dirname(os.path.abspath(__file__)) - with open(os.path.join(_cur_dir, '../proto/include/diopi/functions.h'), 'r')as f: - content = f.readlines() - print("generate for functions.h") - gen_wrapper_func(content) - print("open functions_mmcv.h") - with open(os.path.join(_cur_dir, '../proto/include/diopi/functions_mmcv.h'), 'r') as f: - content_mmcv = f.readlines() - print("generate for functions_mmcv.h") - gen_wrapper_func(content_mmcv) - with open(os.path.join(_cur_dir, '../proto/include/diopi/functions_ext.h'), 'r') as f: - content_ext = f.readlines() - print("generate for functions_ext.h") - gen_wrapper_func(content_ext) - os.system("rm -f wrap_func.cpp") - print("generate wrap_func.cpp") - with open('wrap_func.cpp', 'w') as f: + args = parse_args() + file_dir = os.path.dirname(os.path.abspath(__file__)) + protodir = file_dir + "/../../../proto/include/diopi/" + for fname in op_header_files: + with open(os.path.join(protodir, fname), 'r')as f: + content = f.readlines() + print(f"generate for {fname}") + gen_wrapper_func(content) + + + print(f"generate {args.output_file}") + out_dir = os.path.dirname(args.output_file) + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + with open(args.output_file, 'w') as f: for row in new_content: f.write(row) print("finish codegen") diff --git a/impl/scripts/dyn_load_helper/elffile_remove_unique.py b/impl/scripts/dyn_load_helper/elffile_remove_unique.py new file mode 100644 index 0000000000..e2f2a69ca9 --- /dev/null +++ b/impl/scripts/dyn_load_helper/elffile_remove_unique.py @@ -0,0 +1,90 @@ +import argparse +import shutil +import sys +from elftools.elf.elffile import ELFFile + + +def patch(input: str, output: str | None, *, verbose: bool = False) -> None: + # https://llvm.org/doxygen/BinaryFormat_2ELF_8h_source.html#l01291 + # 
https://refspecs.linuxfoundation.org/elf/gabi4+/ch4.symtab.html + # https://refspecs.linuxfoundation.org/elf/gabi4+/ch4.eheader.html + + ELF64_SYM_SIZE = 24 # size of 64 bit struct. + ST_INFO_OFFSET = 4 # size of Elf64_Word (uint32_t) + STB_GNU_UNIQUE = 10 + STB_WEAK = 2 + STV_HIDDEN = 2 + + def check(file): + # Also we may debug with: hexdump -C -n 20 FILE_PATH + ELFCLASS64 = 2 # 64-bit objects + ET_DYN = 3 # shared object file + + read_int = lambda nbytes: int.from_bytes(file.read(nbytes), "little") + file.seek(4) # offset EI_CLASS + assert read_int(1) == ELFCLASS64, "only x64 library are supported" + file.seek(16) # offset EI_NIDENT + assert read_int(2) == ET_DYN, "only dynamic library are supported" + file.seek(0) # reset + + address = [] # file offset of st_info + with open(input, "rb") as file: + check(file) + elf = ELFFile(file) + address = [] + section = elf.get_section_by_name( + ".dynsym" + ) # symtab ignored as we are using dlopen + for index, symbol in enumerate(section.iter_symbols()): + # st_info consists of bind (higher 4 bit) and type (lower 4 bit) + if symbol.entry.st_info.bind in ["STB_GNU_UNIQUE", "STB_LOOS"]: + # typedef struct { + # Elf64_Word st_name; + # unsigned char st_info; <--- here + # unsigned char st_other; + # Elf64_Half st_shndx; + # Elf64_Addr st_value; + # Elf64_Xword st_size; + # } Elf64_Sym; + offset = ( + section.header.sh_offset + index * ELF64_SYM_SIZE + ST_INFO_OFFSET + ) + address.append(offset) + + if verbose: + print(f"Found UNIQUE 0x{address[-1]:08x}: {symbol.name}") + + print(f"Found {len(address)} symbol(s)") + if output is None: + print(f"Patch inplace: {input}") + else: + print(f"Output to: {output}") + + output = ( + input + if output is None + else shutil.copyfile(input, output, follow_symlinks=False) + ) + with open(output, "r+b") as file: + for offset in address: + if verbose: + print(f"Patch UNIQUE 0x{offset:08x}") + + file.seek(offset) + info = int.from_bytes(file.read(1), "little") + assert (info >> 4) == 
STB_GNU_UNIQUE + + file.seek(offset) + file.write(bytes([(STB_WEAK << 4) | (info & 0xF), STV_HIDDEN])) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Replace STB_GNU_UNIQUE or STB_LOOS with STB_WEAK in shared libraries." + ) + parser.add_argument("-i", "--input", required=True, type=str) + parser.add_argument("-o", "--output", type=str, default=None) + parser.add_argument("-v", "--verbose", action="store_true", default=False) + parser.set_defaults(call=patch) + args = parser.parse_args(sys.argv[1:]) + args.call(args.input, args.output, verbose=args.verbose) diff --git a/impl/torch/CMakeLists.txt b/impl/torch/CMakeLists.txt index 9fdcc1c7d1..852a062698 100644 --- a/impl/torch/CMakeLists.txt +++ b/impl/torch/CMakeLists.txt @@ -4,36 +4,15 @@ project(torch_impl) option(HIP "Whether to use HIP when available" OFF) option(DIOPI_TORCH_UNSAFE_BUILDATEN "Whether to use fast but unsafe buildATen (caution: only use this with DIPU)" OFF) -include(cmake/TorchBaseFunc.cmake) -InitFindTorch() - -if (NOT Torch_FOUND) - find_package(Torch 2.0 REQUIRED) -endif() -if (Torch_FOUND) - message(STATUS "TORCH_CXX_FLAGS: ${TORCH_CXX_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") - add_definitions(-DTORCH_VERSION_MAJOR=${Torch_VERSION_MAJOR}) - add_definitions(-DTORCH_VERSION_MINOR=${Torch_VERSION_MINOR}) - add_definitions(-DTORCH_VERSION_PATCH=${Torch_VERSION_PATCH}) - add_definitions(-DTORCH_VERSION=${Torch_VERSION}) - message(STATUS "Found Torch Version: ${Torch_VERSION}") -else() - message(FATAL_ERROR "Libtorch version 1.10 is required, no suitable version was found") -endif() +include(../cmake/ImplHelper.cmake) +include(../cmake/TorchBaseFunc.cmake) +diopi_find_torch() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") list(APPEND CUDA_NVCC_FLAGS "-D__CUDA_NO_HALF_OPERATORS__;-std=c++17") - -if (DYLOAD) - # creat an empty file to pass wrap_func.cpp's existence check - # one can change code_gen.py or wrap_func.cpp to recompile 
once wrap_func.cpp built - execute_process(COMMAND touch ${CMAKE_CURRENT_SOURCE_DIR}/wrap_func.cpp) - add_custom_target(code_gen COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/code_gen.py - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) - set(IMPL_SRC wrap_func.cpp) -endif() +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) file(GLOB REAL_IMPL_SRC RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} functions/functions_sparse/*.cu functions/functions_mmcv/*.cu functions/functions_ext/*.cu functions/*.cpp helper.cpp build_aten.cpp) @@ -44,24 +23,17 @@ if(NOT EXISTS "${PROJECT_SOURCE_DIR}/convert_config.yaml") endif() if(USE_ADAPTOR) - # dependency - file(GLOB ADAPTOR_TEMPLATE_CODE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${ADAPTOR_DIR}/codegen/*.py) - add_custom_target(adaptor_gen_dependency DEPENDS ${ADAPTOR_TEMPLATE_CODE}) - - set(ADAPTOR_CSRC_PATH "${ADAPTOR_DIR}/csrc") - set(GEN_FILES ${ADAPTOR_CSRC_PATH}/diopi_adaptor.cpp ${ADAPTOR_CSRC_PATH}/impl_functions.hpp) - add_custom_target(adaptor_code_gen - COMMAND python3 ${ADAPTOR_DIR}/codegen/gen.py --diopi_dir=${DIOPI_IMPL_DIR}/../ --output_dir=${ADAPTOR_CSRC_PATH} --config_device=torch - BYPRODUCTS ${GEN_FILES} - DEPENDS adaptor_gen_dependency) - list(APPEND REAL_IMPL_SRC ${ADAPTOR_CSRC_PATH}/convert.cpp ${ADAPTOR_CSRC_PATH}/diopi_adaptor.cpp ${ADAPTOR_CSRC_PATH}/composite_ops.cpp) + diopi_use_adapter(${ADAPTOR_DIR} ${DIOPI_IMPL_DIR} "torch" "" REAL_IMPL_SRC) endif() +if(${DYLOAD}) + prep_dyn_load(${DIOPI_IMPL_DIR} ${DEVICEIMPL}) + set(REAL_IMPL diopi_real_impl) +else() + set(REAL_IMPL ${DEVICEIMPL}) +endif() if(HIP) include(../cmake/LoadHIP.cmake) -endif() - -if(USE_HIP) add_definitions(-DUSE_HIP) # hipify-perl execute_process( @@ -71,40 +43,22 @@ if(USE_HIP) execute_process( COMMAND bash ${CMAKE_CURRENT_SOURCE_DIR}/hipify-amend.sh ) -endif() + hip_add_library(${REAL_IMPL} SHARED ${REAL_IMPL_SRC}) + target_link_libraries(${REAL_IMPL} ${HIP_LIBRARIES} ${TORCH_LIBRARIES}) -if (DYLOAD) - set(REALIMPL diopi_real_impl) - 
if(USE_HIP) - hip_add_library(${REALIMPL} SHARED ${REAL_IMPL_SRC}) - add_library(${DEVICEIMPL} SHARED ${IMPL_SRC}) - target_link_libraries(${DEVICEIMPL} -ldl ${HIP_LIBRARIES}) - else() - cuda_add_library(${REALIMPL} SHARED ${REAL_IMPL_SRC}) - add_library(${DEVICEIMPL} SHARED ${IMPL_SRC}) - target_link_libraries(${DEVICEIMPL} -ldl ${CUDA_LIBRARIES}) - endif() - add_dependencies(${DEVICEIMPL} code_gen) # make sure to recompile once code_gen.py changed - target_link_libraries(${REALIMPL} ${TORCH_LIBRARIES}) - if (TEST) - add_dependencies(${REALIMPL} diopirt) - target_link_libraries(${REALIMPL} diopirt) - endif() -else () - if(USE_HIP) - hip_add_library(${DEVICEIMPL} SHARED ${REAL_IMPL_SRC}) - target_link_libraries(${DEVICEIMPL} ${HIP_LIBRARIES} ${TORCH_LIBRARIES}) - else() - cuda_add_library(${DEVICEIMPL} SHARED ${REAL_IMPL_SRC}) - # target_compile_definitions(${DEVICEIMPL} PRIVATE __CUDA_NO_HALF_OPERATORS__) - target_link_libraries(${DEVICEIMPL} ${CUDA_LIBRARIES} ${TORCH_LIBRARIES}) - add_subdirectory(functions/functions_ext/flash-attention) - target_link_libraries(${DEVICEIMPL} diopi_torch_ext_flash_attn) - endif() +else() + cuda_add_library(${REAL_IMPL} SHARED ${REAL_IMPL_SRC}) + # target_compile_definitions(${REAL_IMPL} PRIVATE __CUDA_NO_HALF_OPERATORS__) + target_link_libraries(${REAL_IMPL} ${CUDA_LIBRARIES} ${TORCH_LIBRARIES}) + add_subdirectory(functions/functions_ext/flash-attention) + target_link_libraries(${REAL_IMPL} diopi_torch_ext_flash_attn) endif() if(USE_ADAPTOR) - add_dependencies(${DEVICEIMPL} adaptor_code_gen) + add_dependencies(${REAL_IMPL} adaptor_code_gen) +endif() +if(${DYLOAD}) + handle_dyn_torch(${DIOPI_IMPL_DIR} ${REAL_IMPL} ${TORCH_INSTALL_PREFIX} ${DEVICEIMPL}) endif() if(DIOPI_TORCH_UNSAFE_BUILDATEN) diff --git a/impl/torch/cmake/TorchBaseFunc.cmake b/impl/torch/cmake/TorchBaseFunc.cmake deleted file mode 100644 index a8c925aa07..0000000000 --- a/impl/torch/cmake/TorchBaseFunc.cmake +++ /dev/null @@ -1,11 +0,0 @@ - -macro(InitFindTorch) - 
execute_process( - COMMAND sh -c "python -c 'import torch;print(torch.utils.cmake_prefix_path)'" - OUTPUT_VARIABLE DIOPI_TORCH_CMAKE_PREFIX - OUTPUT_STRIP_TRAILING_WHITESPACE) - message(STATUS "DIOPI_TORCH_CMAKE_PREFIX:${DIOPI_TORCH_CMAKE_PREFIX}") - if(DIOPI_TORCH_CMAKE_PREFIX) - list(APPEND CMAKE_PREFIX_PATH ${DIOPI_TORCH_CMAKE_PREFIX}) - endif() -endmacro() diff --git a/impl/torch/test/CMakeLists.txt b/impl/torch/test/CMakeLists.txt index cd5e9a7465..dee1bd6156 100644 --- a/impl/torch/test/CMakeLists.txt +++ b/impl/torch/test/CMakeLists.txt @@ -23,7 +23,7 @@ set(EXPORT_SRC message("CXX_LITERT_SRC:" ${CXX_LITERT_SRC}) pybind11_add_module(${DIOPIRT} SHARED ${EXPORT_SRC}) -add_library(diopirt SHARED ${RUNTIME_SRC}) +cuda_add_library(diopirt SHARED ${RUNTIME_SRC}) target_link_libraries(${DIOPIRT} PRIVATE diopirt) target_link_libraries(diopirt ${DEVICEIMPL})