From b704612da2364f9dcc3eca516d4d86e3aefed872 Mon Sep 17 00:00:00 2001 From: Marya Date: Wed, 24 Jan 2024 16:25:05 +0000 Subject: [PATCH] Adding OCK demo documentation --- .../build_vgg_resnet_action/action.yml | 4 +- .github/workflows/run_ock_demo.yml | 14 +- .../build_and_run_networks.py | 0 .../ock_demo_blog/getting_started.md | 528 ++++++++++++++++++ 4 files changed, 544 insertions(+), 2 deletions(-) rename examples/technical_blogs/{tartan_blog => ock_demo_blog}/build_and_run_networks.py (100%) create mode 100644 examples/technical_blogs/ock_demo_blog/getting_started.md diff --git a/.github/actions/build_vgg_resnet_action/action.yml b/.github/actions/build_vgg_resnet_action/action.yml index b571b36fe7..00916a970d 100644 --- a/.github/actions/build_vgg_resnet_action/action.yml +++ b/.github/actions/build_vgg_resnet_action/action.yml @@ -62,6 +62,7 @@ runs: wget --no-verbose https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5 python $portDNN_source/samples/networks/vgg/h5toBin.py vgg16_weights_tf_dim_ordering_tf_kernels.h5 cd .. + mkdir vgg_data && cp -r vdata/* vgg_data/ # Resnet50 mkdir rdata @@ -69,6 +70,7 @@ runs: wget --no-verbose https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5 python $portDNN_source/samples/networks/resnet50/h5toBin.py resnet50_weights_tf_dim_ordering_tf_kernels.h5 cd .. + mkdir resnet_data && cp -r rdata/* resnet_data/ # Preparing an image python $portDNN_source/samples/networks/img2bin.py $(pwd)/Labrador_Retriever_Molly.jpg @@ -81,7 +83,7 @@ runs: - name: Package artifacts shell: bash run: | - tar -cvzf network_artifacts.tar.gz vdata rdata Labrador_Retriever_Molly.jpg Labrador_Retriever_Molly.jpg.bin + tar -cvzf network_artifacts.tar.gz vgg_data resnet_data Labrador_Retriever_Molly.jpg Labrador_Retriever_Molly.jpg.bin - name: Upload Artifacts uses: actions/upload-artifact@v4 diff --git a/.github/workflows/run_ock_demo.yml b/.github/workflows/run_ock_demo.yml index 29ad0b6845..4b09c7c573 100644 --- a/.github/workflows/run_ock_demo.yml +++ b/.github/workflows/run_ock_demo.yml @@ -57,6 +57,18 @@ jobs: mkdir linux_nightly_release tar -xzf sycl_linux.tar.gz -C linux_nightly_release + - name: Clone oneAPI-samples + run: git clone https://github.com/oneapi-src/oneAPI-samples.git + + - name: Setup environment and test Vector Add + run: | + export ONEAPI_ROOT=$(pwd)/linux_nightly_release + export PATH=$(pwd)/linux_nightly_release/bin:$PATH + export LD_LIBRARY_PATH=$(pwd)/linux_nightly_release/lib:$(pwd)/install/lib:$LD_LIBRARY_PATH + + $(pwd)/linux_nightly_release/bin/clang++ -fsycl $(pwd)/oneAPI-samples/DirectProgramming/C++SYCL/DenseLinearAlgebra/vector-add/src/vector-add-buffers.cpp -o vect + CA_RISCV_DUMP_IR=1 CA_HAL_DEBUG=1 OCL_ICD_FILENAMES=$(pwd)/install/lib/libCL.so ONEAPI_DEVICE_SELECTOR=opencl:acc SYCL_CONFIG_FILE_NAME="" ./vect + - name: Test RSICV examples run: | # Set environment variables @@ -93,7 +105,7 @@ jobs: - name: Package Build Artifacts run: | - tar -cvzf ock_demo_artifacts.tar.gz install portDNN_build_dir portBLAS_build_dir build_riscv_tests + tar -cvzf ock_demo_artifacts.tar.gz install portDNN_build_dir portBLAS_build_dir build_riscv_tests vect examples/technical_blogs/ock_demo_blog/getting_started.md - name: Upload Artifacts uses: actions/upload-artifact@v4 diff --git a/examples/technical_blogs/tartan_blog/build_and_run_networks.py b/examples/technical_blogs/ock_demo_blog/build_and_run_networks.py similarity index 100% rename from examples/technical_blogs/tartan_blog/build_and_run_networks.py rename to examples/technical_blogs/ock_demo_blog/build_and_run_networks.py diff --git a/examples/technical_blogs/ock_demo_blog/getting_started.md b/examples/technical_blogs/ock_demo_blog/getting_started.md new file mode 100644 index 0000000000..3573f29257 --- /dev/null +++ b/examples/technical_blogs/ock_demo_blog/getting_started.md @@ -0,0 +1,528 @@ +Getting Started +=== + +## Introduction + +The release package is a self-contained pre-built project to demonstrate how to run OpenCL + and SYCL applications on the RISC-V spike simulator, including a deep neural + network demo that uses the VGG network. + +We have put together a set of guides that serve as a walkthrough for a selection +of OpenCL and SYCL applications. + +The release package contains a number of different directories: + +- oneAPI Construction Kit(OCK) is Codeplay’s multi-target, multi-platform framework. + - A few examples have been added to OCK that are useful for analyzing and demonstrating + the performance of OCK and oneAPI on RISC-V. +- portDNN and portBLAS network sample project that executes VGG and Resnet50 networks + +```Note: cuda-to-sycl-nbody is in the process of being included to the new OCK demo package``` + +For dpc++ support, Intel oneAPI nightly releases can be downloaded from [here](https://github.com/intel/llvm/releases). A few sycl examples can be used for testing from [oneAPI-Samples](https://github.com/oneapi-src/oneAPI-samples). + +You can run SYCL code using the `vector-add` from oneAPI-samples and samples from +OCK's `examples/applications`. Both directories implement SYCL code and therefore demonstrate +running SYCL code using the oneAPI. + +You can also run an implementation of the VGG and Resnet50 neural networks +written using portDNN, SYCL and C++. + +## Installation and Setup + +OCK demo package contains the build and install directories of all components. All the repositories mentioned above are open-sourced and can be cloned as follows. + +- [oneAPI Construction Kit](https://github.com/codeplaysoftware/oneapi-construction-kit.git) +- [portDNN](https://github.com/codeplaysoftware/portDNN.git) +- [portBLAS](https://github.com/codeplaysoftware/portBLAS.git) +- [oneAPI Samples](https://github.com/oneapi-src/oneAPI-samples) +- [oneAPI toolkit](https://github.com/intel/llvm/releases) +- [cuda-to-sycl-nbody](https://github.com/codeplaysoftware/cuda-to-sycl-nbody.git) + +The above repositories can be built for dpc++ following their respective build instructions. +All of the following instructions have been tested on Ubuntu:20.04. + +You'll need to extract the files from the archive file: + +```sh + mkdir Release + cd Release + tar -xf ../ock_demo_artifacts.tar.gz + tar -xf ../network_artifacts.tar.gz + export RELEASE_DIR=$PWD +``` + +Once you have extracted the files from the archive file, you should have a +hierarchy that looks like this: + +``` +- WORKDIR: $RELEASE_DIR + - DIR: build_riscv_test/ + - DIR: install/ + - DIR: portBLAS_build_dir/ + - DIR: portDNN_build_dir/ + - DIR: resnet_data + - DIR: vgg_data + - FILE: Labrador_Retriever_Molly.jpg + - FILE: Labrador_Retriever_Molly.jpg.bin + - FILE: get-started.md + - FILE: vect +``` + +## Running the examples + +Once you have followed either the local installation or untared prebuilt OCK demo, you are ready to run some OpenCL and SYCL code. + +This section will be split in four parts: +- Running the vector-add from oneAPI-samples. +- Running the VGG16 and ResNet50 neural networks. +- Running the samples from OCK. + +### oneAPI-Samples + +oneAPI-samples is a comprehensive repository containing a lot of non-sycl examples, but for the sake of our example, we will be only focusing on the ```vector-add``` one. This sample does what it says, it implements adding two vectors together using SYCL. + +```sh + cd $RELEASE_DIR + CA_RISCV_DUMP_IR=1 CA_HAL_DEBUG=1 OCL_ICD_FILENAMES=$RELEASE_DIR/install/lib/libCL.so ONEAPI_DEVICE_SELECTOR=opencl:acc SYCL_CONFIG_FILE_NAME="" ./vect +``` + +This is the expected output you should have: + +``` +refsi_hal_device::mem_alloc(size=40000, align=128) -> 0xb8006300 +refsi_hal_device::mem_write(dst=0xb8006300, size=40000) +refsi_hal_device::mem_alloc(size=40000, align=128) -> 0xb7ffc680 +refsi_hal_device::mem_write(dst=0xb7ffc680, size=40000) +refsi_hal_device::mem_alloc(size=40000, align=128) -> 0xb7ff2a00 +refsi_hal_device::mem_write(dst=0xb7ff2a00, size=40000) +; ModuleID = 'SPIR-V' +source_filename = "SPIR-V" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown-elf" + +%"class.sycl::_V1::range" = type { %"class.sycl::_V1::detail::array" } +%"class.sycl::_V1::detail::array" = type { [1 x i64] } + +; Function Attrs: nofree norecurse nounwind memory(read, argmem: readwrite) +define spir_kernel void @_ZTSN4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ9VectorAddRNS0_5queueERKSt6vectorIiSaIiEESB_RS9_ENKUlRNS0_7handlerEE_clESE_EUlT_E_EE(ptr nocapture readonly byval(%"class.sycl::_V1::range") %_arg_UserRange, ptr addrspace(1) nocapture writeonly %_arg_sum, ptr nocapture readonly byval(%"class.sycl::_V1::range") %_arg_sum3, ptr addrspace(1) nocapture readonly %_arg_a, ptr nocapture readonly byval(%"class.sycl::_V1::range") %_arg_a6, ptr addrspace(1) nocapture readonly %_arg_b, ptr nocapture readonly byval(%"class.sycl::_V1::range") %_arg_b9) local_unnamed_addr #0 { +entry: + %0 = load i64, ptr %_arg_UserRange, align 8 + %1 = load i64, ptr %_arg_sum3, align 8 + %add.ptr.i = getelementptr inbounds i32, ptr addrspace(1) %_arg_sum, i64 %1 + %2 = load i64, ptr %_arg_a6, align 8 + %add.ptr.i36 = getelementptr inbounds i32, ptr addrspace(1) %_arg_a, i64 %2 + %3 = load i64, ptr %_arg_b9, align 8 + %add.ptr.i45 = getelementptr inbounds i32, ptr addrspace(1) %_arg_b, i64 %3 + %4 = tail call i64 @__mux_get_global_id(i32 0) #5 + %5 = tail call i64 @__mux_get_global_size(i32 0) #5 + %cmp6.not.i.not.i = icmp ult i64 %4, %0 + br i1 %cmp6.not.i.not.i, label %for.body.i, label %_ZNK4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ9VectorAddRNS0_5queueERKSt6vectorIiSaIiEESB_RS9_ENKUlRNS0_7handlerEE_clESE_EUlT_E_EclES4_.exit + +for.body.i: ; preds = %entry, %for.body.i + %Gen.sroa.0.0.i1 = phi i64 [ %add.i17.i, %for.body.i ], [ %4, %entry ] + %arrayidx.i.i = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i36, i64 %Gen.sroa.0.0.i1 + %6 = load i32, ptr addrspace(1) %arrayidx.i.i, align 4 + %arrayidx.i9.i = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i45, i64 %Gen.sroa.0.0.i1 + %7 = load i32, ptr addrspace(1) %arrayidx.i9.i, align 4 + %add.i.i = add nsw i32 %7, %6 + %arrayidx.i15.i = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i, i64 %Gen.sroa.0.0.i1 + store i32 %add.i.i, ptr addrspace(1) %arrayidx.i15.i, align 4 + %add.i17.i = add i64 %Gen.sroa.0.0.i1, %5 + %cmp6.i.i = icmp ult i64 %add.i17.i, %0 + br i1 %cmp6.i.i, label %for.body.i, label %_ZNK4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ9VectorAddRNS0_5queueERKSt6vectorIiSaIiEESB_RS9_ENKUlRNS0_7handlerEE_clESE_EUlT_E_EclES4_.exit + +_ZNK4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ9VectorAddRNS0_5queueERKSt6vectorIiSaIiEESB_RS9_ENKUlRNS0_7handlerEE_clESE_EUlT_E_EclES4_.exit: ; preds = %for.body.i, %entry + ret void +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) +declare void @llvm.assume(i1 noundef) #1 + +; Function Attrs: nofree nounwind memory(read, argmem: readwrite, inaccessiblemem: readwrite) +define spir_kernel void @_ZTSZZ9VectorAddRN4sycl3_V15queueERKSt6vectorIiSaIiEES7_RS5_ENKUlRNS0_7handlerEE_clESA_EUlT_E_(ptr addrspace(1) nocapture writeonly %_arg_sum, ptr nocapture readonly byval(%"class.sycl::_V1::range") %_arg_sum3, ptr addrspace(1) nocapture readonly %_arg_a, ptr nocapture readonly byval(%"class.sycl::_V1::range") %_arg_a6, ptr addrspace(1) nocapture readonly %_arg_b, ptr nocapture readonly byval(%"class.sycl::_V1::range") %_arg_b9) local_unnamed_addr #2 { +entry: + %0 = load i64, ptr %_arg_sum3, align 8 + %add.ptr.i = get### RVV instructions + +The RISC-V Vector extension (RVV) enables processor cores based on the RISC-V +instruction set architecture to process data arrays, alongside traditional +scalar operations to accelerate the computation of single instruction streams +on large data sets. + +By setting the following environment variables, you will enable the vectorizer +and the RISC-V vector support: + +- CA_RISCV_VF +- CA_RISCV_VLEN_BITS_MIN + +Here is how to run the simple-vector-add sample with vectorization enabled by +setting the previously shown environment variables: + +First, check you are in the right directory to run the sample: + +```sh + cd $RELEASE_DIR/RISCV-examples/bin +``` + +Then run the command: + +```sh + env CA_RISCV_VLEN_BITS_MIN=128 CA_RISCV_VF=4 ./simple-vector-add +``` + +The CA_RISCV_VLEN_BITS_MIN variable is used to set the numbers of bits in a +single vector register. This variable must be a power of two and must be no +greater than 4096. It defaults to 128. + +The CA_RISCV_VF variable is the vectorization factor that set the number of +elements for the width of the vector. + +To see the generated code, you can use the environment variables +to emit IR or assembly. The following sections [Emitting IR](#emitting-ir) and +[Emitting ASM](#emitting-asm) will show you how to it. + +Furthermore, you can also use the interactive debug mode to see at runtime the +executed instructions which in this case will show you some RVV instructions.elementptr inbounds i32, ptr addrspace(1) %_arg_sum, i64 %0 + %1 = load i64, ptr %_arg_a6, align 8 + %add.ptr.i34 = getelementptr inbounds i32, ptr addrspace(1) %_arg_a, i64 %1 + %2 = load i64, ptr %_arg_b9, align 8 + %add.ptr.i43 = getelementptr inbounds i32, ptr addrspace(1) %_arg_b, i64 %2 + %3 = tail call i64 @__mux_get_global_id(i32 0) #5 + %arrayidx.i = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i34, i64 %3 + %4 = load i32, ptr addrspace(1) %arrayidx.i, align 4 + %arrayidx.i49 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i43, i64 %3 + %5 = load i32, ptr addrspace(1) %arrayidx.i49, align 4 + %add.i = add nsw i32 %5, %4 + %cmp.i.i52 = icmp ult i64 %3, 2147483648 + tail call void @llvm.assume(i1 %cmp.i.i52) + %arrayidx.i55 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i, i64 %3 + store i32 %add.i, ptr addrspace(1) %arrayidx.i55, align 4 + ret void +} + +; Function Attrs: noinline nounwind optnone +define spir_func void @__itt_offload_wi_start_stub(ptr addrspace(4) %group_id, i64 %wi_id, i32 %wg_size) local_unnamed_addr #3 { +entry: + %group_id.addr = alloca ptr addrspace(4), align 8 + %wi_id.addr = alloca i64, align 8 + %wg_size.addr = alloca i32, align 4 + %group_id.addr.ascast = addrspacecast ptr %group_id.addr to ptr addrspace(4) + %wi_id.addr.ascast = addrspacecast ptr %wi_id.addr to ptr addrspace(4) + %wg_size.addr.ascast = addrspacecast ptr %wg_size.addr to ptr addrspace(4) + store ptr addrspace(4) %group_id, ptr addrspace(4) %group_id.addr.ascast, align 8 + store i64 %wi_id, ptr addrspace(4) %wi_id.addr.ascast, align 8 + store i32 %wg_size, ptr addrspace(4) %wg_size.addr.ascast, align 4 + ret void +} + +; Function Attrs: noinline nounwind optnone +define spir_func void @__itt_offload_wi_finish_stub(ptr addrspace(4) %group_id, i64 %wi_id) local_unnamed_addr #3 { +entry: + %group_id.addr = alloca ptr addrspace(4), align 8 + %wi_id.addr = alloca i64, align 8 + %group_id.addr.ascast = addrspacecast ptr %group_id.addr to ptr addrspace(4) + %wi_id.addr.ascast = addrspacecast ptr %wi_id.addr to ptr addrspace(4) + store ptr addrspace(4) %group_id, ptr addrspace(4) %group_id.addr.ascast, align 8 + store i64 %wi_id, ptr addrspace(4) %wi_id.addr.ascast, align 8 + ret void +} + +; Function Attrs: alwaysinline norecurse nounwind memory(read) +declare i64 @__mux_get_global_size(i32) #4 + +; Function Attrs: alwaysinline norecurse nounwind memory(read) +declare i64 @__mux_get_global_id(i32) #4 + +attributes #0 = { nofree norecurse nounwind memory(read, argmem: readwrite) "mux-kernel"="entry-point" "mux-orig-fn"="_ZTSN4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ9VectorAddRNS0_5queueERKSt6vectorIiSaIiEESB_RS9_ENKUlRNS0_7handlerEE_clESE_EUlT_E_EE" "vecz-mode"="auto" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) "vecz-mode"="auto" } +attributes #2 = { nofree nounwind memory(read, argmem: readwrite, inaccessiblemem: readwrite) "mux-kernel"="entry-point" "mux-orig-fn"="_ZTSZZ9VectorAddRN4sycl3_V15queueERKSt6vectorIiSaIiEES7_RS5_ENKUlRNS0_7handlerEE_clESA_EUlT_E_" "vecz-mode"="auto" } +attributes #3 = { noinline nounwind optnone "vecz-mode"="auto" } +attributes #4 = { alwaysinline norecurse nounwind memory(read) "vecz-mode"="auto" } +attributes #5 = { alwaysinline norecurse nounwind memory(read) } + +!llvm.ident = !{!0} +!opencl.kernels = !{!1, !8} +!opencl.ocl.version = !{!15} + +!0 = !{!"Source language: OpenCL C++, Version: 100000"} +!1 = !{ptr @_ZTSN4sycl3_V16detail18RoundedRangeKernelINS0_4itemILi1ELb1EEELi1EZZ9VectorAddRNS0_5queueERKSt6vectorIiSaIiEESB_RS9_ENKUlRNS0_7handlerEE_clESE_EUlT_E_EE, !2, !3, !4, !5, !6, !7} +!2 = !{!"kernel_arg_addr_space", i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0} +!3 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none", !"none"} +!4 = !{!"kernel_arg_type", !"class sycl::_V1::range*", !"uint*", !"class sycl::_V1::range*", !"uint*", !"class sycl::_V1::range*", !"uint*", !"class sycl::_V1::range*"} +!5 = !{!"kernel_arg_base_type", !"class sycl::_V1::range*", !"uint*", !"class sycl::_V1::range*", !"uint*", !"class sycl::_V1::range*", !"uint*", !"class sycl::_V1::range*"} +!6 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !"", !""} +!7 = !{!"kernel_arg_name", !"_arg_UserRange", !"_arg_sum", !"_arg_sum3", !"_arg_a", !"_arg_a6", !"_arg_b", !"_arg_b9"} +!8 = !{ptr @_ZTSZZ9VectorAddRN4sycl3_V15queueERKSt6vectorIiSaIiEES7_RS5_ENKUlRNS0_7handlerEE_clESA_EUlT_E_, !9, !10, !11, !12, !13, !14} +!9 = !{!"kernel_arg_addr_space", i32 1, i32 0, i32 1, i32 0, i32 1, i32 0} +!10 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"} +!11 = !{!"kernel_arg_type", !"uint*", !"class sycl::_V1::range*", !"uint*", !"class sycl::_V1::range*", !"uint*", !"class sycl::_V1::range*"} +!12 = !{!"kernel_arg_base_type", !"uint*", !"class sycl::_V1::range*", !"uint*", !"class sycl::_V1::range*", !"uint*", !"class sycl::_V1::range*"} +!13 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""} +!14 = !{!"kernel_arg_name", !"_arg_sum", !"_arg_sum3", !"_arg_a", !"_arg_a6", !"_arg_b", !"_arg_b9"} +!15 = !{i32 3, i32 0} +refsi_hal_device::program_find_kernel(name='_ZTSZZ9VectorAddRN4sycl3_V15queueERKSt6vectorIiSaIiEES7_RS5_ENKUlRNS0_7handlerEE_clESA_EUlT_E_.mux-kernel-wrapper') -> 0x000101b2 +refsi_hal_device::kernel_exec(kernel=0x000101b2, num_args=6, global=<10000:1:1>, local=<16:1:1>) +refsi_hal_device::mem_read(src=0xb7ff2a00, size=40000) +refsi_hal_device::mem_free(address=0xb7ff2a00) +refsi_hal_device::mem_free(address=0xb7ffc680) +refsi_hal_device::mem_free(address=0xb8006300) +Running on device: RefSi M1 +Vector size: 10000 +[0]: 0 + 0 = 0 +[1]: 1 + 1 = 2 +[2]: 2 + 2 = 4 +... +[9999]: 9999 + 9999 = 19998 +Vector add successfully completed on device. +``` + +### VGG16 and ResNet50 + +The `portDNN` directory contains two implementations of VGG16 and +ResNet50, one in C++ and one in Python. Example input image and its pre-processed +version are provided in `$RELEASE_DIR`. Also VGG16 and ResNet50 modles are pre-downloaded +in `$RELEASE_DIR/vgg_data` and `$RELEASE_DIR\resnet_data` respectively. + +#### Running VGG16 & ResNet50 +``` + # Testing on image for VGG16 + CA_HAL_DEBUG=1 CA_PROFILE_LEVEL=3 OCL_ICD_FILENAMES=$RELEASE_DIR/install/lib/libCL.so ONEAPI_DEVICE_SELECTOR=opencl:acc SYCL_CONFIG_FILE_NAME= $RELEASE_DIR/portDNN_build_dir/samples/networks/vgg/vgg vgg_data/ $RELEASE_DIR/Labrador_Retriever_Molly.jpg.bin + + # Testing on image for Resnet50 + CA_HAL_DEBUG=1 CA_PROFILE_LEVEL=3 OCL_ICD_FILENAMES=$RELEASE_DIR/install/lib/libCL.so ONEAPI_DEVICE_SELECTOR=opencl:acc SYCL_CONFIG_FILE_NAME= $RELEASE_DIR/portDNN_build_dir/samples/networks/resnet50/resnet50 resnet_data/ $(pwd)/Labrador_Retriever_Molly.jpg.bin +``` + +This may take a few minutes and the expected output should end like this: + +``` +refsi_hal_device::mem_free(address=0x72bc9a80) +refsi_hal_device::mem_free(address=0xb475ba80) +refsi_hal_device::mem_free(address=0x75209a80) +refsi_hal_device::mem_free(address=0xb475b280) +refsi_hal_device::mem_free(address=0x72b67a80) +refsi_hal_device::mem_free(address=0x72b05a80) +refsi_hal_device::mem_free(address=0x72aa3a80) +refsi_hal_device::mem_free(address=0x9bf5b280) +refsi_hal_device::mem_free(address=0x72a8b280) +refsi_hal_device::mem_free(address=0x9bf57280) +refsi_hal_device::mem_free(address=0x72a87280) +refsi_hal_device::mem_free(address=0x72a83280) +refsi_hal_device::mem_free(address=0x97f57280) +refsi_hal_device::mem_free(address=0x72a7f280) +refsi_hal_device::mem_free(address=0x97f53280) +refsi_hal_device::mem_free(address=0x72a7b280) +refsi_hal_device::mem_free(address=0x72a77280) +refsi_hal_device::mem_free(address=0x96fb3280) +refsi_hal_device::mem_free(address=0x72a73280) +refsi_hal_device::mem_free(address=0x96fb2280) +refsi_hal_device::mem_free(address=0x72a72280) +refsi_hal_device::mem_free(address=0x72a70200) +refsi_hal_device::mem_free(address=0x72a71200) +refsi_hal_device::mem_free(address=0x72a71280) +refsi_hal_device::mem_free(address=0xb7f7cf80) +1332923491164 ns +1333565282439 ns +1332118203098 ns +1331295811286 ns +1330008188846 ns +1332405637563 ns +1332031223909 ns +1332465310963 ns +[+] total retired instructions: 78097436433 +[+] total elapsed cycles: 78097436433 +[+] total direct memory write access: 579MB +[+] total direct memory read access: 3KB +``` + +### OCK-Examples + +The OCK examples/applications directory contains examples and benchmarks that are useful +for analyzing and demonstrating the performance of OCK on RISC-V. + +Like the previous subsections, this directory contains more than one executable. + +We will focus on ```clVectorAddition``` in this subsection. + +Here's how to run ```clVectorAddition```: + +```sh + cd $RELEASE_DIR/build_riscv_tests/bin + CA_HAL_DEBUG=1 CA_PROFILE_LEVEL=3 OCL_ICD_FILENAMES=$RELEASE_DIR/install/lib/libCL.so ONEAPI_DEVICE_SELECTOR=opencl:acc SYCL_CONFIG_FILE_NAME= ./clVectorAddition +``` + +This is the expected output you should have: + +``` +1: Test timeout computed to be: 10000000 +1: refsi_hal_device::mem_alloc(size=256, align=128) -> 0xb800fe80 +1: refsi_hal_device::mem_alloc(size=256, align=128) -> 0xb800fd80 +1: refsi_hal_device::mem_alloc(size=256, align=128) -> 0xb800fc80 +1: refsi_hal_device::mem_write(dst=0xb800fe80, size=256) +1: refsi_hal_device::mem_write(dst=0xb800fd80, size=256) +1: refsi_hal_device::program_find_kernel(name='vector_addition.mux-kernel-wrapper') -> 0x00010000 +1: refsi_hal_device::kernel_exec(kernel=0x00010000, num_args=3, global=<64:1:1>, local=<64:1:1>) +1: refsi_hal_device::mem_read(src=0xb800fc80, size=256) +1: refsi_hal_device::mem_free(address=0xb800fe80) +1: refsi_hal_device::mem_free(address=0xb800fd80) +1: refsi_hal_device::mem_free(address=0xb800fc80) +1: Available platforms are: +1: 1. ComputeAorta +1: +1: Selected platform 1 +1: +1: Running example on platform 1 +1: Available devices are: +1: 1. RefSi M1 +1: +1: Selected device 1 +1: +1: Running example on device 1 +1: * Created context +1: * Built program +1: * Created buffers +1: * Created kernel and set arguments +1: * Created command queue +1: * Enqueued writes to source buffers +1: * Enqueued NDRange kernel +1: * Enqueued read from destination buffer +1: * Result verified +1: * Released all created OpenCL objects +1: +1: Example ran successfully, exiting +1: [+] total retired instructions: 98 +1: [+] total elapsed cycles: 98 +1: [+] total direct memory write access: 1KB +1: [+] total direct memory read access: 256B +``` + +## Debugging, profiling and examining low level instructions + +The following subsections describe how to debug, profile and examine the low +level instructions emitted during program execution using the samples and +benchmarks we previously built. + +The following examples will be using the ```simple-vector-add``` kernel from +the RISCV-examples directory. + +```sh + cd $RELEASE_DIR/RISCV-examples/bin +``` + +### Profiler + +First, check you are in the right directory to run the sample: + +```sh + cd $RELEASE_DIR/RISCV-examples/bin +``` + +The profiler is a built-in feature that enables you to profile or track +performance of a kernel. It will display statistics gathered during the +execution of that kernel. + +There are several ways that you can enable the profiler and store those +information. + +Here's the variables that will be used: +- ```CA_PROFILE_LEVEL``` +- ```CA_PROFILE_CSV_PATH``` + +#### CA_PROFILE_LEVEL + +The profiler has four distinct levels to perform tracking. Here's how to set +this variable when running the ```simple-vector-add``` kernel: + +```sh + env CA_PROFILE_LEVEL= ./simple-vector-add +``` + +Level 0 means that no profiling will be done on the kernel. + +Level 1 enables the profiler and will output to standard error the +number of host read and host write as well as the total number of retired +instructions and cycles. + +Here's what it looks like when running the ```simple-vector-add``` kernel: + +```sh + env CA_PROFILE_LEVEL=1 ./simple-vector-add +``` + +``` +The results are correct! + +[+] total retired instructions: 156 +[+] total elapsed cycles: 156 +[+] total direct memory write access: 96B +[+] total direct memory read access: 96B +``` + +If this line is commented out a higher number of instructions is generated and +consequently the assembly is more complicated. + +Level 2 enables everything from level 1. It will also write to a file +more information formatted as CSV. The file path by default is +```/tmp/riscv.csv```. We will see later how we can modify this file path. + +Here's what the file ```/tmp/riscv.csv``` will contain after running the +```simple-vector-add``` kernel: + +```sh + env CA_PROFILE_LEVEL=2 ./simple-vector-add +``` + +``` +kernel_name,hart,retired_inst,cycles,host_write,host_read,,,,,16,, +,,,,16,, +,,,,16,, +_ZTS10SimpleVaddIiE,0,317,317,,, +_ZTS10SimpleVaddIiE,1,121,121,,, +,,,,,16, +,,,,16,, +,,,,16,, +,,,,16,, +_ZTS10SimpleVaddIfE,0,313,313,,, +_ZTS10SimpleVaddIfE,1,121,121,,, +,,,,,16, +``` + +Finally, level 3 enables everything from level 1 and level 2. +It will also write to the same file path formatted as CSV with more information +about the kernel execution. + +Here's what the file ```/tmp/riscv.csv``` will contain after running the +```simple-vector-add``` kernel: + +```sh + env CA_PROFILE_LEVEL=3 ./simple-vector-add +``` + +``` +kernel_name,hart,retired_inst,cycles,int_inst,float_inst,branches_inst,mem_read_inst,mem_read_bytes_inst,mem_read_short_inst,mem_read_word_inst,mem_read_double_inst,mem_read_quad_inst,mem_write_inst,mem_write_bytes_inst,mem_write_short_inst,mem_write_word_inst,mem_write_double_inst,mem_write_quad_inst,host_write,host_read, +,,,,,,,,,,,,,,,,,,,16,, +,,,,,,,,,,,,,,,,,,,16,, +,,,,,,,,,,,,,,,,,,,16,, +_ZTS10SimpleVaddIiE,0,317,317,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,, +_ZTS10SimpleVaddIiE,1,121,121,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,, +,,,,,,,,,,,,,,,,,,,,16, +,,,,,,,,,,,,,,,,,,,16,, +,,,,,,,,,,,,,,,,,,,16,, +,,,,,,,,,,,,,,,,,,,16,, +_ZTS10SimpleVaddIfE,0,313,313,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,, +_ZTS10SimpleVaddIfE,1,121,121,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,, +,,,,,,,,,,,,,,,,,,,,16, +``` + +#### CA_PROFILE_CSV_PATH + +We've previously seen that with the profiler enabled in level 2 and level 3, +the performance analysis information will be written to a default file path. +This file path can be modified by setting the ```CA_PROFILE_CSV_PATH``` + +Here's how to run the profiler on level 2 and write the gathered information +to your given file path: + +```sh + env CA_PROFILE_LEVEL=3 CA_PROFILE_CSV_PATH= ./simple-vector-add +``` \ No newline at end of file