diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/c_cpp_properties.json b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/c_cpp_properties.json new file mode 100644 index 0000000000..2b8e56ba4b --- /dev/null +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/c_cpp_properties.json @@ -0,0 +1,17 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**" + ], + "defines": [], + "compilerPath": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx", + "compilerArgs": [ "-fsycl" ], + "cStandard": "gnu17", + "cppStandard": "gnu++17", + "intelliSenseMode": "linux-gcc-x64" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/launch.json b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/launch.json new file mode 100644 index 0000000000..df305ab886 --- /dev/null +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/launch.json @@ -0,0 +1,35 @@ +{ + "configurations": [ + { + "name": "C/C++: Intel icpx build and debug gaussian_blur", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceFolder}/bin/${config:programName}_d", + "args": ["${workspaceFolder}/bin/sample_image.jpg"], + "stopAtEntry": true, + "cwd": "${fileDirname}", + "environment": [], + "externalConsole": false, + "MIMode": "gdb", + "setupCommands": [ + { + "description": "Enable pretty-printing for gdb", + "text": "-enable-pretty-printing", + "ignoreFailures": true + }, + { + "description": "Set Disassembly Flavor to Intel", + "text": "-gdb-set disassembly-flavor intel", + "ignoreFailures": true + }, + { + "description": "Needed by Intel oneAPI: Disable target async", + "text": "set target-async off", + "ignoreFailures": true + } + ], + "preLaunchTask": "gaussian_blur Debug C/C++: Intel icpx build active file", + "miDebuggerPath": "/opt/intel/oneapi/debugger/latest/gdb/intel64/bin/gdb-oneapi" + } + ] +} \ No newline at end of file diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/settings.json b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/settings.json new file mode 100644 index 0000000000..d7ab33f09f --- /dev/null +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "programName": "gaussian_blur" +} \ No newline at end of file diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/tasks.json b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/tasks.json new file mode 100644 index 0000000000..70aca5a0ce --- /dev/null +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/tasks.json @@ -0,0 +1,49 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "cppbuild", + "label": "gaussian_blur Debug C/C++: Intel icpx build active file", + "command": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx", + "args": [ + "-fsycl", + "-fdiagnostics-color=always", + "-fsycl-device-code-split=per_kernel", + "-fno-limit-debug-info", + "-g", + "-O0", + "${workspaceFolder}/src/${config:programName}.cpp", + "-o", + "${workspaceFolder}/bin/${config:programName}_d" + ], + "options": { + "cwd": "${workspaceFolder}" + }, + "problemMatcher": [ + "$gcc" + ], + "group": "build", + "detail": "compiler: /opt/intel/oneapi/compiler/latest/linux/bin/icpx" + }, + { + "type": "cppbuild", + "label": "gaussian_blur Release C/C++: Intel icpx build active file", + "command": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx", + "args": [ + "-fsycl", + "-DNDEBUG", + "${workspaceFolder}/src/${config:programName}.cpp", + "-o", + "${workspaceFolder}/bin/${config:programName}" + ], + "options": { + "cwd": "${workspaceFolder}" + }, + "problemMatcher": [ + "$gcc" + ], + "group": "build", + "detail": "compiler: /opt/intel/oneapi/compiler/latest/linux/bin/icpx" + } + ] +} \ No newline at end of file diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/CMakeLists.txt b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/CMakeLists.txt new file mode 100644 index 0000000000..9a59e896ed --- /dev/null +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/CMakeLists.txt @@ -0,0 +1,16 @@ +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp-cl") + set(CMAKE_C_COMPILER "dpcpp-cl") +else() + set(CMAKE_CXX_COMPILER "icpx") +endif() +set(CMAKE_CXX_STANDARD 17) +if(NOT DEFINED ${CMAKE_BUILD_TYPE}) + set(CMAKE_BUILD_TYPE "RELEASE") +endif() +if( CMAKE_BUILD_TYPE STREQUAL "DEBUG" ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") +endif() +cmake_minimum_required (VERSION 3.4) +project (gaussian_blur) +add_subdirectory (src) diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/Images/sample_image-blurred.png b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/Images/sample_image-blurred.png new file mode 100644 index 0000000000..6decbd0b6c Binary files /dev/null and b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/Images/sample_image-blurred.png differ diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/Images/sample_image.jpg b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/Images/sample_image.jpg new file mode 100644 index 0000000000..be810ad43e Binary files /dev/null and b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/Images/sample_image.jpg differ diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/LICENSE.txt b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/LICENSE.txt new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/README.md b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/README.md new file mode 100644 index 0000000000..06f7782232 --- /dev/null +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/README.md @@ -0,0 +1,101 @@ +# Image Gaussian Blur example program + +## Purpose +This SYCL code example implements a Gaussian blur filter, blurring +either a JPG or PNG image from the command line. The original file is not modified. +The output file is in a PNG format. + +__Output Image:__ + +![Gaussian blur input](images/sample_image.jpg)
+![Gaussian blur output](images/sample_image-blurred.png) + +## Prerequisites + +| Minimum Requirements | Description +|:--- |:--- +| OS | Linux* Ubuntu* 20.04.5 LTS +| Hardware | Intel® 11th Gen Intel Core i7-1185G7 + Mesa Intel Xe Graphics +| Compiler Toolchain | Visual Studio Code IDE, Intel oneAPI Base Toolkit (inc its prerequisite) +| Libraries | Install Intel oneAPI Base Toolkit +| Tools | Visual Studio Code 1.73.1, VSCode Microsoft C/C++ extns, a .png capable image viewer + +## Build and Run using Visual Code Studio + +### Linux* + +Within a terminal window change directory to this project's folder. At the +terminal prompt type: + +``` +cd ImageGuassianBlur +code . +``` + +Visual Studio Code will open this project displaying its files in the Explorer +pane. +The project is already set up with build configurations to build either a +debug build or a release build of the program. When a program is built, it is +placed in the bin directory of this project's top folder. + +To build the program hit Ctrl+Shift+b and choose the type of program to build. +The debug executable will have a '_d' appended to its name. + +To blur an image, copy the images/sample_image.jpg to the bin directory. +To execute the program, type in the Visual Studio Code terminal window: +``` +cd bin +./gaussian_blur_d sample_image.jpg +``` +A new image file will appear in the bin directory 'sample_image-blurred.png'. +To view the image, select it in the directory folder app and hit return. +Ubuntu will display the image using the preview app. + +## Build and Run using CMake +### Linux* +``` +mkdir build +cd build +cmake .. +make +``` + +To blur an image, copy the images/sample_image.jpg to the directory of the new +executable. Type in the terminal window: + +``` +cd build/src +./gaussian_blur sample_image.jpg +``` +Open the resulting file: `sample_image-blurred.png` with an image viewer. + +## Debug the program using Visual Studio Code + +### Linux* + +Due to an issue with the image load library function stbi_load, make the +directory bin (if it does not exist already) and copy the sample_image.jpg +file into it. This will allow the program to find the file and continue the +debug session. + +To debug the program, either choose from the IDE's run menu +'Start debugging' or hit F5 on the keyboard. +The debug launch.json configuration file defines the debug session to: +* To halt the program at the first line of code after main(). +Use the GUI debug panel's buttons to step over code (key F10) lines to see the +program advance. +Breakpoints can be set either in the main code or the kernel code. + +Note: Setting breakpoints in the kernel code does not present the normal + step through code behavior. Instead a breakpoint event is occurring + on each thread being executed and so switches to the context of + that thread. To step through the code of a single thread, use the + Intel gdb-oneapi command 'set scheduler-locking step' or 'on' in the + IDE's debug console prompt. As this is not the main thread, be sure + to revert this setting on returning to debug any host side code. + Use the command 'set scheduler-locking replay' or 'off'. + +## License + +Code samples are licensed under the Apache 2.0 license. See +[LICENSE.txt](LICENSE.txt) for details. diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/src/CMakeLists.txt b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/src/CMakeLists.txt new file mode 100644 index 0000000000..53d0c0958a --- /dev/null +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/src/CMakeLists.txt @@ -0,0 +1,6 @@ +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fsycl") +set(CMAKE_BUILD_TYPE "RelWithDebInfo") +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") +add_executable (gaussian_blur gaussian_blur.cpp) +target_link_libraries(gaussian_blur OpenCL sycl) + diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/src/gaussian_blur.cpp b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/src/gaussian_blur.cpp new file mode 100644 index 0000000000..8fd35018dd --- /dev/null +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/src/gaussian_blur.cpp @@ -0,0 +1,325 @@ +//============================================================================ +// Copyright © 2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// =========================================================================== + +//**************************************************************************** +// +// Description: +// This advanced SYCL code example implements a Gaussian blur filter, blurring +// a JPG or PNG image from the command line. The original file is not modified. +// The output file is a PNG image. +// +// Usage: +// The program blurs an image provided on the command line. +// +//***************************************************************************** + +// SYCL or oneAPI toolkit headers: +#include + +// Third party headers: +#include +#include +// These public domain headers implement useful image reading and writing +// functions. Find in ${oneAPI}/dev-utilities/include +#define STB_IMAGE_IMPLEMENTATION +#include "stb/stb_image.h" +#define STB_IMAGE_WRITE_IMPLEMENTATION +#include "stb/stb_image_write.h" + +// Forward declaration of this example's SYCL kernels +class KernelFillGaussian; +class KernelGaussian; + +using namespace sycl; +using namespace std; + +// Attempts to determine a good local size. The best way to *control* +// performance is to choose the sizes. The method here is to choose the +// largest number, leq 64, which is a power-of-two, and divides the global +// work size evenly. In this code, it might prove most optimal to pad the +// image along one dimension so that the local size could be 64, but this +// introduces other complexities. +range< 2 > GetOptimalLocalRange( range< 2 > globalSize, device hw ) +{ + range< 2 > optimalLocalSize{ 0, 0 }; + + // 64 is a good local size on GPU-like devices, as each compute unit is + // made of many smaller processors. On non-GPU devices, 4 is a common vector + // width. + if( hw.is_gpu() ) + { + optimalLocalSize = range< 2 >( 64, 1 ); + } + else + { + optimalLocalSize = range< 2 >( 4, 1 ); + } + + // Here, for each dimension, we make sure that it divides the global size + // evenly. If it doesn't, we try the next lowest power of two. Eventually + // it will reach one, if the global size has no power of two component. + for( int i = 0; i < 2; ++i ) + { + while( globalSize[ i ] % optimalLocalSize[ i ] != 0 ) + { + optimalLocalSize[ i ] = optimalLocalSize[ i ] >> 1; + } + } + + return optimalLocalSize; +} + +// Asynchronous errors hander, catch faults in asynchronously executed code +// inside a command group or a kernel. They can occur in a different stackframe, +// asynchronous error cannot be propagated up the stack. +// By default, they are considered 'lost'. The way in which we can retrieve them +// is by providing an error handler function. +auto exception_handler = []( sycl::exception_list exceptions ) +{ + for( std::exception_ptr const &e : exceptions ) + { + try + { + std::rethrow_exception( e ); + } + catch( sycl::exception const &e ) + { + std::cout << + "Queue handler caught asynchronous SYCL exception:\n" << + e.what() << std::endl; + } + } +}; + +// The Gaussian program +int main( int argc, char* argv[] ) +{ + bool bProgramError = false; + + // Validate user input + if( argc < 2 ) + { + std::cout + << "Please provide a JPEG or PNG image as an argument to this program." + << std::endl; + } + + // ******************** + // Input image handling + // ******************** + // The image dimensions will be set by the library, as will the number of + // channels. However, passing a number of channels will force the image + // data to be returned in that format, regardless of what the original image + // looked like. The header has a mapping from int values to types - 4 means + // RGBA. + int inputWidth = 0; + int inputHeight = 0; + int inputChannels = 0; + + // Number of color channels RGBA// Project files: + const int numChannels = 4; + const char *pImageFileName = argv[ 1 ]; + unique_ptr< unsigned char [] > pInputImg( stbi_load( pImageFileName, + &inputWidth, &inputHeight, &inputChannels, numChannels ) ); + if( pInputImg == nullptr ) + { + bProgramError = true; + std::cout << "Failed to load image file (is argv[1] a valid image file?)" + << std::endl; + exit(-1); + } + + // RAII resource + unique_ptr< unsigned char [] > pOutputImg( + new unsigned char[ inputWidth * inputHeight * numChannels ] ); + + try + { + sycl::device hw = device( sycl::cpu_selector_v ); + queue myQueue( hw, exception_handler ); + + // ******************************************* + // Create gaussian convolution matrix and fill + // ******************************************* + const float pi = std::atan( 1 ) * 4; + constexpr auto guasStdDev = 2; + constexpr auto guasDelta = 6; + const int guasMatrixRange = (guasDelta * guasStdDev); + const float guasStdDevFactor = 2 * guasStdDev * guasStdDev; + const float piFactor = guasStdDevFactor * pi; + const int gaussianBlurRange = guasMatrixRange * guasMatrixRange; + vector< float > gaussianBlurMatrix( gaussianBlurRange ); + + // The nd_range contains the total work (as mentioned previously) as + // well as the local work size (i.e. the number of threads in the local + // group). Here, we attempt to find a range close to the device's + // preferred size that also divides the global size neatly. + auto optRange = GetOptimalLocalRange( + range< 2 >{ guasMatrixRange, guasMatrixRange }, myQueue.get_device() ); + const nd_range< 2 > gaussianBlurNDRange( + range< 2 >{ guasMatrixRange, guasMatrixRange }, optRange ); + buffer bufGaussian( gaussianBlurMatrix ); + + // Enqueue KernelFillGaussian + myQueue.submit( [&]( handler &cgh ) + { + const auto ptrGBlur = + bufGaussian.get_access< access::mode::discard_write >( cgh ); + cgh.parallel_for< KernelFillGaussian >( gaussianBlurNDRange, + [=]( nd_item< 2 > item ) + { + // Get the 2D x and y indicies + const auto idX = item.get_global_id( 0 ); + const auto idY = item.get_global_id( 1 ); + const auto width = item.get_group_range( 0 ) * + item.get_local_range( 0 ); + const auto index = idX * width + idY; + const auto x = idX - guasDelta; + const auto y = idY - guasDelta; + float gausVallue = sycl::exp( -1.0f * (x*x + y*y) / guasStdDevFactor ); + gausVallue /= piFactor; + ptrGBlur[ index ] = gausVallue; + }); + }); + + // ******************************************************** + // Using gaussian convolution matrix, blur the input image. + // ******************************************************** + + // Images need a void * pointing to the data, and enums describing the + // type of the image (since a void * carries no type information). It + // also needs a range which describes the image's dimensions. + using co = sycl::image_channel_order; + using ct = sycl::image_channel_type; + // The image data has been returned us an unsigned char [], but due to + // OpenCL restrictions, we must use it as a void *. + void *pInputData = (void *) pInputImg.get(); + void *pOutputData = (void *) pOutputImg.get(); + // This range represents the full amount of work to be done across the + // image. We dispatch one thread per pixel. + range< 2 > imgRange( inputWidth, inputHeight ); + image< 2 > imageIn( pInputData, co::rgba, ct::unorm_int8, imgRange ); + image< 2 > imageOut( pOutputData, co::rgba, ct::unorm_int8, imgRange ); + optRange = GetOptimalLocalRange( imgRange, myQueue.get_device() ); + auto myRange = nd_range< 2 >( imgRange, optRange ); + constexpr auto offset = guasDelta; + + // Enqueue KernelGaussian + // Because of the dependency on the gaussian convolution grid, the call + // graph will automatically schedule this kernel to run after the + // KernelFillGaussian is complete. + myQueue.submit( [&]( handler &cgh ) + { + // Images still require accessors, like buffers, except the target is + // always access::target::image. + accessor< float4, 2, access::mode::read, access::target::image > + accImgInPtr( imageIn, cgh ); + accessor< float4, 2, access::mode::discard_write, access::target::image > + accImgOutPtr( imageOut, cgh ); + const auto ptrGBlur = + bufGaussian.get_access< access::mode::read >( cgh ); + + // The sampler is used to map user-provided co-ordinates to pixels in + // the image. + sampler smpl( coordinate_normalization_mode::unnormalized, + addressing_mode::none, filtering_mode::nearest ); + + // Setting breakpoints in the kernel code does not present the normal + // step through code behavior. Instead a breakpoint event is occurring + // on each thread being executed and so switches to the context of + // that thread. To step through the code of a single thread, use the + // Intel gdb-oneapi command 'set scheduler-locking step' or 'on' in the + // IDE's debug console prompt. As this is not the main thread, be sure + // to revert this setting on returning to debug any host side code. + // Use the command 'set scheduler-locking replay' or 'off'. + cgh.parallel_for< KernelGaussian >( myRange, [=](nd_item< 2 > item) + { + const auto idY = item.get_global_id( 1 ); + const auto idX = item.get_global_id( 0 ); + const auto outputCoords = int2( idX, idY ); + // A boundary is used so the convolution grid does not fall off the + // sides of the image. Keep it simple, just copy those pixels at the + // edges of the image. + const int hitY1 = idY - offset; + const int hitY2 = inputHeight - idY - offset; + const int hitX1 = idX - offset; + const int hitX2 = inputWidth - idX - offset; + const bool bBoundryY = (hitY1 < 0) || (hitY2 < 0); + const bool bBoundryX = (hitX1 < 0) || (hitX2 < 0); + float4 newPixel = float4( 0.0f, 0.0f, 0.0f, 0.0f ); + + if( !(bBoundryX || bBoundryY) ) + { + // Perform a convolution on a central pixel at idX idY + for( int x = 0; x < guasMatrixRange; x++ ) + { + for( int y = 0; y < guasMatrixRange; y++ ) + { + const auto index = x * guasMatrixRange + y; + const float value = ptrGBlur[ index ]; + const auto inputCoords = + int2( idX + x - offset, idY + y - offset ); + newPixel += accImgInPtr.read( inputCoords, smpl ) * value; + } + } + } + else + { + // Just duplicate the pixel at idX idY + const auto inputCoords = int2( idX, idY ); + newPixel = accImgInPtr.read( inputCoords, smpl ); + } + newPixel.w() = 1.0f; + accImgOutPtr.write( outputCoords, newPixel ); + + }); + }); + // The host/main thread is asked to wait here until all enqueued kernels + // have completed execution. + myQueue.wait_and_throw(); + } + // Synchronous errors are classical C++ exceptions + catch( sycl::exception const &e ) + { + + bProgramError = true; + cout << + "Wrap catch caught synchronous SYCL exception:\n" << e.what() << std::endl; + } + + if( bProgramError ) + { + std::cout << "Program failed." << std::endl; + return -1; + } + + // **************************** + // Output the new blurred image + // **************************** + + // Attempt to change the name from x.png or x.jpg to x-blurred.png. + // If the code cannot find a '.', it simply appends "-blurred" to the name. + std::string outputFilePath; + std::string inputName( argv[ 1 ] ); + auto pos = inputName.find_last_of( "." ); + if( pos == std::string::npos ) + { + outputFilePath = inputName + "-blurred"; + } + else + { + inputName.erase( pos, inputName.size() ); + outputFilePath = inputName + "-blurred" + ".png"; + } + + stbi_write_png( outputFilePath.c_str(), inputWidth, inputHeight, numChannels, + pOutputImg.get(), 0 ); + + std::cout << + "Program success, the image is successfully blurred!" << std::endl; + + return 0; +} \ No newline at end of file diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/c_cpp_properties.json b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/c_cpp_properties.json new file mode 100644 index 0000000000..2b8e56ba4b --- /dev/null +++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/c_cpp_properties.json @@ -0,0 +1,17 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**" + ], + "defines": [], + "compilerPath": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx", + "compilerArgs": [ "-fsycl" ], + "cStandard": "gnu17", + "cppStandard": "gnu++17", + "intelliSenseMode": "linux-gcc-x64" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/launch.json b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/launch.json new file mode 100644 index 0000000000..6d3bc25b7d --- /dev/null +++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/launch.json @@ -0,0 +1,48 @@ +{ + "configurations": [ + { + "name": "C/C++: Intel icpx build and debug MonteCarloPi", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceFolder}/bin/${config:programName}_d", + "args": ["${input:args}"], + "stopAtEntry": true, + "cwd": "${fileDirname}", + "environment": [], + "externalConsole": false, + "MIMode": "gdb", + "setupCommands": [ + { + "description": "Enable pretty-printing for gdb", + "text": "-enable-pretty-printing", + "ignoreFailures": true + }, + { + "description": "Set Disassembly Flavor to Intel", + "text": "-gdb-set disassembly-flavor intel", + "ignoreFailures": true + }, + { + "description": "Needed by Intel oneAPI: Disable target async", + "text": "set target-async off", + "ignoreFailures": true + } + ], + "preLaunchTask": "MonteCarloPi Debug C/C++: Intel icpx build active file", + "miDebuggerPath": "/opt/intel/oneapi/debugger/latest/gdb/intel64/bin/gdb-oneapi" + } + ], + "inputs" : [ + { + "id": "args", + "type": "pickString", + "description": "Program args", + "default": "cpu", + "options": [ + "cpu", + "gpu", + "accelerator" + ] + } + ] +} \ No newline at end of file diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/settings.json b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/settings.json new file mode 100644 index 0000000000..a2e868b496 --- /dev/null +++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "programName": "MonteCarloPi" +} \ No newline at end of file diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/tasks.json b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/tasks.json new file mode 100644 index 0000000000..4383e21898 --- /dev/null +++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/tasks.json @@ -0,0 +1,48 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "cppbuild", + "label": "MonteCarloPi Debug C/C++: Intel icpx build active file", + "command": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx", + "args": [ + "-fsycl", + "-fdiagnostics-color=always", + "-fno-limit-debug-info", + "-g", + "-O0", + "${workspaceFolder}/src/${config:programName}.cpp", + "-o", + "${workspaceFolder}/bin/${config:programName}_d" + ], + "options": { + "cwd": "${workspaceFolder}" + }, + "problemMatcher": [ + "$gcc" + ], + "group": "build", + "detail": "compiler: /opt/intel/oneapi/compiler/latest/linux/bin/icpx" + }, + { + "type": "cppbuild", + "label": "MonteCarloPi Release C/C++: Intel icpx build active file", + "command": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx", + "args": [ + "-fsycl", + "-DNDEBUG", + "${workspaceFolder}/src/${config:programName}.cpp", + "-o", + "${workspaceFolder}/bin/${config:programName}" + ], + "options": { + "cwd": "${workspaceFolder}" + }, + "problemMatcher": [ + "$gcc" + ], + "group": "build", + "detail": "compiler: /opt/intel/oneapi/compiler/latest/linux/bin/icpx" + } + ] +} \ No newline at end of file diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/CMakeLists.txt b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/CMakeLists.txt new file mode 100644 index 0000000000..482e1862b1 --- /dev/null +++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/CMakeLists.txt @@ -0,0 +1,16 @@ +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp-cl") + set(CMAKE_C_COMPILER "dpcpp-cl") +else() + set(CMAKE_CXX_COMPILER "icpx") +endif() +set(CMAKE_CXX_STANDARD 17) +if(NOT DEFINED ${CMAKE_BUILD_TYPE}) + set(CMAKE_BUILD_TYPE "RELEASE") +endif() +if( CMAKE_BUILD_TYPE STREQUAL "DEBUG" ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") +endif() +cmake_minimum_required (VERSION 3.4) +project (MonteCarloPi) +add_subdirectory (src) diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/LICENSE.txt b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/LICENSE.txt new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/README.md b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/README.md new file mode 100644 index 0000000000..9b5ac8f04b --- /dev/null +++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/README.md @@ -0,0 +1,85 @@ +# Monte Carlo Pi example program + +## Purpose +Monte Carlo Simulation is a broad category of computation that utilizes +statistical analysis to reach a result. This `Monte Carlo Pi` sample uses the +Monte Carlo Procedure to estimate the value of pi. + +## Prerequisites + +| Minimum Requirements | Description +|:--- |:--- +| OS | Linux* Ubuntu* 20.04.5 LTS +| Hardware | Intel® 11th Gen Intel Core i7-1185G7 + Mesa Intel Xe Graphics +| Compiler Toolchain | Visual Studio Code IDE, Intel oneAPI Base Toolkit (inc its prerequisite) +| Libraries | Install Intel oneAPI Base Toolkit +| Tools | Visual Studio Code 1.73.1, VSCode Microsoft C/C++ extns + +## Build and Run using Visual Code Studio + +### Linux* + +Within a terminal window change directory to this project's folder. At the +terminal prompt type: + +``` +cd MonteCarloPi_v2 +code . +``` + +Visual Studio Code will open this project displaying its files in the Explorer +pane. +The project is already set up with build configurations to build either a +debug build or a release build of the program. When a program is built, it is +placed in the bin directory of this project's top folder. + +To build the program hit Ctrl+Shift+b and choose the type of program to build. +The debug executable will have a '_d' appended to its name. + +To execute the program, type in the Visual Studio Code terminal window: +``` +cd bin +./MonteCarloPi_d cpu +``` + +## Build and Run using CMake +### Linux* +``` +mkdir build +cd build +cmake .. +make +``` + +To execute the program type in the terminal window: + +``` +cd build/src +./MonteCarloPi cpu +``` + +## Debug the program using Visual Studio Code + +### Linux* + +To debug the program, either choose from the IDE's run menu +'Start debugging' or hit F5 on the keyboard. +The debug launch.json configuration file defines the debug session to: +* To halt the program at the first line of code after main(). +Use the GUI debug panel's buttons to step over code (key F10) lines to see the +program advance. +Breakpoints can be set either in the main code or the kernel code. + +Note: Setting breakpoints in the kernel code does not present the normal + step through code behavior. Instead a breakpoint event is occurring + on each thread being executed and so switches to the context of + that thread. To step through the code of a single thread, use the + Intel gdb-oneapi command 'set scheduler-locking step' or 'on' in the + IDE's debug console prompt. As this is not the main thread, be sure + to revert this setting on returning to debug any host side code. + Use the command 'set scheduler-locking replay' or 'off'. + +## License + +Code samples are licensed under the Apache 2.0 license. See +[LICENSE.txt](LICENSE.txt) for details. diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/CMakeLists.txt b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/CMakeLists.txt new file mode 100644 index 0000000000..a40a39873d --- /dev/null +++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/CMakeLists.txt @@ -0,0 +1,6 @@ +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fsycl") +set(CMAKE_BUILD_TYPE "RelWithDebInfo") +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") +add_executable (MonteCarloPi MonteCarloPi.cpp) +target_link_libraries(MonteCarloPi OpenCL sycl) + diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/MonteCarloPi.cpp b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/MonteCarloPi.cpp new file mode 100644 index 0000000000..e8553405d3 --- /dev/null +++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/MonteCarloPi.cpp @@ -0,0 +1,338 @@ +//============================================================================ +// Copyright © 2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// =========================================================================== + +//**************************************************************************** +// +// Description: +// Example of Monte-Carlo Pi approximation algorithm in SYCL. Also, +// demonstrating how to query the maximum number of work-items in a +// work-group to check if a kernel can be executed with the initially +// desired work-group size. +// +// Usage: +// The program takes one argument: host / cpu / gpu / accelerator. +// +//***************************************************************************** + +// SYCL or oneAPI toolkit headers: +#include + +// Third party headers: +#include +#include +#include +#include +#include + +// In-house headers: +#include "device_selector.hpp" + +using namespace std; +using namespace sycl; + +// Forward declerations: +size_t GetBestWorkGroupSize( size_t work_group_size, + const sycl::device &device, + const sycl::kernel &kernel ); + +// Monte-Carlo Pi SYCL C++ functor +class CMonteCarloPiKernel +{ + template< typename dataT > + using readGlobalAccessor = sycl::accessor< + dataT, 1, + sycl::access::mode::read, + sycl::access::target::global_buffer >; + template < typename dataT > + using writeGlobalAccessor = sycl::accessor< + dataT, 1, + sycl::access::mode::write, + sycl::access::target::global_buffer >; + template< typename dataT > + using readWriteLocalAccessor = sycl::accessor< + dataT, 1, + sycl::access::mode::read_write, + sycl::access::target::local >; + public: + CMonteCarloPiKernel( readGlobalAccessor< sycl::cl_float2 > ptrPoints, + writeGlobalAccessor< sycl::cl_int > ptrResults, + readWriteLocalAccessor< sycl::cl_int > ptrResultsLocal ) + : m_ptrPoints( ptrPoints ), + m_ptrResults( ptrResults ), + m_ptrResultsLocal( ptrResultsLocal ) + {} + + // Functor kernel using a 1D ND-range of work items + void operator()( sycl::nd_item< 1 > item ) const + { + // Setting breakpoints in the kernel code does not present the normal + // step through code behavior. Instead a breakpoint event is occurring + // on each thread being executed and so switches to the context of + // that thread. To step through the code of a single thread, use the + // gdb-oneapi command 'set scheduler-locking step' or 'on' in the + // IDE's debug console prompt. As this is not the main thread, be sure + // to revert this setting on returning to debug any host side code. + // Use the command 'set scheduler-locking replay' or 'off'. + + const size_t idGlobal = item.get_global_id( 0 ); + const size_t idLocal = item.get_local_id( 0 ); + const size_t localDim = item.get_local_range( 0 ); + const size_t idGroup = item.get_group( 0 ); + + // Get the point to work on + const sycl::float2 point = m_ptrPoints[ idGlobal ]; + + // Calculate the length - built-in SYCL function + // length: sqrt(point.x * point.x + point.y * point.y) + const float len = sycl::length( point ); + + // Result is either 1 or 0 + m_ptrResultsLocal[ idLocal ] = (len <= 1.0f) ? 1 : 0; + + // Wait for the entire work group to get here. + item.barrier( sycl::access::fence_space::local_space ); + + // If work item 0 in work group, sum local values + if( idLocal == 0 ) + { + int sum = 0; + for( size_t i = 0; i < localDim; i++ ) + { + if( m_ptrResultsLocal[ i ] == 1 ) + { + ++sum; + } + } + + // Store the sum in global memory + m_ptrResults[ idGroup ] = sum; + } + } + + private: + readGlobalAccessor< sycl::cl_float2 > m_ptrPoints; + writeGlobalAccessor< sycl::cl_int > m_ptrResults; + readWriteLocalAccessor< sycl::cl_int > m_ptrResultsLocal; +}; + + +// Asynchronous errors hander, catch faults in asynchronously executed code +// inside a command group or a kernel. They can occur in a different stackframe, +// asynchronous error cannot be propagated up the stack. +// By default, they are considered 'lost'. The way in which we can retrieve them +// is by providing an error handler function. +auto exception_handler = []( sycl::exception_list exceptions ) +{ + for( std::exception_ptr const &e : exceptions ) + { + try + { + std::rethrow_exception( e ); + } + catch( sycl::exception const &e ) + { + std::cout << "Queue handler caught asynchronous SYCL exception:\n" + << e.what() << std::endl; + } + } +}; + +// The Monto Carlo Pi program +int main( int argc, char *argv[] ) +{ + CUtilDeviceTargets utilsDev; + FnResult fnResult = utilsDev.DiscoverDevsWeWant(); + if( !fnResult.bSuccess ) + { + cerr << "Program failure: Unable to discover target devices on this platform.\n"; + exit( -1 ); + } + + fnResult = UserCheckTheirInput( utilsDev, argc, argv ); + if( !fnResult.bSuccess ) + { + cerr << fnResult.strErrMsg << "\n"; + exit( 1 ); + } + + bool bDoDevDiscovery = false; + fnResult = UserWantsToDiscoverPossibleTargets( argv, bDoDevDiscovery ); + if( !fnResult.bSuccess ) + { + cerr << fnResult.strErrMsg << "\n"; + exit( -1 ); + } + if( bDoDevDiscovery ) exit( 1 ); + + const SDeviceFoundProxy *pUsersChosenDevice = utilsDev.GetDevUsersFirstChoice(); + if( pUsersChosenDevice == nullptr ) + { + cerr << "Program failure: Did not create a valid target device object.\n"; + exit( -1 ); + } + + constexpr size_t iterations = 1 << 20; + size_t workGroupSize = 1 << 10; + + // Container for the sum calculated per each work-group. + std::vector< sycl::cl_int > arrayResults; + + // Generate random points on the host - one point for each work item (thread) + std::vector< sycl::float2 > arrayPoints( iterations ); + // Fill up with (pseudo) random values in the range: [0, 1] + std::random_device r; + std::default_random_engine e( r() ); + std::uniform_real_distribution< float > dist; + std::generate( arrayPoints.begin(), arrayPoints.end(), + [&r, &e, &dist]() + { + return sycl::float2( dist( e ), dist( e ) ); + }); + + try + { + // Create a SYCL queue + queue queue( pUsersChosenDevice->theDevice, exception_handler ); + + string strTheDeviceBeingUsed; + fnResult = CUtilDeviceTargets::GetQueuesCurrentDevice( queue, strTheDeviceBeingUsed ); + if( !fnResult.bSuccess ) + { + cerr << fnResult.strErrMsg << "\n"; + exit( -1 ); + } + cout << strTheDeviceBeingUsed << "\n"; + + // Get device and display information: name and platform + const sycl::device hw = queue.get_device(); + cout << "Selected " << hw.get_info< sycl::info::device::name >() + << " on platform " + << hw.get_info< sycl::info::device::platform >() + .get_info< sycl::info::platform::name >() + << std::endl; + + // Force online compilation of all kernels in the hwCntext now, + // unless already compiled for the device ahead-of-time. + const auto hwContext = queue.get_context(); + const sycl::kernel_id kernelID = + sycl::get_kernel_id< CMonteCarloPiKernel >(); + const auto hwKernelBundle = + sycl::get_kernel_bundle< sycl::bundle_state::executable >( hwContext ); + const sycl::kernel kernel = hwKernelBundle.get_kernel( kernelID ); + + // If the desired work-group size doesn't satisfy the device, define a + // perfect/max work-group depending on the selected device and kernel + // maximum size allowance. + workGroupSize = GetBestWorkGroupSize( workGroupSize, hw, kernel ); + + // Size of the total sums that are going to be stored in the results vector + // is set based on the defined work-group size. + arrayResults.resize( iterations / workGroupSize ); + + // Allocate device memory + sycl::buffer< sycl::cl_float2 > buffPoints( arrayPoints.data(), + sycl::range<1>( iterations ) ); + sycl::buffer< sycl::cl_int > buffResults( arrayResults.data(), + sycl::range< 1 >( iterations / workGroupSize ) ); + + queue.submit( [&](sycl::handler& cgh) + { + const size_t global_size = iterations; + const size_t local_size = workGroupSize; + + // Get access to the data (points and results) on the device + const auto ptrPoints = + buffPoints.get_access( cgh ); + const auto ptrResults = + buffResults.get_access< sycl::access::mode::write >( cgh ); + + // Allocate local memory on the device (to compute results) + const sycl::accessor< sycl::cl_int, 1, sycl::access::mode::read_write, + sycl::access::target::local > + ptrResultsLocal( sycl::range< 1 >( local_size ), cgh ); + + // Run the kernel + cgh.parallel_for( + sycl::nd_range< 1 >( sycl::range< 1 >( global_size ), + sycl::range< 1 >( local_size ) ), + CMonteCarloPiKernel( ptrPoints, ptrResults, ptrResultsLocal ) ); + }); + } + catch( const sycl::exception &e ) + { + std::cerr << "SYCL exception caught: " << e.what() << std::endl; + return 1; + } + catch( const std::exception &e ) + { + std::cerr << "C++ exception caught: " << e.what() << std::endl; + return 2; + } + + // Sum the results (auto copied back to host) + int inCircle = 0; + for( int &result : arrayResults ) + { + inCircle += result; + } + + // Calculate the final result of "pi" + float pi = (4.0f * inCircle) / iterations; + std::cout << "pi = " << pi << std::endl; + + return 0; +} + + +// A helper to define a "perfect" work-group size dependant on selected device +// and kernel maximum allowance. +size_t GetBestWorkGroupSize( const size_t workGroupSize, + const sycl::device &device, + const sycl::kernel &kernel ) +{ + if( device.is_cpu() ) + { + const size_t maxDeviceWorkGroupSize = + device.get_info< sycl::info::device::max_work_group_size >(); + + // Check if the desired work-group size will be allowed on the host device + // and query the maximum possible size on that device in case the desired + // one is more than the allowed. + if( workGroupSize > maxDeviceWorkGroupSize ) + { + cout << "Maximum work-group size for device " + << device.get_info< sycl::info::device::name >() << ": " + << maxDeviceWorkGroupSize << std::endl; + + return maxDeviceWorkGroupSize; + } + + return workGroupSize; + } + else + { + const size_t maxKernelWorkGroupSize = kernel.get_info< + sycl::info::kernel_device_specific::work_group_size >( device ); + + // Verify if the kernel can be executed with our desired work-group size, + // and if it can't use the maximum allowed kernel work-group size for the + // selected device. + if( workGroupSize > maxKernelWorkGroupSize ) + { + cout << "Maximum work-group size for " + << typeid( CMonteCarloPiKernel ).name() << " on device " + << device.get_info() << ": " + << maxKernelWorkGroupSize << "\n"; + + return maxKernelWorkGroupSize; + } + + // Otherwise, the work-size will stay the originally desired one + return workGroupSize; + } +} + diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/device_selector.hpp b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/device_selector.hpp new file mode 100644 index 0000000000..a515c547be --- /dev/null +++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/device_selector.hpp @@ -0,0 +1,421 @@ +//============================================================== +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#pragma once + +// SYCL or oneAPI toolkit headers: +#include + +// Third party headers: +#include + +using namespace std; +using namespace sycl; + +//++ +//============================================================================ +// Details: Common code utility. User defined enumerate of the SYCL device +// target typesdesired. +// An 'eHost' is admitted because SYCL 2020 depricates host device +// selection. +//-- +enum EDevsWeWant +{ + eNotValid = 0, // Default + eCPU = 1, + eGPU = 2, + eAccelerator = 3, // i.e. a FPGA type device + eCount = 4 // Always the last one +}; + +//++ +//============================================================================ +// Details: Common code utility. User defined target device proxy. +// After the utility has discoverd available devices on the system, +// this structure holds/caches information about the device. +// Forms a proxy device object representing an actual possible target +// device found on the system. +//-- +struct SDeviceFoundProxy final +{ + EDevsWeWant eDevice = eNotValid; // The type of real device we want to + // use to run kernels on. + string strDeviceName = ""; // THe proxy label (ID) for a real + // device. + bool bAvailable = false; // True = can be used, + // False = not found on the system. + bool bActiveTarget = false; // True = use it, false = stop using. + int nScore = 0; // User defined score of the device. + sycl::device theDevice; // Copy of the real device found. +}; + +//++ +//============================================================================ +// Details: Common code utility. Rudimentry error reporting system. Used by +// utility class to explicity aid the user or the programmer of any +// issues that have occurred. +//-- +struct FnResult final +{ + bool bSuccess = true; + string strErrMsg = ""; +}; + +//++ +//============================================================================ +// Details: Common code utility. A basic utility class to wrap up functions +// that can discover, then target the acceleration devices found on +// a system. +// +// Docs: https://www.intel.com/content/www/us/en/developer/articles/ +// technical/device-discovery-with-sycl.html#gs.nhyd7s +// https://registry.khronos.org/SYCL/specs/sycl-2020/html/ +// sycl-2020.html#sec:device-selection +// +// It can find all the available device targets on a system. +// But it will only store a list of the first device found of the following +// type and criteria: cpu, gpu and accelerator. +// Each device in the list can be set to be an active target. +// The last or latest call to function SetDevToActive() will change the +// device proxy object returned by GetDevUsersFirstChoice() to be that device. +// -- +class CUtilDeviceTargets final +{ + // Definitions: + public: + typedef std::vector< SDeviceFoundProxy > ListDevicesFound_t; + + // Static method: + public: + static FnResult DiscoverPlatformsDevicesAvailable( string &vrstrPlatformAndDevices ); + static const string& GetInputOptionDiscoverDevice(); + static FnResult GetQueuesCurrentDevice( const queue &vrQ, string &vrstr ); + + // Methods: + public: + CUtilDeviceTargets(); + ~CUtilDeviceTargets(); + + FnResult DiscoverDevsWeWant(); + const ListDevicesFound_t &GetListDevs() const; + const SDeviceFoundProxy *GetDevUsersFirstChoice() const; + FnResult SetDevToActive( const string &rvDeviceName, const bool vbActive ); + + // Attributes: + private: + ListDevicesFound_t m_listDeviceTargets; + static string m_strDiscoverDeviceInputOption; + SDeviceFoundProxy *m_pDeviceUserFirstChoice; // NULL = a choice has not been made +}; + +// Instantiations: +string CUtilDeviceTargets::m_strDiscoverDeviceInputOption = "discover_devices"; + +//++ +// Details: CUtilDeviceTargets constructor. +// Type: Method. +// Args: None. +// Return: None. +// Throws: None. +//-- +CUtilDeviceTargets::CUtilDeviceTargets() +: m_pDeviceUserFirstChoice( nullptr ) +{} + +//++ +// Details: CUtilDeviceTargets destructor. +// Type: Method. +// Args: None. +// Return: None. +// Throws: None. +//-- +CUtilDeviceTargets::~CUtilDeviceTargets() +{ + // Release + m_pDeviceUserFirstChoice = nullptr; +} + +//++ +// Details: Return a report on the specified SYCL queue stating its current +// real target device and the device platform. +// +// Type: Method. +// Args: vrQ - (R) The queue to query. +// vrstrReport - (W) The report text. +// Return: FnResult - Status of the function's operational success. +// Throws: SYCL implemenation may throw. +//-- +FnResult CUtilDeviceTargets::GetQueuesCurrentDevice( const queue &vrQ, string &vrstrReport ) +{ + FnResult status; + + vrstrReport = "[SYCL] Using device: ["; + vrstrReport += vrQ.get_device().get_info< info::device::name >(); + vrstrReport += "] from ["; + vrstrReport += vrQ.get_device().get_platform().get_info< info::platform::name >(); + vrstrReport += "]"; + + return status; +} + +//++ +// Details: Returns the program's text label for the user input option +// to choose to discover all the available device on a system. +// Type: Method. +// Args: None. +// Return: string& - Text label. +// Throws: None. +//-- +const string& CUtilDeviceTargets::GetInputOptionDiscoverDevice() +{ + return m_strDiscoverDeviceInputOption; +} + +//++ +// Details: Returns the pointer to the proxy object in the list of +// discovery device proxies. +// Type: Method. +// Args: None. +// Return: SDeviceFoundProxy* - pointer to object. +// Throws: None. +//-- +const SDeviceFoundProxy * CUtilDeviceTargets::GetDevUsersFirstChoice() const +{ + return m_pDeviceUserFirstChoice; +} + +//++ +// Details: A pointer to the current user's choice of target is made on this +// function being successful. +// +// If this function fails, the pointer retains the last or +// remains NULL (a choice was never made). +// +// Type: Method. +// Args: string &rvDeviceName - (R) Proxy's label or ID text. +// bool vbActive - (R) True = Use the device, False = disable use. +// Return: FnResult - Status of the function's operational success. +// Throws: None. +//-- +FnResult CUtilDeviceTargets::SetDevToActive( const string &rvDeviceName, const bool vbActive ) +{ + FnResult status; + + bool bFoundDevice = false; + for( SDeviceFoundProxy &rDev : m_listDeviceTargets ) + { + if( rDev.strDeviceName == rvDeviceName ) + { + bFoundDevice = true; + rDev.bActiveTarget = vbActive; + m_pDeviceUserFirstChoice = &rDev; + break; + } + } + if( !bFoundDevice ) + { + status.bSuccess = false; + status.strErrMsg = "Device '" + rvDeviceName; + status.strErrMsg += "' not found in list of available device targets"; + } + + return status; +} + +//++ +// Details: Discovers all the SYCL target devices available and assigns them +// to a target device proxy object. A device proxy object holds +// the criteria for a real device. All proxies created are +// disabled until a real device is found to match it. A programmer +// has to still set the proxy device as active to target that +// device it represent. +// +// Call this function before at the earliest opportunity and +// before other functions in this class as it makes a list +// of the target devices we are aiming to use. +// +// A limitation of this function it will only assign the first real device that +// matches the proxy criteria. Any subsequent same or similar devices are +// ignored. +// +// Type: Method. +// Args: None. +// Return: FnResult - Status of the function's operational success. +// Throws: SYCL implemenation may throw. +//-- +FnResult CUtilDeviceTargets::DiscoverDevsWeWant() +{ + FnResult status; + + SDeviceFoundProxy accelerator{ eAccelerator, "accelerator", false }; + SDeviceFoundProxy cpu{ eCPU, "cpu", false }; + SDeviceFoundProxy gpu{ eGPU, "gpu", false }; + + for( const auto platform : platform::get_platforms() ) + { + for( const auto device : platform.get_devices() ) + { + // Get first available device of each type + if( !accelerator.bAvailable && device.is_accelerator() ) + { + accelerator.bAvailable = true; + accelerator.theDevice = device; + } + else if( !cpu.bAvailable && device.is_cpu() ) + { + cpu.bAvailable = true; + cpu.theDevice = device; + } + else if( !gpu.bAvailable && device.is_gpu() ) + { + gpu.bAvailable = true; + gpu.theDevice = device; + } + } + } + + m_listDeviceTargets.push_back( accelerator ); + m_listDeviceTargets.push_back( cpu ); + m_listDeviceTargets.push_back( gpu ); + + return status; +} + +//++ +// Details: Returns the list of proxy device objects the programmer has +// defined and wants found on the system. Some proxy objects +// may be set to not available (and inactive) if not matching +// devices has been found on the system. +// Type: Method. +// Args: None. +// Return: ListDevicesFound_t - List of proxy device objects. +// Throws: None. +//-- +const CUtilDeviceTargets::ListDevicesFound_t & CUtilDeviceTargets::GetListDevs() const +{ + return m_listDeviceTargets; +} + +//++ +// Details: Prints to std out all the SYCL device targets discovered on the +// wanted to be used. +// Type: Method. +// Args: string& vrstrPlatformAndDevices - (W) A report of found devices. +// Return: FnResult - Status of the function's operational success. +// Throws: SYCL implemenation may throw. +//-- +FnResult CUtilDeviceTargets::DiscoverPlatformsDevicesAvailable( string &vrstrPlatformAndDevices ) +{ + FnResult status; + + vrstrPlatformAndDevices = ""; + bool bFoundPlatforms = false; + bool bFoundDevices = false; + for( const auto platform : platform::get_platforms() ) + { + bFoundPlatforms = true; + vrstrPlatformAndDevices += "Platform: "; + vrstrPlatformAndDevices += platform.get_info< info::platform::name >(); + vrstrPlatformAndDevices += "\n"; + + for( const auto device : platform.get_devices() ) + { + bFoundDevices = true; + vrstrPlatformAndDevices += "\tDevice: "; + vrstrPlatformAndDevices += device.get_info< info::device::name >(); + vrstrPlatformAndDevices += "\n"; + } + } + if( !bFoundPlatforms && !bFoundDevices ) + { + vrstrPlatformAndDevices = "No SYCL targeted platforms or devices found."; + } + + return status; +} + +//++ +// Details: Checks the user's input is valid. If not a help message if formed +// and returned. If valid, the matching proxy device object +// discovered earlier is made active for use by the program. +// Type: Function. +// Args: vrDevList- (RW) Utililty object managing proxy device objects. +// argc - (R) Program's input arguments count. +// argv - (R) Program's list of input arguments. +// Return: FnResult - Status of the function's operational success. +// Throws: None. +//-- +FnResult UserCheckTheirInput( CUtilDeviceTargets &vrDevList, int argc, char* argv[] ) +{ + FnResult status; + + const CUtilDeviceTargets::ListDevicesFound_t &rDevs = vrDevList.GetListDevs(); + string strListDevsOptionsToUser; + for( const SDeviceFoundProxy d : rDevs ) + { + strListDevsOptionsToUser += d.strDeviceName + "|"; + } + strListDevsOptionsToUser += CUtilDeviceTargets::GetInputOptionDiscoverDevice(); + + if( argc < 2 ) + { + status.bSuccess = false; + status.strErrMsg = "Usage: " + string( argv[ 0 ] ) + " <"; + status.strErrMsg += strListDevsOptionsToUser; + status.strErrMsg += ">"; + return status; + } + + bool bTargetDevMatch = false; + const string strArg{ argv[ 1 ] }; + for( const SDeviceFoundProxy d : rDevs ) + { + if( strArg == d.strDeviceName ) + { + bTargetDevMatch = true; + status = vrDevList.SetDevToActive( strArg, true ); + break; + } + } + if( status.bSuccess && !bTargetDevMatch && + (strArg != CUtilDeviceTargets::GetInputOptionDiscoverDevice() ) ) + { + status.bSuccess = false; + status.strErrMsg = "The device type cannot be found. Please enter a device type name from the list: "; + status.strErrMsg += strListDevsOptionsToUser; + } + + return status; +} + +//++ +// Details: Checks the user's input is the option to 'discover device target' +// on the system. +// Type: Function. +// Args: argv - (R) Program's list of input arguments. +// bool rbDoDiscovery - (W) True = yes, the discovery option choosen. +// Return: FnResult - Status of the function's operational success. +// Throws: SYCL implemenation may throw. +//-- +FnResult UserWantsToDiscoverPossibleTargets( char* argv[], bool &rbDoDiscovery ) +{ + FnResult status; + + rbDoDiscovery = false; + const string strArg{ argv[ 1 ] }; + if( strArg == CUtilDeviceTargets::GetInputOptionDiscoverDevice() ) + { + string strPlatformAndDevicesReport; + status = CUtilDeviceTargets::DiscoverPlatformsDevicesAvailable( strPlatformAndDevicesReport ); + if( status.bSuccess ) + { + rbDoDiscovery = true; + cout << strPlatformAndDevicesReport << std::endl; + } + } + + return status; +} \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/c_cpp_properties.json b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/c_cpp_properties.json new file mode 100644 index 0000000000..2b8e56ba4b --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/c_cpp_properties.json @@ -0,0 +1,17 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**" + ], + "defines": [], + "compilerPath": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx", + "compilerArgs": [ "-fsycl" ], + "cStandard": "gnu17", + "cppStandard": "gnu++17", + "intelliSenseMode": "linux-gcc-x64" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/launch.json b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/launch.json new file mode 100644 index 0000000000..325c8901a1 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/launch.json @@ -0,0 +1,50 @@ +{ + "configurations": [ + { + "name": "C/C++: dpc++ build and debug scan", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceFolder}/bin/${config:programName}_d", + "args": [ + "${input:args}" + ], + "stopAtEntry": true, + "cwd": "${fileDirname}", + "environment": [], + "externalConsole": false, + "MIMode": "gdb", + "setupCommands": [ + { + "description": "Enable pretty-printing for gdb", + "text": "-enable-pretty-printing", + "ignoreFailures": true + }, + { + "description": "Set Disassembly Flavor to Intel", + "text": "-gdb-set disassembly-flavor intel", + "ignoreFailures": true + }, + { + "description": "Needed by Intel oneAPI: Disable target async", + "text": "set target-async off", + "ignoreFailures": true + } + ], + "preLaunchTask": "scan Debug C/C++: Intel icpx build active file", + "miDebuggerPath": "/opt/intel/oneapi/debugger/latest/gdb/intel64/bin/gdb-oneapi" + } + ], + "inputs" : [ + { + "id": "args", + "type": "pickString", + "description": "Program args", + "default": "cpu", + "options": [ + "cpu", + "gpu", + "accelerator" + ] + } + ] +} \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/settings.json b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/settings.json new file mode 100644 index 0000000000..150d2ba955 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/settings.json @@ -0,0 +1,14 @@ +{ + "programName": "scan", + "files.associations": { + "stdexcept": "cpp", + "array": "cpp", + "bitset": "cpp", + "string_view": "cpp", + "initializer_list": "cpp", + "regex": "cpp", + "utility": "cpp", + "algorithm": "cpp", + "iostream": "cpp" + } +} \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/tasks.json b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/tasks.json new file mode 100644 index 0000000000..0706d2eebb --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/tasks.json @@ -0,0 +1,49 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "type": "cppbuild", + "label": "scan Debug C/C++: Intel icpx build active file", + "command": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx", + "args": [ + "-fsycl", + "-fdiagnostics-color=always", + "-fno-limit-debug-info", + "-fsycl-device-code-split=per_kernel", + "-g", + "-O0", + "${workspaceFolder}/src/${config:programName}.cpp", + "-o", + "${workspaceFolder}/bin/${config:programName}_d" + ], + "options": { + "cwd": "${workspaceFolder}" + }, + "problemMatcher": [ + "$gcc" + ], + "group": "build", + "detail": "compiler: /opt/intel/oneapi/compiler/latest/linux/bin/dpcpp" + }, + { + "type": "cppbuild", + "label": "scan Release C/C++: Intel icpx build active file", + "command": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx", + "args": [ + "-fsycl", + "-DNDEBUG", + "${workspaceFolder}/src/${config:programName}.cpp", + "-o", + "${workspaceFolder}/bin/${config:programName}" + ], + "options": { + "cwd": "${workspaceFolder}" + }, + "problemMatcher": [ + "$gcc" + ], + "group": "build", + "detail": "compiler: /opt/intel/oneapi/compiler/latest/linux/bin/icpx" + } + ] +} \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/CMakeLists.txt new file mode 100644 index 0000000000..24aa89e2f2 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/CMakeLists.txt @@ -0,0 +1,16 @@ +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp-cl") + set(CMAKE_C_COMPILER "dpcpp-cl") +else() + set(CMAKE_CXX_COMPILER "icpx") +endif() +set(CMAKE_CXX_STANDARD 17) +if(NOT DEFINED ${CMAKE_BUILD_TYPE}) + set(CMAKE_BUILD_TYPE "RELEASE") +endif() +if( CMAKE_BUILD_TYPE STREQUAL "DEBUG" ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") +endif() +cmake_minimum_required (VERSION 3.4) +project (scan) +add_subdirectory (src) diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/README.md b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/README.md new file mode 100644 index 0000000000..3f1e3f3720 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/README.md @@ -0,0 +1,86 @@ +# Parallel Prefix Sum (Scan) program + +## Purpose +A simple and common parallel algorithm building block is the all-prefix-sums operation. Blelloch (1990) describes all-prefix-sums as a good example of a computation that seems inherently sequential, but for which there is an efficient parallel algorithm. + +## Prerequisites + +| Minimum Requirements | Description +|:--- |:--- +| OS | Linux* Ubuntu* 20.04.5 LTS +| Hardware | Intel® 11th Gen Intel Core i7-1185G7 + Mesa Intel Xe Graphics +| Compiler Toolchain | Visual Studio Code IDE, Intel oneAPI Base Toolkit (inc its prerequisite) +| Libraries | Install Intel oneAPI Base Toolkit +| Tools | Visual Studio Code 1.73.1, VSCode Microsoft C/C++ extns + +## Build and Run using Visual Code Studio + +### Linux* + +Within a terminal window change directory to this project's folder. At the +terminal prompt type: + +``` +cd ParallelPrefixSumScan +code . +``` + +Visual Studio Code will open this project displaying its files in the Explorer +pane. +The project is already set up with build configurations to build either a +debug build or a release build of the program. When a program is built, it is +placed in the bin directory of this project's top folder. + +To build the program hit Ctrl+Shift+b and choose the type of program to build. +The debug executable will have a '_d' appended to its name. + +To execute the program, type in the Visual Studio Code terminal window: +``` +cd bin +./scan_d cpu +``` + +## Build and Run using CMake +### Linux* +``` +mkdir build +cd build +cmake .. +make +``` + +To execute the program, type in the terminal window: + +``` +cd build/src +./scan cpu +``` + + +## Debug the program using Visual Studio Code + +### Linux* + +To debug the program, either choose from the IDE's run menu +'Start debugging' or hit F5 on the keyboard. +The debug launch.json configuration file defines the debug session to: +* Provide a list of accelerator type to choose from. Picking a device will + be used as the first argument to the program. +* To halt the program at the first line of code after main(). +Use the GUI debug panel's buttons to step over code (key F10) lines to see the +program advance. +Breakpoints can be set either in the main code or the kernel code. + +Note: Setting breakpoints in the kernel code does not present the normal + step through code behavior. Instead a breakpoint event is occurring + on each thread being executed and so switches to the context of + that thread. To step through the code of a single thread, use the + Intel gdb-oneapi command 'set scheduler-locking step' or 'on' in the + IDE's debug console prompt. As this is not the main thread, be sure + to revert this setting on returning to debug any host side code. + Use the command 'set scheduler-locking replay' or 'off'. + +## License + +Code samples are licensed under the Apache 2.0 license. See +[LICENSE.txt](LICENSE.txt) for details. diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/CMakeLists.txt new file mode 100644 index 0000000000..7dbc6eff3d --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/CMakeLists.txt @@ -0,0 +1,6 @@ +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fsycl") +set(CMAKE_BUILD_TYPE "RelWithDebInfo") +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") +add_executable (scan scan.cpp) +target_link_libraries(scan OpenCL sycl) + diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/device_selector.hpp b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/device_selector.hpp new file mode 100644 index 0000000000..a515c547be --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/device_selector.hpp @@ -0,0 +1,421 @@ +//============================================================== +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#pragma once + +// SYCL or oneAPI toolkit headers: +#include + +// Third party headers: +#include + +using namespace std; +using namespace sycl; + +//++ +//============================================================================ +// Details: Common code utility. User defined enumerate of the SYCL device +// target typesdesired. +// An 'eHost' is admitted because SYCL 2020 depricates host device +// selection. +//-- +enum EDevsWeWant +{ + eNotValid = 0, // Default + eCPU = 1, + eGPU = 2, + eAccelerator = 3, // i.e. a FPGA type device + eCount = 4 // Always the last one +}; + +//++ +//============================================================================ +// Details: Common code utility. User defined target device proxy. +// After the utility has discoverd available devices on the system, +// this structure holds/caches information about the device. +// Forms a proxy device object representing an actual possible target +// device found on the system. +//-- +struct SDeviceFoundProxy final +{ + EDevsWeWant eDevice = eNotValid; // The type of real device we want to + // use to run kernels on. + string strDeviceName = ""; // THe proxy label (ID) for a real + // device. + bool bAvailable = false; // True = can be used, + // False = not found on the system. + bool bActiveTarget = false; // True = use it, false = stop using. + int nScore = 0; // User defined score of the device. + sycl::device theDevice; // Copy of the real device found. +}; + +//++ +//============================================================================ +// Details: Common code utility. Rudimentry error reporting system. Used by +// utility class to explicity aid the user or the programmer of any +// issues that have occurred. +//-- +struct FnResult final +{ + bool bSuccess = true; + string strErrMsg = ""; +}; + +//++ +//============================================================================ +// Details: Common code utility. A basic utility class to wrap up functions +// that can discover, then target the acceleration devices found on +// a system. +// +// Docs: https://www.intel.com/content/www/us/en/developer/articles/ +// technical/device-discovery-with-sycl.html#gs.nhyd7s +// https://registry.khronos.org/SYCL/specs/sycl-2020/html/ +// sycl-2020.html#sec:device-selection +// +// It can find all the available device targets on a system. +// But it will only store a list of the first device found of the following +// type and criteria: cpu, gpu and accelerator. +// Each device in the list can be set to be an active target. +// The last or latest call to function SetDevToActive() will change the +// device proxy object returned by GetDevUsersFirstChoice() to be that device. +// -- +class CUtilDeviceTargets final +{ + // Definitions: + public: + typedef std::vector< SDeviceFoundProxy > ListDevicesFound_t; + + // Static method: + public: + static FnResult DiscoverPlatformsDevicesAvailable( string &vrstrPlatformAndDevices ); + static const string& GetInputOptionDiscoverDevice(); + static FnResult GetQueuesCurrentDevice( const queue &vrQ, string &vrstr ); + + // Methods: + public: + CUtilDeviceTargets(); + ~CUtilDeviceTargets(); + + FnResult DiscoverDevsWeWant(); + const ListDevicesFound_t &GetListDevs() const; + const SDeviceFoundProxy *GetDevUsersFirstChoice() const; + FnResult SetDevToActive( const string &rvDeviceName, const bool vbActive ); + + // Attributes: + private: + ListDevicesFound_t m_listDeviceTargets; + static string m_strDiscoverDeviceInputOption; + SDeviceFoundProxy *m_pDeviceUserFirstChoice; // NULL = a choice has not been made +}; + +// Instantiations: +string CUtilDeviceTargets::m_strDiscoverDeviceInputOption = "discover_devices"; + +//++ +// Details: CUtilDeviceTargets constructor. +// Type: Method. +// Args: None. +// Return: None. +// Throws: None. +//-- +CUtilDeviceTargets::CUtilDeviceTargets() +: m_pDeviceUserFirstChoice( nullptr ) +{} + +//++ +// Details: CUtilDeviceTargets destructor. +// Type: Method. +// Args: None. +// Return: None. +// Throws: None. +//-- +CUtilDeviceTargets::~CUtilDeviceTargets() +{ + // Release + m_pDeviceUserFirstChoice = nullptr; +} + +//++ +// Details: Return a report on the specified SYCL queue stating its current +// real target device and the device platform. +// +// Type: Method. +// Args: vrQ - (R) The queue to query. +// vrstrReport - (W) The report text. +// Return: FnResult - Status of the function's operational success. +// Throws: SYCL implemenation may throw. +//-- +FnResult CUtilDeviceTargets::GetQueuesCurrentDevice( const queue &vrQ, string &vrstrReport ) +{ + FnResult status; + + vrstrReport = "[SYCL] Using device: ["; + vrstrReport += vrQ.get_device().get_info< info::device::name >(); + vrstrReport += "] from ["; + vrstrReport += vrQ.get_device().get_platform().get_info< info::platform::name >(); + vrstrReport += "]"; + + return status; +} + +//++ +// Details: Returns the program's text label for the user input option +// to choose to discover all the available device on a system. +// Type: Method. +// Args: None. +// Return: string& - Text label. +// Throws: None. +//-- +const string& CUtilDeviceTargets::GetInputOptionDiscoverDevice() +{ + return m_strDiscoverDeviceInputOption; +} + +//++ +// Details: Returns the pointer to the proxy object in the list of +// discovery device proxies. +// Type: Method. +// Args: None. +// Return: SDeviceFoundProxy* - pointer to object. +// Throws: None. +//-- +const SDeviceFoundProxy * CUtilDeviceTargets::GetDevUsersFirstChoice() const +{ + return m_pDeviceUserFirstChoice; +} + +//++ +// Details: A pointer to the current user's choice of target is made on this +// function being successful. +// +// If this function fails, the pointer retains the last or +// remains NULL (a choice was never made). +// +// Type: Method. +// Args: string &rvDeviceName - (R) Proxy's label or ID text. +// bool vbActive - (R) True = Use the device, False = disable use. +// Return: FnResult - Status of the function's operational success. +// Throws: None. +//-- +FnResult CUtilDeviceTargets::SetDevToActive( const string &rvDeviceName, const bool vbActive ) +{ + FnResult status; + + bool bFoundDevice = false; + for( SDeviceFoundProxy &rDev : m_listDeviceTargets ) + { + if( rDev.strDeviceName == rvDeviceName ) + { + bFoundDevice = true; + rDev.bActiveTarget = vbActive; + m_pDeviceUserFirstChoice = &rDev; + break; + } + } + if( !bFoundDevice ) + { + status.bSuccess = false; + status.strErrMsg = "Device '" + rvDeviceName; + status.strErrMsg += "' not found in list of available device targets"; + } + + return status; +} + +//++ +// Details: Discovers all the SYCL target devices available and assigns them +// to a target device proxy object. A device proxy object holds +// the criteria for a real device. All proxies created are +// disabled until a real device is found to match it. A programmer +// has to still set the proxy device as active to target that +// device it represent. +// +// Call this function before at the earliest opportunity and +// before other functions in this class as it makes a list +// of the target devices we are aiming to use. +// +// A limitation of this function it will only assign the first real device that +// matches the proxy criteria. Any subsequent same or similar devices are +// ignored. +// +// Type: Method. +// Args: None. +// Return: FnResult - Status of the function's operational success. +// Throws: SYCL implemenation may throw. +//-- +FnResult CUtilDeviceTargets::DiscoverDevsWeWant() +{ + FnResult status; + + SDeviceFoundProxy accelerator{ eAccelerator, "accelerator", false }; + SDeviceFoundProxy cpu{ eCPU, "cpu", false }; + SDeviceFoundProxy gpu{ eGPU, "gpu", false }; + + for( const auto platform : platform::get_platforms() ) + { + for( const auto device : platform.get_devices() ) + { + // Get first available device of each type + if( !accelerator.bAvailable && device.is_accelerator() ) + { + accelerator.bAvailable = true; + accelerator.theDevice = device; + } + else if( !cpu.bAvailable && device.is_cpu() ) + { + cpu.bAvailable = true; + cpu.theDevice = device; + } + else if( !gpu.bAvailable && device.is_gpu() ) + { + gpu.bAvailable = true; + gpu.theDevice = device; + } + } + } + + m_listDeviceTargets.push_back( accelerator ); + m_listDeviceTargets.push_back( cpu ); + m_listDeviceTargets.push_back( gpu ); + + return status; +} + +//++ +// Details: Returns the list of proxy device objects the programmer has +// defined and wants found on the system. Some proxy objects +// may be set to not available (and inactive) if not matching +// devices has been found on the system. +// Type: Method. +// Args: None. +// Return: ListDevicesFound_t - List of proxy device objects. +// Throws: None. +//-- +const CUtilDeviceTargets::ListDevicesFound_t & CUtilDeviceTargets::GetListDevs() const +{ + return m_listDeviceTargets; +} + +//++ +// Details: Prints to std out all the SYCL device targets discovered on the +// wanted to be used. +// Type: Method. +// Args: string& vrstrPlatformAndDevices - (W) A report of found devices. +// Return: FnResult - Status of the function's operational success. +// Throws: SYCL implemenation may throw. +//-- +FnResult CUtilDeviceTargets::DiscoverPlatformsDevicesAvailable( string &vrstrPlatformAndDevices ) +{ + FnResult status; + + vrstrPlatformAndDevices = ""; + bool bFoundPlatforms = false; + bool bFoundDevices = false; + for( const auto platform : platform::get_platforms() ) + { + bFoundPlatforms = true; + vrstrPlatformAndDevices += "Platform: "; + vrstrPlatformAndDevices += platform.get_info< info::platform::name >(); + vrstrPlatformAndDevices += "\n"; + + for( const auto device : platform.get_devices() ) + { + bFoundDevices = true; + vrstrPlatformAndDevices += "\tDevice: "; + vrstrPlatformAndDevices += device.get_info< info::device::name >(); + vrstrPlatformAndDevices += "\n"; + } + } + if( !bFoundPlatforms && !bFoundDevices ) + { + vrstrPlatformAndDevices = "No SYCL targeted platforms or devices found."; + } + + return status; +} + +//++ +// Details: Checks the user's input is valid. If not a help message if formed +// and returned. If valid, the matching proxy device object +// discovered earlier is made active for use by the program. +// Type: Function. +// Args: vrDevList- (RW) Utililty object managing proxy device objects. +// argc - (R) Program's input arguments count. +// argv - (R) Program's list of input arguments. +// Return: FnResult - Status of the function's operational success. +// Throws: None. +//-- +FnResult UserCheckTheirInput( CUtilDeviceTargets &vrDevList, int argc, char* argv[] ) +{ + FnResult status; + + const CUtilDeviceTargets::ListDevicesFound_t &rDevs = vrDevList.GetListDevs(); + string strListDevsOptionsToUser; + for( const SDeviceFoundProxy d : rDevs ) + { + strListDevsOptionsToUser += d.strDeviceName + "|"; + } + strListDevsOptionsToUser += CUtilDeviceTargets::GetInputOptionDiscoverDevice(); + + if( argc < 2 ) + { + status.bSuccess = false; + status.strErrMsg = "Usage: " + string( argv[ 0 ] ) + " <"; + status.strErrMsg += strListDevsOptionsToUser; + status.strErrMsg += ">"; + return status; + } + + bool bTargetDevMatch = false; + const string strArg{ argv[ 1 ] }; + for( const SDeviceFoundProxy d : rDevs ) + { + if( strArg == d.strDeviceName ) + { + bTargetDevMatch = true; + status = vrDevList.SetDevToActive( strArg, true ); + break; + } + } + if( status.bSuccess && !bTargetDevMatch && + (strArg != CUtilDeviceTargets::GetInputOptionDiscoverDevice() ) ) + { + status.bSuccess = false; + status.strErrMsg = "The device type cannot be found. Please enter a device type name from the list: "; + status.strErrMsg += strListDevsOptionsToUser; + } + + return status; +} + +//++ +// Details: Checks the user's input is the option to 'discover device target' +// on the system. +// Type: Function. +// Args: argv - (R) Program's list of input arguments. +// bool rbDoDiscovery - (W) True = yes, the discovery option choosen. +// Return: FnResult - Status of the function's operational success. +// Throws: SYCL implemenation may throw. +//-- +FnResult UserWantsToDiscoverPossibleTargets( char* argv[], bool &rbDoDiscovery ) +{ + FnResult status; + + rbDoDiscovery = false; + const string strArg{ argv[ 1 ] }; + if( strArg == CUtilDeviceTargets::GetInputOptionDiscoverDevice() ) + { + string strPlatformAndDevicesReport; + status = CUtilDeviceTargets::DiscoverPlatformsDevicesAvailable( strPlatformAndDevicesReport ); + if( status.bSuccess ) + { + rbDoDiscovery = true; + cout << strPlatformAndDevicesReport << std::endl; + } + } + + return status; +} \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/scan.cpp b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/scan.cpp new file mode 100644 index 0000000000..d179444ee4 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/scan.cpp @@ -0,0 +1,520 @@ +//============================================================================ +// Copyright © 2022 Intel Corporation +// +// SPDX-License-Identifier: MIT +// =========================================================================== + +//**************************************************************************** +// +// Description: +// Example of a parallel inclusive scan in SYCL. Based on the two-phase +// exclusive scan algorithm paper by Guy E. Blelloch titled "Prefix Sums and +// Their Applications", 1990. +// +// Usage: +// The program takes one argument: host / cpu / gpu / accelerator. +// +//***************************************************************************** + +// SYCL / Intel oneAPI files: +#include +#include "dpc_common.hpp" + +// Third party files: +#include +#include +#include +#include + +// This project's files: +#include "device_selector.hpp" + +using namespace sycl; +using namespace std; + +// Asynchronous errors hander, catch faults in asynchronously executed code +// inside a command group or a kernel. They can occur in a different stackframe, +// asynchronous error cannot be propagated up the stack. +// By default, they are considered 'lost'. The way in which we can retrieve them +// is by providing an error handler function. +auto exception_handler = []( sycl::exception_list exceptions ) +{ + for( std::exception_ptr const &e : exceptions ) + { + try + { + std::rethrow_exception( e ); + } + catch( sycl::exception const &e ) + { + std::cout << "Queue handler caught asynchronous SYCL exception:\n" << e.what() << std::endl; + } + } +}; + +// Forward decleration of functions +template< typename T, typename OP > +void ParallelScan( sycl::buffer< T, 1 > &bufIn, sycl::queue &q ); +int TestSum( sycl::queue &q ); +int TestFactorial( sycl::queue &q ); + +int main( int argc, char *argv[] ) +{ + CUtilDeviceTargets utilsDev; + FnResult fnResult = utilsDev.DiscoverDevsWeWant(); + if( !fnResult.bSuccess ) + { + cerr << "Program failure: Unable to discover target devices on this platform.\n"; + exit( -1 ); + } + + fnResult = UserCheckTheirInput( utilsDev, argc, argv ); + if( !fnResult.bSuccess ) + { + cerr << fnResult.strErrMsg << "\n"; + exit( 1 ); + } + + bool bDoDevDiscovery = false; + fnResult = UserWantsToDiscoverPossibleTargets( argv, bDoDevDiscovery ); + if( !fnResult.bSuccess ) + { + cerr << fnResult.strErrMsg << "\n"; + exit( -1 ); + } + if( bDoDevDiscovery ) exit( 1 ); + + const SDeviceFoundProxy *pUsersChosenDevice = utilsDev.GetDevUsersFirstChoice(); + if( pUsersChosenDevice == nullptr ) + { + cerr << "Program failure: Did not create a valid target device object.\n"; + exit( -1 ); + } + + int retResultSum = 0; + int retResultFactorial = 0; + + try + { + queue myQueue( pUsersChosenDevice->theDevice, exception_handler ); + + string strTheDeviceBeingUsed; + fnResult = CUtilDeviceTargets::GetQueuesCurrentDevice( myQueue, strTheDeviceBeingUsed ); + if( !fnResult.bSuccess ) + { + cerr << fnResult.strErrMsg << "\n"; + exit( -1 ); + } + cout << strTheDeviceBeingUsed << "\n"; + + retResultSum = TestSum( myQueue ); + retResultFactorial = (retResultSum == 0) && TestFactorial( myQueue ); + } + catch( sycl::exception const &e ) + { + cout << "Fail; SYCL synchronous exception occurred: " << e.what() << "\n"; + return -1; + } + catch( std::exception const &e ) + { + cout << "Fail; Runtime synchronous exception occurred: " << e.what() << "\n"; + return -1; + } + + if( (retResultSum != 0) || (retResultFactorial != 0) ) + { + return 1; + } + + cout << "Results are correct." << std::endl; + + return 0; +} + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// + +// The identity element for a given operation. +template< typename T, typename OP > +struct SIdentity {}; + +template< typename T > +struct SIdentity< T, std::plus< T > > +{ + static constexpr T value = 0; +}; + +template< typename T > +struct SIdentity< T, std::multiplies< T > > +{ + static constexpr T value = 1; +}; + +template< typename T > +struct SIdentity< T, std::logical_or< T > > +{ + static constexpr T value = false; +}; + +template< typename T > +struct SIdentity< T, std::logical_and< T > > +{ + static constexpr T value = true; +}; + +// Dummy struct to generate unique kernel name types +template< typename T, typename U, typename V > +struct SKernelNameType {}; + + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +// Performs an inclusive scan with the given associative binary operation `OP` +// on the data in the `bufIn` buffer. Runs in parallel on the provided +// accelerated hardware queue. Modifies the input buffer to contain the +// results of the scan. +// Input size has to be a power of two. If the size isn't so, the input can +// easily be padded to the nearest power of two with any values, and the scan +// on the meaningful part of the data will stay the same. +template< typename T, typename OP > +void ParallelScan( sycl::buffer< T, 1 > &bufIn, sycl::queue &q ) +{ + // Retrieve the device associated with the given queue. + const sycl::device dev = q.get_device(); + const bool bHwIsCpu = dev.is_cpu(); + + // Check if local memory is available. On host no local memory is fine, since + if( !bHwIsCpu && + (dev.get_info< sycl::info::device::local_mem_type >() == + sycl::info::local_mem_type::none) ) + { + throw std::runtime_error( "Non host device does not have local memory." ); + } + + const size_t bufSize = bufIn.size(); + if( ((bufSize & (bufSize - 1)) != 0) || (bufSize == 0) ) + { + throw std::runtime_error( + "Given input buffer size is not a power of two." ); + } + + // Check if there is enough global memory. + const size_t globalMemSize = + dev.get_info< sycl::info::device::global_mem_size >(); + if( !bHwIsCpu && (bufSize > (globalMemSize * 0.5) ) ) + { + throw std::runtime_error( + "Non host device input size exceeds device global memory size." ); + } + + // Obtain device limits. + const size_t maxWgroupSize = + dev.get_info< sycl::info::device::max_work_group_size >(); + const size_t localMemSize = + dev.get_info< sycl::info::device::local_mem_size >(); + + // Find a work-group size that is guaranteed to fit in local memory and is + // below the maximum work-group size of the device. + const size_t wgroupSizeLim = + sycl::min( maxWgroupSize, localMemSize / (2 * sizeof( T )) ); + + // Every work-item processes two elements, so the work-group size has to + // divide this number evenly. + const size_t halfInBufSize = bufSize * 0.5; + + // Find the largest power of two that divides half_in_size and is within the + // device limit. + size_t wgroupSize = 0; + size_t pow = size_t( 1 ) << (sizeof( size_t ) * 8 - 1); + for( ; pow > 0; pow >>= 1 ) + { + if( (halfInBufSize / pow) * pow == + halfInBufSize && (pow <= wgroupSizeLim) ) + { + wgroupSize = pow; + break; + } + } + if( wgroupSize == 0 ) + { + throw std::runtime_error( + "Could not find an appropriate work-group size for the given input." ); + } + const size_t dblWgrpSize = wgroupSize * 2; + + q.submit( [&]( sycl::handler &cgh ) + { + const auto ptrData = + bufIn.template get_access< sycl::access::mode::read_write >( cgh ); + + // Using scratch/local memory (to a work group) for faster memory + // access to compute the results + sycl::accessor< T, 1, sycl::access::mode::read_write, + sycl::access::target::local > + scratch( wgroupSize * 2, cgh); + + // Use dummy struct as the unique kernel name. + cgh.parallel_for< SKernelNameType< T, OP, class CScanSegments > >( + sycl::nd_range< 1 >( halfInBufSize, wgroupSize ), + [=]( sycl::nd_item< 1 > item ) + { + const size_t gid = item.get_global_linear_id(); + const size_t lid = item.get_local_linear_id(); + + // Read data into local memory. + scratch[ 2 * lid ] = ptrData[ 2 * gid ]; + scratch[ 2 * lid + 1 ] = ptrData[ 2 * gid + 1 ]; + + // Preserve the second input element to add at the end. + const auto secondInput = scratch[ 2 * lid + 1 ]; + + // Perform partial reduction (up-sweep) on the data. The `off` + // variable is 2 to the power of the current depth of the + // reduction tree. In the paper, this corresponds to 2^d. + for( size_t off = 1; off < (wgroupSize * 2); off *= 2 ) + { + // Synchronize local memory to observe the previous writes. + item.barrier( sycl::access::fence_space::local_space ); + + const size_t i = lid * off * 2; + if( i < dblWgrpSize ) + { + const size_t index = i + (off * 2) - 1; + scratch[ index ] = + OP{}( scratch[ index ], scratch[ i + off - 1 ] ); + } + } + + // Clear the last element to the identity before down-sweeping. + if( lid == 0 ) + { + scratch[ dblWgrpSize - 1 ] = SIdentity< T, OP >::value; + } + + // Perform down-sweep on the tree to compute the whole scan. + // Again, `off` is 2^d. + for( size_t off = wgroupSize; off > 0; off >>= 1 ) + { + item.barrier( sycl::access::fence_space::local_space ); + + const size_t i = lid * off * 2; + if( i < dblWgrpSize ) + { + const size_t indexT = i + off - 1; + const size_t indexU = i + (off * 2) - 1; + const auto t = scratch[ indexT ]; + const auto u = scratch[ indexU ]; + scratch[ indexT ] = u; + scratch[ indexU ] = OP{}( t, u ); + } + } + + // Synchronize again to observe results. + item.barrier( sycl::access::fence_space::local_space ); + + // To return an inclusive rather than exclusive scan result, shift + // each element left by 1 when writing back into global memory. If + // we are the last work-item, also add on the final element. + const size_t indexL1 = 2 * lid + 1; + const size_t indexL2 = 2 * lid + 2; + const size_t indexG1 = 2 * gid; + const size_t indexG2 = 2 * gid + 1; + ptrData[ indexG1 ] = scratch[ indexL1 ]; + if( lid == wgroupSize - 1 ) + { + ptrData[ indexG2 ] = OP{}( scratch[ indexL1 ], secondInput ); + } + else + { + ptrData[ indexG2 ] = scratch[ indexL2 ]; + } + } // [=]( sycl::nd_item< 1 > item ) + ); // cgh.parallel_for< SKernelNameType< T, OP, class CScanSegments > >( + }); // q.submit( [&]( sycl::handler &cgh ) + + // At this point we have computed the inclusive scans of this many segments. + const size_t nSegments = halfInBufSize / wgroupSize; + + if( nSegments == 1 ) + { + // If all of the data is in one segment, we're done. + return; + } + // Otherwise we have to propagate the scan results forward into later + // segments. + + // Allocate space for one (last) element per segment. + sycl::buffer< T, 1 > bufEndSegment{ sycl::range< 1 >( nSegments ) }; + + // Store the elements in this space. + q.submit( [&](sycl::handler &cgh ) + { + const auto ptrScans = bufIn.template get_access< + sycl::access::mode::read >( cgh ); + const auto ptrElems = bufEndSegment.template get_access< + sycl::access::mode::discard_write >( cgh ); + + cgh.parallel_for< SKernelNameType< T, OP, class CCopyEndSeg > >( + sycl::range< 1 >( nSegments ), + [=]( sycl::item< 1 > item ) + { + const size_t id = item.get_linear_id(); + // Offset into the last element of each segment. + ptrElems[ item ] = ptrScans[ (id + 1) * 2 * wgroupSize - 1 ]; + }); + }); + + // Recursively scan the array of last elements. + ParallelScan< T, OP >( bufEndSegment, q ); + + // Add the results of the scan to each segment. + q.submit( [&]( sycl::handler &cgh ) + { + const auto ptrEndSegScan = bufEndSegment.template get_access< + sycl::access::mode::read >( cgh ); + const auto ptrDataIn = bufIn.template get_access< + sycl::access::mode::read_write >( cgh ); + + cgh.parallel_for< SKernelNameType< T, OP, class CAddEndSeg > >( + // Work with one less work-group, since the first segment is correct. + sycl::nd_range< 1 >( halfInBufSize - wgroupSize, wgroupSize ), + [=](sycl::nd_item< 1 > item) + { + const size_t grpLinId = item.get_group_linear_id(); + + // Start with the second segment. + const size_t glbIdOff = item.get_global_linear_id() + wgroupSize; + + // Each work-group adds the corresponding number in the + // "last element scan" array to every element in the group's + // segment. + ptrDataIn[ glbIdOff * 2 ] = OP{}( ptrDataIn[ glbIdOff * 2 ], + ptrEndSegScan[ grpLinId ] ); + ptrDataIn[ glbIdOff * 2 + 1 ] = OP{}( ptrDataIn[ glbIdOff * 2 + 1 ], + ptrEndSegScan[ grpLinId ] ); + }); + }); +} + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +// Tests the scan with an addition operation, which is its most common use. +// Returns 0 if successful, a nonzero value otherwise. +int TestSum( sycl::queue &q ) +{ + constexpr size_t size = 64; + + // Initializes a vector of sequentially increasing values. + std::vector< int32_t > arrayIn( size ); + std::iota( arrayIn.begin(), arrayIn.end(), 1 ); + + // Compute the prefix sum using SYCL. + std::vector< int32_t > arraySum( arrayIn.size() ); + + { + // Read from `arrayIn`, but write into `arraySum`. + buffer< int32_t, 1 > bufArrayIn( sycl::range< 1 >( arrayIn.size() ) ); + bufArrayIn.set_final_data( arraySum.data() ); + + q.submit( [&](sycl::handler &cgh) + { + const auto acc = + bufArrayIn.get_access< sycl::access::mode::write >( cgh ); + cgh.copy( arrayIn.data(), acc ); + }); + + ParallelScan< int32_t, std::plus< int32_t > >( bufArrayIn, q ); + } + + // Compute the same operation using the standard library. + std::vector < int32_t > arrayTestSum( arrayIn.size() ); + std::partial_sum( arrayIn.begin(), arrayIn.end(), arrayTestSum.begin() ); + + cout << "\nSYCL compute's sum results:\n"; + for( auto a : arraySum ) + { + cout << a << " "; + } + cout << std::endl; + + // Check if the results are correct. + const bool bEqual = + std::equal( arraySum.begin(), arraySum.end(), arrayTestSum.begin() ); + if( !bEqual ) + { + cout << "SYCL sum computation incorrect!\n"; + cout << "std::partial_sum's results:\n"; + + for( auto a : arrayTestSum ) + { + cout << a << " "; + } + + return 1; + } + + return 0; +} + +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +// Tests the scan with a multiply operation, which is a sequence of factorials. +// Returns 0 if successful, a nonzero value otherwise. +int TestFactorial( sycl::queue &q ) +{ + // Anything above this size overflows the int64_t type + constexpr size_t size = 16; + + // Initializes a vector of sequentially increasing values. + std::vector< int64_t > arrayIn( size ); + std::iota( arrayIn.begin(), arrayIn.end(), 1 ); + + // Compute a sequence of factorials using SYCL. + std::vector< int64_t > arrayFact( arrayIn.size() ); + { + // Read from `arrayIn`, but write into `arrayFact`. + sycl::buffer< int64_t, 1 > bufArrayIn( sycl::range< 1 >( arrayIn.size() )); + bufArrayIn.set_final_data( arrayFact.data() ); + q.submit( [&](sycl::handler &cgh ) + { + const auto acc = bufArrayIn.get_access< sycl::access::mode::write >( cgh ); + cgh.copy( arrayIn.data(), acc ); + }); + + ParallelScan< int64_t, std::multiplies< int64_t > >( bufArrayIn, q ); + } + + // Compute the same operation using the standard library. + std::vector< int64_t > arrayTestFact( arrayIn.size() ); + std::partial_sum( arrayIn.begin(), arrayIn.end(), arrayTestFact.begin(), + std::multiplies< int64_t >{} ); + + cout << "\nSYCL compute's factorial results:\n"; + for( auto a : arrayFact ) + { + cout << a << " "; + } + cout << std::endl; + + // Check if the results are correct. + const bool bEqual = std::equal( arrayFact.begin(), arrayFact.end(), + arrayTestFact.begin() ); + if( !bEqual ) + { + cout << "SYCL factorial computation incorrect!\n"; + cout << "std::partial_sum's results:\n"; + + for( auto a : arrayTestFact ) + { + cout << a << " "; + } + + return 1; + } + + return 0; +} \ No newline at end of file