diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/c_cpp_properties.json b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/c_cpp_properties.json
new file mode 100644
index 0000000000..2b8e56ba4b
--- /dev/null
+++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/c_cpp_properties.json
@@ -0,0 +1,17 @@
+{
+ "configurations": [
+ {
+ "name": "Linux",
+ "includePath": [
+ "${workspaceFolder}/**"
+ ],
+ "defines": [],
+ "compilerPath": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx",
+ "compilerArgs": [ "-fsycl" ],
+ "cStandard": "gnu17",
+ "cppStandard": "gnu++17",
+ "intelliSenseMode": "linux-gcc-x64"
+ }
+ ],
+ "version": 4
+}
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/launch.json b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/launch.json
new file mode 100644
index 0000000000..df305ab886
--- /dev/null
+++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/launch.json
@@ -0,0 +1,35 @@
+{
+ "configurations": [
+ {
+ "name": "C/C++: Intel icpx build and debug gaussian_blur",
+ "type": "cppdbg",
+ "request": "launch",
+ "program": "${workspaceFolder}/bin/${config:programName}_d",
+ "args": ["${workspaceFolder}/bin/sample_image.jpg"],
+ "stopAtEntry": true,
+ "cwd": "${fileDirname}",
+ "environment": [],
+ "externalConsole": false,
+ "MIMode": "gdb",
+ "setupCommands": [
+ {
+ "description": "Enable pretty-printing for gdb",
+ "text": "-enable-pretty-printing",
+ "ignoreFailures": true
+ },
+ {
+ "description": "Set Disassembly Flavor to Intel",
+ "text": "-gdb-set disassembly-flavor intel",
+ "ignoreFailures": true
+ },
+ {
+ "description": "Needed by Intel oneAPI: Disable target async",
+ "text": "set target-async off",
+ "ignoreFailures": true
+ }
+ ],
+ "preLaunchTask": "gaussian_blur Debug C/C++: Intel icpx build active file",
+ "miDebuggerPath": "/opt/intel/oneapi/debugger/latest/gdb/intel64/bin/gdb-oneapi"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/settings.json b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/settings.json
new file mode 100644
index 0000000000..d7ab33f09f
--- /dev/null
+++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+ "programName": "gaussian_blur"
+}
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/tasks.json b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/tasks.json
new file mode 100644
index 0000000000..70aca5a0ce
--- /dev/null
+++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/.vscode/tasks.json
@@ -0,0 +1,49 @@
+{
+ "version": "2.0.0",
+ "tasks": [
+ {
+ "type": "cppbuild",
+ "label": "gaussian_blur Debug C/C++: Intel icpx build active file",
+ "command": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx",
+ "args": [
+ "-fsycl",
+ "-fdiagnostics-color=always",
+ "-fsycl-device-code-split=per_kernel",
+ "-fno-limit-debug-info",
+ "-g",
+ "-O0",
+ "${workspaceFolder}/src/${config:programName}.cpp",
+ "-o",
+ "${workspaceFolder}/bin/${config:programName}_d"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "problemMatcher": [
+ "$gcc"
+ ],
+ "group": "build",
+ "detail": "compiler: /opt/intel/oneapi/compiler/latest/linux/bin/icpx"
+ },
+ {
+ "type": "cppbuild",
+ "label": "gaussian_blur Release C/C++: Intel icpx build active file",
+ "command": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx",
+ "args": [
+ "-fsycl",
+ "-DNDEBUG",
+ "${workspaceFolder}/src/${config:programName}.cpp",
+ "-o",
+ "${workspaceFolder}/bin/${config:programName}"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "problemMatcher": [
+ "$gcc"
+ ],
+ "group": "build",
+ "detail": "compiler: /opt/intel/oneapi/compiler/latest/linux/bin/icpx"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/CMakeLists.txt b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/CMakeLists.txt
new file mode 100644
index 0000000000..9a59e896ed
--- /dev/null
+++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/CMakeLists.txt
@@ -0,0 +1,16 @@
+cmake_minimum_required (VERSION 3.4)  # first, so policies are set before any other command
+if(WIN32)  # pick the oneAPI compiler driver for the host platform
+  set(CMAKE_CXX_COMPILER "dpcpp-cl")
+  set(CMAKE_C_COMPILER "dpcpp-cl")
+else()
+  set(CMAKE_CXX_COMPILER "icpx")
+endif()
+set(CMAKE_CXX_STANDARD 17)
+if(NOT CMAKE_BUILD_TYPE)  # default to RELEASE only when no build type was requested
+  set(CMAKE_BUILD_TYPE "RELEASE")
+endif()
+if( CMAKE_BUILD_TYPE STREQUAL "DEBUG" )  # an explicit DEBUG request adds debug info
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
+endif()
+project (gaussian_blur)
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/Images/sample_image-blurred.png b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/Images/sample_image-blurred.png
new file mode 100644
index 0000000000..6decbd0b6c
Binary files /dev/null and b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/Images/sample_image-blurred.png differ
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/Images/sample_image.jpg b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/Images/sample_image.jpg
new file mode 100644
index 0000000000..be810ad43e
Binary files /dev/null and b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/Images/sample_image.jpg differ
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/LICENSE.txt b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/LICENSE.txt
new file mode 100644
index 0000000000..d645695673
--- /dev/null
+++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/LICENSE.txt
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/README.md b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/README.md
new file mode 100644
index 0000000000..06f7782232
--- /dev/null
+++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/README.md
@@ -0,0 +1,101 @@
+# Image Gaussian Blur example program
+
+## Purpose
+This SYCL code example implements a Gaussian blur filter, blurring
+either a JPG or PNG image from the command line. The original file is not modified.
+The output file is in a PNG format.
+
+__Output Image:__
+
+![Blurred sample image](Images/sample_image-blurred.png)
+
+
+## Prerequisites
+
+| Minimum Requirements | Description
+|:--- |:---
+| OS | Linux* Ubuntu* 20.04.5 LTS
+| Hardware | Intel® 11th Gen Intel Core i7-1185G7 + Mesa Intel Xe Graphics
+| Compiler Toolchain | Visual Studio Code IDE, Intel oneAPI Base Toolkit (including its prerequisites)
+| Libraries | Install Intel oneAPI Base Toolkit
+| Tools | Visual Studio Code 1.73.1, Microsoft C/C++ extensions for VS Code, a PNG-capable image viewer
+
+## Build and Run using Visual Studio Code
+
+### Linux*
+
+Within a terminal window change directory to this project's folder. At the
+terminal prompt type:
+
+```
+cd GaussianImageFilter
+code .
+```
+
+Visual Studio Code will open this project displaying its files in the Explorer
+pane.
+The project is already set up with build configurations to build either a
+debug build or a release build of the program. When a program is built, it is
+placed in the bin directory of this project's top folder.
+
+To build the program hit Ctrl+Shift+b and choose the type of program to build.
+The debug executable will have a '_d' appended to its name.
+
+To blur an image, copy Images/sample_image.jpg to the bin directory.
+To execute the program, type in the Visual Studio Code terminal window:
+```
+cd bin
+./gaussian_blur_d sample_image.jpg
+```
+A new image file will appear in the bin directory 'sample_image-blurred.png'.
+To view the image, select it in the directory folder app and hit return.
+Ubuntu will display the image using the preview app.
+
+## Build and Run using CMake
+### Linux*
+```
+mkdir build
+cd build
+cmake ..
+make
+```
+
+To blur an image, copy Images/sample_image.jpg to the directory of the new
+executable. Type in the terminal window:
+
+```
+cd src
+./gaussian_blur sample_image.jpg
+```
+Open the resulting file: `sample_image-blurred.png` with an image viewer.
+
+## Debug the program using Visual Studio Code
+
+### Linux*
+
+Due to an issue with the image load library function stbi_load, make the
+directory bin (if it does not exist already) and copy the sample_image.jpg
+file into it. This will allow the program to find the file and continue the
+debug session.
+
+To debug the program, either choose from the IDE's run menu
+'Start debugging' or hit F5 on the keyboard.
+The debug launch.json configuration file sets up the debug session to:
+* Halt the program on entry to main().
+Use the GUI debug panel's buttons (or the F10 key) to step over lines of code
+and watch the program advance.
+Breakpoints can be set either in the main code or the kernel code.
+
+Note: Setting breakpoints in the kernel code does not present the normal
+ step through code behavior. Instead a breakpoint event is occurring
+ on each thread being executed and so switches to the context of
+ that thread. To step through the code of a single thread, use the
+ Intel gdb-oneapi command 'set scheduler-locking step' or 'on' in the
+ IDE's debug console prompt. As this is not the main thread, be sure
+ to revert this setting on returning to debug any host side code.
+ Use the command 'set scheduler-locking replay' or 'off'.
+
+## License
+
+Code samples are licensed under the Apache 2.0 license. See
+[LICENSE.txt](LICENSE.txt) for details.
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/src/CMakeLists.txt b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/src/CMakeLists.txt
new file mode 100644
index 0000000000..53d0c0958a
--- /dev/null
+++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/src/CMakeLists.txt
@@ -0,0 +1,6 @@
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fsycl")  # appends -O2 and -fsycl to the inherited C++ flags
+set(CMAKE_BUILD_TYPE "RelWithDebInfo")  # NOTE(review): unconditionally replaces any build type chosen earlier - confirm this is intended
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")  # assigns the variable its own value; no linker flags are added
+add_executable (gaussian_blur gaussian_blur.cpp)
+target_link_libraries(gaussian_blur OpenCL sycl)
+
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/src/gaussian_blur.cpp b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/src/gaussian_blur.cpp
new file mode 100644
index 0000000000..8fd35018dd
--- /dev/null
+++ b/DirectProgramming/DPC++/DenseLinearAlgebra/GaussianImageFilter/src/gaussian_blur.cpp
@@ -0,0 +1,325 @@
+//============================================================================
+// Copyright © 2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// ===========================================================================
+
+//****************************************************************************
+//
+// Description:
+// This advanced SYCL code example implements a Gaussian blur filter, blurring
+// a JPG or PNG image from the command line. The original file is not modified.
+// The output file is a PNG image.
+//
+// Usage:
+// The program blurs an image provided on the command line.
+//
+//*****************************************************************************
+
+// SYCL or oneAPI toolkit headers:
+#include <sycl/sycl.hpp>
+
+// Third party headers:
+#include <cmath>
+#include <iostream>
+// These public domain headers implement useful image reading and writing
+// functions. Find in ${oneAPI}/dev-utilities/include
+#define STB_IMAGE_IMPLEMENTATION
+#include "stb/stb_image.h"
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include "stb/stb_image_write.h"
+
+// Forward declaration of this example's SYCL kernels
+class KernelFillGaussian;
+class KernelGaussian;
+
+using namespace sycl;
+using namespace std;
+
+// Chooses a work-group (local) size for the given global work size. The
+// best way to *control* performance is to choose the sizes explicitly; this
+// helper instead starts from a per-device candidate and halves it until it
+// divides the global work size evenly, so each result is a power of two no
+// larger than that candidate. In this code it might prove better to pad the
+// image along one dimension so that the full candidate always fits, but
+// that introduces other complexities.
+range< 2 > GetOptimalLocalRange( range< 2 > globalSize, device hw )
+{
+  // Starting candidate for dimension 0: 64 on GPU devices, where each
+  // compute unit is made of many smaller processors, and 4 (a common vector
+  // width) on every other device. Dimension 1 starts, and so stays, at 1.
+  const size_t firstGuess = hw.is_gpu()
+    ? 64
+    : 4;
+  range< 2 > localSize( firstGuess, 1 );
+
+  // Per dimension: try the next lower power of two until the candidate
+  // divides the global size. Every size is divisible by one, so the search
+  // ends at one when the global size has no power-of-two component.
+  for( int dim = 0; dim < 2; ++dim )
+  {
+    // Work on a copy and store the value that finally divides evenly.
+    auto candidate = localSize[ dim ];
+
+    while( globalSize[ dim ] % candidate != 0 )
+    {
+      candidate >>= 1;
+    }
+
+    localSize[ dim ] = candidate;
+  }
+
+  return localSize;
+}
+
+// Handler for asynchronous SYCL errors. Faults raised by asynchronously
+// executed code (inside a command group or a kernel) can occur in a different
+// stack frame, so they cannot propagate up the stack of the code that
+// submitted the work and would by default be lost. Supplying this handler
+// function is how they are retrieved: every entry is rethrown and reported.
+auto exception_handler = []( sycl::exception_list exceptions )
+{
+  for( const std::exception_ptr &pending : exceptions )
+  {
+    try
+    {
+      std::rethrow_exception( pending );
+    }
+    catch( const sycl::exception &asyncError )
+    {
+      const char *const banner =
+        "Queue handler caught asynchronous SYCL exception:\n";
+      std::cout << banner << asyncError.what() << std::endl;
+    }
+  }
+};
+
+// The Gaussian program: blurs the image named by argv[ 1 ]; returns 0 or -1.
+int main( int argc, char* argv[] )
+{
+  bool bProgramError = false;
+
+  // Validate user input; without a file name there is nothing to load.
+  if( argc < 2 )
+  {
+    std::cout
+      << "Please provide a JPEG or PNG image as an argument to this program."
+      << std::endl;
+    return -1;
+  }
+
+  // ********************
+  // Input image handling
+  // ********************
+  // The image dimensions will be set by the library, as will the number of
+  // channels. However, passing a number of channels will force the image
+  // data to be returned in that format, regardless of what the original image
+  // looked like. The header has a mapping from int values to types - 4 means
+  // RGBA.
+  int inputWidth = 0;
+  int inputHeight = 0;
+  int inputChannels = 0;
+
+  // Number of color channels: RGBA
+  const int numChannels = 4;
+  const char *pImageFileName = argv[ 1 ];
+  // stbi_load memory must be released with stbi_image_free, not delete[].
+  unique_ptr< unsigned char, decltype( &stbi_image_free ) > pInputImg(
+    stbi_load( pImageFileName, &inputWidth, &inputHeight, &inputChannels,
+      numChannels ), &stbi_image_free );
+  if( pInputImg == nullptr )
+  {
+    std::cout << "Failed to load image file (is argv[1] a valid image file?)"
+      << std::endl;
+    return -1;
+  }
+
+  // RAII resource; the size is computed in size_t to avoid int overflow.
+  const size_t outputBytes = static_cast< size_t >( inputWidth ) *
+    static_cast< size_t >( inputHeight ) * numChannels;
+  unique_ptr< unsigned char [] > pOutputImg( new unsigned char[ outputBytes ] );
+
+  try
+  {
+    sycl::device hw = device( sycl::cpu_selector_v );
+    queue myQueue( hw, exception_handler );
+
+    // *******************************************
+    // Create gaussian convolution matrix and fill
+    // *******************************************
+    const float pi = std::atan( 1 ) * 4;
+    constexpr auto gaussStdDev = 2;
+    constexpr auto gaussDelta = 6;
+    const int gaussMatrixRange = (gaussDelta * gaussStdDev);
+    const float gaussStdDevFactor = 2 * gaussStdDev * gaussStdDev;
+    const float piFactor = gaussStdDevFactor * pi;
+    const int gaussianBlurRange = gaussMatrixRange * gaussMatrixRange;
+    vector< float > gaussianBlurMatrix( gaussianBlurRange );
+
+    // The nd_range contains the total work as well as the local work size
+    // (i.e. the number of threads in the local group). Here, we attempt to
+    // find a range close to the device's preferred size that also divides
+    // the global size neatly.
+    auto optRange = GetOptimalLocalRange(
+      range< 2 >{ gaussMatrixRange, gaussMatrixRange }, myQueue.get_device() );
+    const nd_range< 2 > gaussianBlurNDRange(
+      range< 2 >{ gaussMatrixRange, gaussMatrixRange }, optRange );
+    buffer bufGaussian( gaussianBlurMatrix );
+
+    // Enqueue KernelFillGaussian
+    myQueue.submit( [&]( handler &cgh )
+    {
+      const auto ptrGBlur =
+        bufGaussian.get_access< access::mode::discard_write >( cgh );
+      cgh.parallel_for< KernelFillGaussian >( gaussianBlurNDRange,
+      [=]( nd_item< 2 > item )
+      {
+        // Get the 2D x and y indices
+        const auto idX = item.get_global_id( 0 );
+        const auto idY = item.get_global_id( 1 );
+        const auto index = idX * gaussMatrixRange + idY;
+        // Convert the unsigned ids before subtracting to get signed offsets.
+        const int x = static_cast< int >( idX ) - gaussDelta;
+        const int y = static_cast< int >( idY ) - gaussDelta;
+        float gaussValue = sycl::exp( -1.0f * (x*x + y*y) / gaussStdDevFactor );
+        gaussValue /= piFactor;
+        ptrGBlur[ index ] = gaussValue;
+      });
+    });
+
+    // ********************************************************
+    // Using gaussian convolution matrix, blur the input image.
+    // ********************************************************
+    // Images need a void * pointing to the data, and enums describing the
+    // type of the image (since a void * carries no type information). It
+    // also needs a range which describes the image's dimensions.
+    using co = sycl::image_channel_order;
+    using ct = sycl::image_channel_type;
+    // The image data is unsigned char, but the image constructor takes void *.
+    void *pInputData = static_cast< void * >( pInputImg.get() );
+    void *pOutputData = static_cast< void * >( pOutputImg.get() );
+    // This range represents the full amount of work to be done across the
+    // image. We dispatch one thread per pixel.
+    range< 2 > imgRange( inputWidth, inputHeight );
+    image< 2 > imageIn( pInputData, co::rgba, ct::unorm_int8, imgRange );
+    image< 2 > imageOut( pOutputData, co::rgba, ct::unorm_int8, imgRange );
+    optRange = GetOptimalLocalRange( imgRange, myQueue.get_device() );
+    auto myRange = nd_range< 2 >( imgRange, optRange );
+    constexpr auto offset = gaussDelta;
+
+    // Enqueue KernelGaussian
+    // Because of the dependency on the gaussian convolution grid, the call
+    // graph will automatically schedule this kernel to run after the
+    // KernelFillGaussian is complete.
+    myQueue.submit( [&]( handler &cgh )
+    {
+      // Images still require accessors, like buffers, except the target is
+      // always access::target::image.
+      accessor< float4, 2, access::mode::read, access::target::image >
+        accImgInPtr( imageIn, cgh );
+      accessor< float4, 2, access::mode::discard_write, access::target::image >
+        accImgOutPtr( imageOut, cgh );
+      const auto ptrGBlur =
+        bufGaussian.get_access< access::mode::read >( cgh );
+
+      // The sampler is used to map user-provided co-ordinates to pixels in
+      // the image.
+      sampler smpl( coordinate_normalization_mode::unnormalized,
+        addressing_mode::none, filtering_mode::nearest );
+
+      // Setting breakpoints in the kernel code does not present the normal
+      // step through code behavior. Instead a breakpoint event occurs on
+      // each thread being executed and so switches to the context of that
+      // thread. To step through the code of a single thread, use the Intel
+      // gdb-oneapi command 'set scheduler-locking step' or 'on' in the IDE's
+      // debug console prompt. As this is not the main thread, revert this
+      // setting ('set scheduler-locking replay' or 'off') to debug host code.
+      cgh.parallel_for< KernelGaussian >( myRange, [=](nd_item< 2 > item)
+      {
+        const int idX = static_cast< int >( item.get_global_id( 0 ) );
+        const int idY = static_cast< int >( item.get_global_id( 1 ) );
+        const auto outputCoords = int2( idX, idY );
+        // A boundary is used so the convolution grid does not fall off the
+        // sides of the image. Keep it simple, just copy those pixels at the
+        // edges of the image.
+        const int hitY1 = idY - offset;
+        const int hitY2 = inputHeight - idY - offset;
+        const int hitX1 = idX - offset;
+        const int hitX2 = inputWidth - idX - offset;
+        const bool bBoundaryY = (hitY1 < 0) || (hitY2 < 0);
+        const bool bBoundaryX = (hitX1 < 0) || (hitX2 < 0);
+        float4 newPixel = float4( 0.0f, 0.0f, 0.0f, 0.0f );
+
+        if( !(bBoundaryX || bBoundaryY) )
+        {
+          // Perform a convolution on a central pixel at idX idY
+          for( int x = 0; x < gaussMatrixRange; x++ )
+          {
+            for( int y = 0; y < gaussMatrixRange; y++ )
+            {
+              const auto index = x * gaussMatrixRange + y;
+              const float value = ptrGBlur[ index ];
+              const auto inputCoords =
+                int2( idX + x - offset, idY + y - offset );
+              newPixel += accImgInPtr.read( inputCoords, smpl ) * value;
+            }
+          }
+        }
+        else
+        {
+          // Just duplicate the pixel at idX idY
+          const auto inputCoords = int2( idX, idY );
+          newPixel = accImgInPtr.read( inputCoords, smpl );
+        }
+        newPixel.w() = 1.0f;
+        accImgOutPtr.write( outputCoords, newPixel );
+      });
+    });
+    // The host/main thread is asked to wait here until all enqueued kernels
+    // have completed execution.
+    myQueue.wait_and_throw();
+  }
+  // Synchronous errors are classical C++ exceptions
+  catch( sycl::exception const &e )
+  {
+    bProgramError = true;
+    cout <<
+      "Wrap catch caught synchronous SYCL exception:\n" << e.what() << std::endl;
+  }
+
+  if( bProgramError )
+  {
+    std::cout << "Program failed." << std::endl;
+    return -1;
+  }
+
+  // ****************************
+  // Output the new blurred image
+  // ****************************
+  // Attempt to change the name from x.png or x.jpg to x-blurred.png.
+  // If the code cannot find a '.', it simply appends "-blurred" to the name.
+  std::string outputFilePath;
+  std::string inputName( argv[ 1 ] );
+  auto pos = inputName.find_last_of( "." );
+  if( pos == std::string::npos )
+  {
+    outputFilePath = inputName + "-blurred";
+  }
+  else
+  {
+    inputName.erase( pos, inputName.size() );
+    outputFilePath = inputName + "-blurred" + ".png";
+  }
+
+  if( stbi_write_png( outputFilePath.c_str(), inputWidth, inputHeight,
+    numChannels, pOutputImg.get(), 0 ) == 0 )  // 0 signals a failed write
+  {
+    std::cout << "Failed to write image file " << outputFilePath << std::endl;
+    return -1;
+  }
+
+  std::cout <<
+    "Program success, the image is successfully blurred!" << std::endl;
+
+  return 0;
+}
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/c_cpp_properties.json b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/c_cpp_properties.json
new file mode 100644
index 0000000000..2b8e56ba4b
--- /dev/null
+++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/c_cpp_properties.json
@@ -0,0 +1,17 @@
+{
+ "configurations": [
+ {
+ "name": "Linux",
+ "includePath": [
+ "${workspaceFolder}/**"
+ ],
+ "defines": [],
+ "compilerPath": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx",
+ "compilerArgs": [ "-fsycl" ],
+ "cStandard": "gnu17",
+ "cppStandard": "gnu++17",
+ "intelliSenseMode": "linux-gcc-x64"
+ }
+ ],
+ "version": 4
+}
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/launch.json b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/launch.json
new file mode 100644
index 0000000000..6d3bc25b7d
--- /dev/null
+++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/launch.json
@@ -0,0 +1,48 @@
+{
+ "configurations": [
+ {
+ "name": "C/C++: Intel icpx build and debug MonteCarloPi",
+ "type": "cppdbg",
+ "request": "launch",
+ "program": "${workspaceFolder}/bin/${config:programName}_d",
+ "args": ["${input:args}"],
+ "stopAtEntry": true,
+ "cwd": "${fileDirname}",
+ "environment": [],
+ "externalConsole": false,
+ "MIMode": "gdb",
+ "setupCommands": [
+ {
+ "description": "Enable pretty-printing for gdb",
+ "text": "-enable-pretty-printing",
+ "ignoreFailures": true
+ },
+ {
+ "description": "Set Disassembly Flavor to Intel",
+ "text": "-gdb-set disassembly-flavor intel",
+ "ignoreFailures": true
+ },
+ {
+ "description": "Needed by Intel oneAPI: Disable target async",
+ "text": "set target-async off",
+ "ignoreFailures": true
+ }
+ ],
+ "preLaunchTask": "MonteCarloPi Debug C/C++: Intel icpx build active file",
+ "miDebuggerPath": "/opt/intel/oneapi/debugger/latest/gdb/intel64/bin/gdb-oneapi"
+ }
+ ],
+ "inputs" : [
+ {
+ "id": "args",
+ "type": "pickString",
+ "description": "Program args",
+ "default": "cpu",
+ "options": [
+ "cpu",
+ "gpu",
+ "accelerator"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/settings.json b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/settings.json
new file mode 100644
index 0000000000..a2e868b496
--- /dev/null
+++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+ "programName": "MonteCarloPi"
+}
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/tasks.json b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/tasks.json
new file mode 100644
index 0000000000..4383e21898
--- /dev/null
+++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/.vscode/tasks.json
@@ -0,0 +1,48 @@
+{
+ "version": "2.0.0",
+ "tasks": [
+ {
+ "type": "cppbuild",
+ "label": "MonteCarloPi Debug C/C++: Intel icpx build active file",
+ "command": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx",
+ "args": [
+ "-fsycl",
+ "-fdiagnostics-color=always",
+ "-fno-limit-debug-info",
+ "-g",
+ "-O0",
+ "${workspaceFolder}/src/${config:programName}.cpp",
+ "-o",
+ "${workspaceFolder}/bin/${config:programName}_d"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "problemMatcher": [
+ "$gcc"
+ ],
+ "group": "build",
+ "detail": "compiler: /opt/intel/oneapi/compiler/latest/linux/bin/icpx"
+ },
+ {
+ "type": "cppbuild",
+ "label": "MonteCarloPi Release C/C++: Intel icpx build active file",
+ "command": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx",
+ "args": [
+ "-fsycl",
+ "-DNDEBUG",
+ "${workspaceFolder}/src/${config:programName}.cpp",
+ "-o",
+ "${workspaceFolder}/bin/${config:programName}"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "problemMatcher": [
+ "$gcc"
+ ],
+ "group": "build",
+ "detail": "compiler: /opt/intel/oneapi/compiler/latest/linux/bin/icpx"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/CMakeLists.txt b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/CMakeLists.txt
new file mode 100644
index 0000000000..482e1862b1
--- /dev/null
+++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Declare the minimum CMake version first so that its policies govern every
+# command that follows.
+cmake_minimum_required (VERSION 3.4)
+
+# Select the Intel oneAPI compiler driver. This is done before project() so
+# that the choice is in place when the languages are enabled.
+if(WIN32)
+  set(CMAKE_CXX_COMPILER "dpcpp-cl")
+  set(CMAKE_C_COMPILER "dpcpp-cl")
+else()
+  set(CMAKE_CXX_COMPILER "icpx")
+endif()
+set(CMAKE_CXX_STANDARD 17)
+
+# Default to a release build only when the user has not chosen a build type.
+# Test the variable itself: "DEFINED ${CMAKE_BUILD_TYPE}" would instead test a
+# variable named after the value (e.g. DEBUG) and overwrite the user's choice.
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE "RELEASE")
+endif()
+if( CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
+endif()
+
+project (MonteCarloPi)
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/LICENSE.txt b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/LICENSE.txt
new file mode 100644
index 0000000000..d645695673
--- /dev/null
+++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/LICENSE.txt
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/README.md b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/README.md
new file mode 100644
index 0000000000..9b5ac8f04b
--- /dev/null
+++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/README.md
@@ -0,0 +1,85 @@
+# Monte Carlo Pi example program
+
+## Purpose
+Monte Carlo Simulation is a broad category of computation that utilizes
+statistical analysis to reach a result. This `Monte Carlo Pi` sample uses the
+Monte Carlo Procedure to estimate the value of pi.
+
+## Prerequisites
+
+| Minimum Requirements | Description
+|:--- |:---
+| OS | Linux* Ubuntu* 20.04.5 LTS
+| Hardware | Intel® 11th Gen Intel Core i7-1185G7 + Mesa Intel Xe Graphics
+| Compiler Toolchain | Visual Studio Code IDE, Intel oneAPI Base Toolkit (including its prerequisites)
+| Libraries | Install Intel oneAPI Base Toolkit
+| Tools | Visual Studio Code 1.73.1, Microsoft C/C++ extension for Visual Studio Code
+
+## Build and Run using Visual Studio Code
+
+### Linux*
+
+Within a terminal window change directory to this project's folder. At the
+terminal prompt type:
+
+```
+cd MonteCarloPi_v2
+code .
+```
+
+Visual Studio Code will open this project displaying its files in the Explorer
+pane.
+The project is already set up with build configurations to build either a
+debug build or a release build of the program. When a program is built, it is
+placed in the bin directory of this project's top folder.
+
+To build the program hit Ctrl+Shift+b and choose the type of program to build.
+The debug executable will have a '_d' appended to its name.
+
+To execute the program, type in the Visual Studio Code terminal window:
+```
+cd bin
+./MonteCarloPi_d cpu
+```
+
+## Build and Run using CMake
+### Linux*
+```
+mkdir build
+cd build
+cmake ..
+make
+```
+
+To execute the program type in the terminal window:
+
+```
+cd build/src
+./MonteCarloPi cpu
+```
+
+## Debug the program using Visual Studio Code
+
+### Linux*
+
+To debug the program, either choose from the IDE's run menu
+'Start debugging' or hit F5 on the keyboard.
+The debug launch.json configuration file defines the debug session to:
+* Halt the program at the first line of code in main().
+Use the GUI debug panel's buttons to step over code (key F10) lines to see the
+program advance.
+Breakpoints can be set either in the main code or the kernel code.
+
+Note: Setting breakpoints in the kernel code does not present the normal
+ step through code behavior. Instead a breakpoint event is occurring
+ on each thread being executed and so switches to the context of
+ that thread. To step through the code of a single thread, use the
+ Intel gdb-oneapi command 'set scheduler-locking step' or 'on' in the
+ IDE's debug console prompt. As this is not the main thread, be sure
+ to revert this setting on returning to debug any host side code.
+ Use the command 'set scheduler-locking replay' or 'off'.
+
+## License
+
+Code samples are licensed under the Apache 2.0 license. See
+[LICENSE.txt](LICENSE.txt) for details.
diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/CMakeLists.txt b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/CMakeLists.txt
new file mode 100644
index 0000000000..a40a39873d
--- /dev/null
+++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/CMakeLists.txt
@@ -0,0 +1,6 @@
+# Append the optimization level and the SYCL switch to the C++ compiler flags.
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fsycl")
+# NOTE(review): this sets the build type unconditionally, replacing whatever
+# value was already in CMAKE_BUILD_TYPE at this point - confirm it is intended.
+set(CMAKE_BUILD_TYPE "RelWithDebInfo")
+# Self-assignment: the executable linker flags are left unchanged.
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
+# Build the MonteCarloPi executable from its single source file and link it
+# against the OpenCL and sycl libraries.
+add_executable (MonteCarloPi MonteCarloPi.cpp)
+target_link_libraries(MonteCarloPi OpenCL sycl)
+
diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/MonteCarloPi.cpp b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/MonteCarloPi.cpp
new file mode 100644
index 0000000000..e8553405d3
--- /dev/null
+++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/MonteCarloPi.cpp
@@ -0,0 +1,338 @@
+//============================================================================
+// Copyright © 2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// ===========================================================================
+
+//****************************************************************************
+//
+// Description:
+// Example of Monte-Carlo Pi approximation algorithm in SYCL. Also,
+// demonstrating how to query the maximum number of work-items in a
+// work-group to check if a kernel can be executed with the initially
+// desired work-group size.
+//
+// Usage:
+// The program takes one argument: host / cpu / gpu / accelerator.
+//
+//*****************************************************************************
+
+// SYCL or oneAPI toolkit headers:
+#include
+
+// Third party headers:
+#include
+#include
+#include
+#include
+#include
+
+// In-house headers:
+#include "device_selector.hpp"
+
+using namespace std;
+using namespace sycl;
+
+// Forward declarations:
+size_t GetBestWorkGroupSize( size_t work_group_size,
+ const sycl::device &device,
+ const sycl::kernel &kernel );
+
+// Function object (functor) implementing the Monte-Carlo Pi SYCL kernel.
+// Each work item classifies one point; the first work item of every
+// work-group then totals the group's hits and stores that total.
+class CMonteCarloPiKernel
+{
+  // Accessor aliases used by the kernel: read-only and write-only views of
+  // 1D global buffers, and a read/write view of 1D local memory.
+  template< typename dataT >
+  using readGlobalAccessor =
+    sycl::accessor< dataT, 1, sycl::access::mode::read,
+                    sycl::access::target::global_buffer >;
+  template< typename dataT >
+  using writeGlobalAccessor =
+    sycl::accessor< dataT, 1, sycl::access::mode::write,
+                    sycl::access::target::global_buffer >;
+  template< typename dataT >
+  using readWriteLocalAccessor =
+    sycl::accessor< dataT, 1, sycl::access::mode::read_write,
+                    sycl::access::target::local >;
+
+  public:
+  // Keeps copies of the three accessors the kernel body works with:
+  //  ptrPoints       - the input points, one per work item
+  //  ptrResults      - the output sums, one per work-group
+  //  ptrResultsLocal - local scratch space, one entry per work item in a group
+  CMonteCarloPiKernel( readGlobalAccessor< sycl::cl_float2 > ptrPoints,
+                       writeGlobalAccessor< sycl::cl_int > ptrResults,
+                       readWriteLocalAccessor< sycl::cl_int > ptrResultsLocal )
+  : m_ptrPoints( ptrPoints )
+  , m_ptrResults( ptrResults )
+  , m_ptrResultsLocal( ptrResultsLocal )
+  {}
+
+  // Kernel body, executed once per work item of a 1D ND-range.
+  //
+  // Debugging note: a breakpoint set in kernel code fires on every thread
+  // that executes it, so the debugger keeps switching thread context rather
+  // than stepping normally. To follow a single thread, enter the gdb-oneapi
+  // command 'set scheduler-locking step' (or 'on') in the IDE's debug
+  // console. Remember to undo it with 'set scheduler-locking replay' (or
+  // 'off') before returning to debug host side code.
+  void operator()( sycl::nd_item< 1 > item ) const
+  {
+    const size_t localIdx = item.get_local_id( 0 );
+
+    // Fetch this work item's point and test it against the unit circle.
+    // sycl::length yields sqrt(point.x * point.x + point.y * point.y).
+    const sycl::float2 sample = m_ptrPoints[ item.get_global_id( 0 ) ];
+    const bool bInsideCircle = sycl::length( sample ) <= 1.0f;
+
+    // Record the outcome (1 = inside, 0 = outside) in local memory
+    m_ptrResultsLocal[ localIdx ] = bInsideCircle ? 1 : 0;
+
+    // Every work item of the group must have stored its outcome before the
+    // outcomes are added up.
+    item.barrier( sycl::access::fence_space::local_space );
+
+    // Only the first work item of the group performs the summation
+    if( localIdx != 0 )
+    {
+      return;
+    }
+
+    const size_t groupSize = item.get_local_range( 0 );
+    int hits = 0;
+    for( size_t slot = 0; slot < groupSize; ++slot )
+    {
+      hits += ( m_ptrResultsLocal[ slot ] == 1 ) ? 1 : 0;
+    }
+
+    // Publish the group's total to global memory, indexed by group
+    m_ptrResults[ item.get_group( 0 ) ] = hits;
+  }
+
+  private:
+  readGlobalAccessor< sycl::cl_float2 > m_ptrPoints;
+  writeGlobalAccessor< sycl::cl_int > m_ptrResults;
+  readWriteLocalAccessor< sycl::cl_int > m_ptrResultsLocal;
+};
+
+
+// Asynchronous errors hander, catch faults in asynchronously executed code
+// inside a command group or a kernel. They can occur in a different stackframe,
+// asynchronous error cannot be propagated up the stack.
+// By default, they are considered 'lost'. The way in which we can retrieve them
+// is by providing an error handler function.
+auto exception_handler = []( sycl::exception_list exceptions )
+{
+ for( std::exception_ptr const &e : exceptions )
+ {
+ try
+ {
+ std::rethrow_exception( e );
+ }
+ catch( sycl::exception const &e )
+ {
+ std::cout << "Queue handler caught asynchronous SYCL exception:\n"
+ << e.what() << std::endl;
+ }
+ }
+};
+
+// The Monte Carlo Pi program.
+// Discovers the available target devices, validates the user's device choice
+// given on the command line, runs the Monte-Carlo Pi kernel on that device and
+// prints the resulting approximation of pi.
+// Returns 0 on success, 1 if a SYCL exception was caught and 2 if any other
+// C++ exception was caught; set-up failures terminate the program via exit().
+int main( int argc, char *argv[] )
+{
+  CUtilDeviceTargets utilsDev;
+  FnResult fnResult = utilsDev.DiscoverDevsWeWant();
+  if( !fnResult.bSuccess )
+  {
+    cerr << "Program failure: Unable to discover target devices on this platform.\n";
+    exit( -1 );
+  }
+
+  fnResult = UserCheckTheirInput( utilsDev, argc, argv );
+  if( !fnResult.bSuccess )
+  {
+    cerr << fnResult.strErrMsg << "\n";
+    exit( 1 );
+  }
+
+  bool bDoDevDiscovery = false;
+  fnResult = UserWantsToDiscoverPossibleTargets( argv, bDoDevDiscovery );
+  if( !fnResult.bSuccess )
+  {
+    cerr << fnResult.strErrMsg << "\n";
+    exit( -1 );
+  }
+  if( bDoDevDiscovery ) exit( 1 );
+
+  const SDeviceFoundProxy *pUsersChosenDevice = utilsDev.GetDevUsersFirstChoice();
+  if( pUsersChosenDevice == nullptr )
+  {
+    cerr << "Program failure: Did not create a valid target device object.\n";
+    exit( -1 );
+  }
+
+  constexpr size_t iterations = 1 << 20;
+  size_t workGroupSize = 1 << 10;
+
+  // Container for the sum calculated per each work-group.
+  std::vector< sycl::cl_int > arrayResults;
+
+  // Generate random points on the host - one point for each work item (thread)
+  std::vector< sycl::float2 > arrayPoints( iterations );
+  // Fill up with (pseudo) random values in the range: [0, 1)
+  std::random_device r;
+  std::default_random_engine e( r() );
+  std::uniform_real_distribution< float > dist;
+  std::generate( arrayPoints.begin(), arrayPoints.end(),
+    [&e, &dist]()
+    {
+      return sycl::float2( dist( e ), dist( e ) );
+    });
+
+  try
+  {
+    // Create a SYCL queue
+    queue queue( pUsersChosenDevice->theDevice, exception_handler );
+
+    string strTheDeviceBeingUsed;
+    fnResult = CUtilDeviceTargets::GetQueuesCurrentDevice( queue, strTheDeviceBeingUsed );
+    if( !fnResult.bSuccess )
+    {
+      cerr << fnResult.strErrMsg << "\n";
+      exit( -1 );
+    }
+    cout << strTheDeviceBeingUsed << "\n";
+
+    // Get device and display information: name and platform
+    const sycl::device hw = queue.get_device();
+    cout << "Selected " << hw.get_info< sycl::info::device::name >()
+         << " on platform "
+         << hw.get_info< sycl::info::device::platform >()
+              .get_info< sycl::info::platform::name >()
+         << std::endl;
+
+    // Force online compilation of all kernels in the hwContext now,
+    // unless already compiled for the device ahead-of-time.
+    const auto hwContext = queue.get_context();
+    const sycl::kernel_id kernelID =
+      sycl::get_kernel_id< CMonteCarloPiKernel >();
+    const auto hwKernelBundle =
+      sycl::get_kernel_bundle< sycl::bundle_state::executable >( hwContext );
+    const sycl::kernel kernel = hwKernelBundle.get_kernel( kernelID );
+
+    // If the desired work-group size doesn't satisfy the device, define a
+    // perfect/max work-group depending on the selected device and kernel
+    // maximum size allowance.
+    workGroupSize = GetBestWorkGroupSize( workGroupSize, hw, kernel );
+
+    // Size of the total sums that are going to be stored in the results vector
+    // is set based on the defined work-group size.
+    arrayResults.resize( iterations / workGroupSize );
+
+    // Allocate device memory
+    sycl::buffer< sycl::cl_float2 > buffPoints( arrayPoints.data(),
+      sycl::range<1>( iterations ) );
+    sycl::buffer< sycl::cl_int > buffResults( arrayResults.data(),
+      sycl::range< 1 >( iterations / workGroupSize ) );
+
+    queue.submit( [&](sycl::handler& cgh)
+    {
+      const size_t global_size = iterations;
+      const size_t local_size = workGroupSize;
+
+      // Get access to the data (points and results) on the device. The
+      // access modes are spelled out so that the accessors match the
+      // read-only / write-only accessor types the kernel's constructor takes.
+      const auto ptrPoints =
+        buffPoints.get_access< sycl::access::mode::read >( cgh );
+      const auto ptrResults =
+        buffResults.get_access< sycl::access::mode::write >( cgh );
+
+      // Allocate local memory on the device (to compute results)
+      const sycl::accessor< sycl::cl_int, 1, sycl::access::mode::read_write,
+                            sycl::access::target::local >
+        ptrResultsLocal( sycl::range< 1 >( local_size ), cgh );
+
+      // Run the kernel
+      cgh.parallel_for(
+        sycl::nd_range< 1 >( sycl::range< 1 >( global_size ),
+                             sycl::range< 1 >( local_size ) ),
+        CMonteCarloPiKernel( ptrPoints, ptrResults, ptrResultsLocal ) );
+    });
+  }
+  catch( const sycl::exception &e )
+  {
+    std::cerr << "SYCL exception caught: " << e.what() << std::endl;
+    return 1;
+  }
+  catch( const std::exception &e )
+  {
+    std::cerr << "C++ exception caught: " << e.what() << std::endl;
+    return 2;
+  }
+
+  // Sum the results (auto copied back to host). The buffers were destroyed at
+  // the end of the try block above, so arrayResults is read only after that.
+  int inCircle = 0;
+  for( const sycl::cl_int result : arrayResults )
+  {
+    inCircle += result;
+  }
+
+  // Calculate the final result of "pi"
+  float pi = (4.0f * inCircle) / iterations;
+  std::cout << "pi = " << pi << std::endl;
+
+  return 0;
+}
+
+
+// A helper to define a "perfect" work-group size dependent on the selected
+// device and the kernel's maximum allowance.
+//
+// Args:    workGroupSize - the work-group size the caller would like to use.
+//          device        - the device the kernel is going to run on.
+//          kernel        - the kernel whose device specific limit is queried
+//                          when the device is not a CPU.
+// Returns: workGroupSize when it is within the applicable limit, otherwise
+//          the limit itself (which is also reported on the console).
+size_t GetBestWorkGroupSize( const size_t workGroupSize,
+                             const sycl::device &device,
+                             const sycl::kernel &kernel )
+{
+  if( device.is_cpu() )
+  {
+    const size_t maxDeviceWorkGroupSize =
+      device.get_info< sycl::info::device::max_work_group_size >();
+
+    // Check if the desired work-group size will be allowed on the CPU device
+    // and fall back to the maximum possible size on that device in case the
+    // desired one is more than the allowed.
+    if( workGroupSize > maxDeviceWorkGroupSize )
+    {
+      cout << "Maximum work-group size for device "
+           << device.get_info< sycl::info::device::name >() << ": "
+           << maxDeviceWorkGroupSize << std::endl;
+
+      return maxDeviceWorkGroupSize;
+    }
+
+    return workGroupSize;
+  }
+  else
+  {
+    const size_t maxKernelWorkGroupSize = kernel.get_info<
+      sycl::info::kernel_device_specific::work_group_size >( device );
+
+    // Verify if the kernel can be executed with our desired work-group size,
+    // and if it can't use the maximum allowed kernel work-group size for the
+    // selected device.
+    if( workGroupSize > maxKernelWorkGroupSize )
+    {
+      // get_info needs an explicit info descriptor; report the device name,
+      // as the CPU branch above does.
+      cout << "Maximum work-group size for "
+           << typeid( CMonteCarloPiKernel ).name() << " on device "
+           << device.get_info< sycl::info::device::name >() << ": "
+           << maxKernelWorkGroupSize << "\n";
+
+      return maxKernelWorkGroupSize;
+    }
+
+    // Otherwise, the work-size will stay the originally desired one
+    return workGroupSize;
+  }
+}
+
diff --git a/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/device_selector.hpp b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/device_selector.hpp
new file mode 100644
index 0000000000..a515c547be
--- /dev/null
+++ b/DirectProgramming/DPC++/MapReduce/MonteCarloPi_v2/src/device_selector.hpp
@@ -0,0 +1,421 @@
+//==============================================================
+// Copyright (C) Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#pragma once
+
+// SYCL or oneAPI toolkit headers:
+#include
+
+// Third party headers:
+#include
+
+using namespace std;
+using namespace sycl;
+
+//++
+//============================================================================
+// Details: Common code utility. User defined enumeration of the SYCL device
+// target types desired.
+// An 'eHost' is omitted because SYCL 2020 deprecates host device
+// selection.
+//--
+enum EDevsWeWant
+{
+ eNotValid = 0, // Default
+ eCPU = 1,
+ eGPU = 2,
+ eAccelerator = 3, // i.e. a FPGA type device
+ eCount = 4 // Always the last one
+};
+
+//++
+//============================================================================
+// Details: Common code utility. User defined target device proxy.
+// After the utility has discovered available devices on the system,
+// this structure holds/caches information about the device.
+// Forms a proxy device object representing an actual possible target
+// device found on the system.
+//--
+struct SDeviceFoundProxy final
+{
+ EDevsWeWant eDevice = eNotValid; // The type of real device we want to
+ // use to run kernels on.
+ string strDeviceName = ""; // The proxy label (ID) for a real
+ // device.
+ bool bAvailable = false; // True = can be used,
+ // False = not found on the system.
+ bool bActiveTarget = false; // True = use it, false = stop using.
+ int nScore = 0; // User defined score of the device.
+ sycl::device theDevice; // Copy of the real device found.
+};
+
+//++
+//============================================================================
+// Details: Common code utility. Rudimentary error reporting system. Used by
+// the utility class to explicitly inform the user or the programmer of any
+// issues that have occurred.
+//--
+struct FnResult final
+{
+ bool bSuccess = true;
+ string strErrMsg = "";
+};
+
+//++
+//============================================================================
+// Details: Common code utility. A basic utility class to wrap up functions
+// that can discover, then target the acceleration devices found on
+// a system.
+//
+// Docs: https://www.intel.com/content/www/us/en/developer/articles/
+// technical/device-discovery-with-sycl.html#gs.nhyd7s
+// https://registry.khronos.org/SYCL/specs/sycl-2020/html/
+// sycl-2020.html#sec:device-selection
+//
+// It can find all the available device targets on a system.
+// But it will only store a list of the first device found of each of the
+// following types: cpu, gpu and accelerator.
+// Each device in the list can be set to be an active target.
+// The last or latest call to function SetDevToActive() will change the
+// device proxy object returned by GetDevUsersFirstChoice() to be that device.
+//--
+class CUtilDeviceTargets final
+{
+ // Definitions:
+ public:
+ typedef std::vector< SDeviceFoundProxy > ListDevicesFound_t;
+
+ // Static methods:
+ public:
+ static FnResult DiscoverPlatformsDevicesAvailable( string &vrstrPlatformAndDevices );
+ static const string& GetInputOptionDiscoverDevice();
+ static FnResult GetQueuesCurrentDevice( const queue &vrQ, string &vrstr );
+
+ // Methods:
+ public:
+ CUtilDeviceTargets();
+ ~CUtilDeviceTargets();
+
+ FnResult DiscoverDevsWeWant();
+ const ListDevicesFound_t &GetListDevs() const;
+ const SDeviceFoundProxy *GetDevUsersFirstChoice() const;
+ FnResult SetDevToActive( const string &rvDeviceName, const bool vbActive );
+
+ // Attributes:
+ private:
+ ListDevicesFound_t m_listDeviceTargets;
+ static string m_strDiscoverDeviceInputOption;
+ SDeviceFoundProxy *m_pDeviceUserFirstChoice; // Element of m_listDeviceTargets, NULL = a choice has not been made
+};
+
+// Instantiations (inline: this header may be included by several source files):
+inline string CUtilDeviceTargets::m_strDiscoverDeviceInputOption = "discover_devices";
+
+//++
+// Details: CUtilDeviceTargets constructor. No user device choice exists yet.
+// Type:    Method (inline, the header may be included by several source files).
+// Args:    None.
+// Return:  None.
+// Throws:  None.
+//--
+inline CUtilDeviceTargets::CUtilDeviceTargets()
+: m_pDeviceUserFirstChoice( nullptr )
+{}
+
+//++
+// Details: CUtilDeviceTargets destructor.
+// Type:    Method (inline, the header may be included by several source files).
+// Args:    None.
+// Return:  None.
+// Throws:  None.
+//--
+inline CUtilDeviceTargets::~CUtilDeviceTargets()
+{
+    // Non-owning pointer to an element of the device list: reset, nothing to free
+    m_pDeviceUserFirstChoice = nullptr;
+}
+
+//++
+// Details: Return a report on the specified SYCL queue stating its current
+//          real target device and the device platform.
+//
+// Type:    Static method.
+// Args:    vrQ         - (R) The queue to query.
+//          vrstrReport - (W) The report text.
+// Return:  FnResult - Status of the function's operational success.
+// Throws:  SYCL implementation may throw.
+//--
+inline FnResult CUtilDeviceTargets::GetQueuesCurrentDevice( const queue &vrQ, string &vrstrReport )
+{
+    FnResult status;
+
+    vrstrReport = "[SYCL] Using device: [";
+    vrstrReport += vrQ.get_device().get_info< info::device::name >();
+    vrstrReport += "] from [";
+    vrstrReport += vrQ.get_device().get_platform().get_info< info::platform::name >();
+    vrstrReport += "]";
+
+    return status;
+}
+
+//++
+// Details: Returns the program's text label for the user input option
+//          to choose to discover all the available devices on a system.
+// Type:    Static method.
+// Args:    None.
+// Return:  string& - Text label.
+// Throws:  None.
+//--
+inline const string& CUtilDeviceTargets::GetInputOptionDiscoverDevice()
+{
+    return m_strDiscoverDeviceInputOption;
+}
+
+//++
+// Details: Returns the pointer to the proxy object, in the list of discovered
+//          device proxies, that was last chosen with SetDevToActive().
+// Type:    Method.
+// Args:    None.
+// Return:  SDeviceFoundProxy* - pointer to object, NULL = no choice made yet.
+// Throws:  None.
+//--
+inline const SDeviceFoundProxy * CUtilDeviceTargets::GetDevUsersFirstChoice() const
+{
+    return m_pDeviceUserFirstChoice;
+}
+
+//++
+// Details: A pointer to the current user's choice of target is made on this
+//          function being successful, whatever the value of vbActive.
+//
+//          If this function fails, the pointer retains the last choice or
+//          remains NULL (a choice was never made).
+//
+// Type:    Method.
+// Args:    rvDeviceName - (R) Proxy's label or ID text.
+//          vbActive     - (R) True = Use the device, False = disable use.
+// Return:  FnResult - Status of the function's operational success.
+// Throws:  None.
+//--
+inline FnResult CUtilDeviceTargets::SetDevToActive( const string &rvDeviceName, const bool vbActive )
+{
+    FnResult status;
+
+    bool bFoundDevice = false;
+    for( SDeviceFoundProxy &rDev : m_listDeviceTargets )
+    {
+        if( rDev.strDeviceName == rvDeviceName )
+        {
+            bFoundDevice = true;
+            rDev.bActiveTarget = vbActive;
+            m_pDeviceUserFirstChoice = &rDev; // Refers to the list's own element
+            break;
+        }
+    }
+    if( !bFoundDevice )
+    {
+        status.bSuccess = false;
+        status.strErrMsg = "Device '" + rvDeviceName;
+        status.strErrMsg += "' not found in list of available device targets";
+    }
+
+    return status;
+}
+
+//++
+// Details: Discovers all the SYCL target devices available and assigns them
+//          to a target device proxy object. A device proxy object holds
+//          the criteria for a real device. All proxies created are
+//          disabled until a real device is found to match it. A programmer
+//          still has to set the proxy device as active to target the
+//          device it represents.
+//          Call this function at the earliest opportunity and before other
+//          functions in this class as it makes the list of the target
+//          devices we are aiming to use. Calling it again rebuilds the list
+//          and forgets any earlier user choice of device.
+//          Limitation: only the first real device matching each proxy's
+//          criteria is assigned. Any subsequent similar devices are ignored.
+// Type:    Method.
+// Args:    None.
+// Return:  FnResult - Status of the function's operational success.
+// Throws:  SYCL implementation may throw.
+//--
+inline FnResult CUtilDeviceTargets::DiscoverDevsWeWant()
+{
+    FnResult status;
+    // Start afresh: the choice pointer refers to an element of the list
+    m_pDeviceUserFirstChoice = nullptr;
+    m_listDeviceTargets.clear();
+
+    SDeviceFoundProxy accelerator{ eAccelerator, "accelerator", false };
+    SDeviceFoundProxy cpu{ eCPU, "cpu", false };
+    SDeviceFoundProxy gpu{ eGPU, "gpu", false };
+
+    for( const auto &rPlatform : platform::get_platforms() )
+    {
+        for( const auto &rDevice : rPlatform.get_devices() )
+        {
+            // Get first available device of each type
+            if( !accelerator.bAvailable && rDevice.is_accelerator() )
+            {
+                accelerator.bAvailable = true;
+                accelerator.theDevice = rDevice;
+            }
+            else if( !cpu.bAvailable && rDevice.is_cpu() )
+            {
+                cpu.bAvailable = true;
+                cpu.theDevice = rDevice;
+            }
+            else if( !gpu.bAvailable && rDevice.is_gpu() )
+            {
+                gpu.bAvailable = true;
+                gpu.theDevice = rDevice;
+            }
+        }
+    }
+
+    m_listDeviceTargets.push_back( accelerator );
+    m_listDeviceTargets.push_back( cpu );
+    m_listDeviceTargets.push_back( gpu );
+
+    return status;
+}
+
+//++
+// Details: Returns the list of proxy device objects the programmer has
+//          defined and wants found on the system. Some proxy objects
+//          may be set to not available (and inactive) if no matching
+//          device has been found on the system.
+// Type:    Method.
+// Args:    None.
+// Return:  ListDevicesFound_t& - List of proxy device objects.
+// Throws:  None.
+//--
+inline const CUtilDeviceTargets::ListDevicesFound_t & CUtilDeviceTargets::GetListDevs() const
+{
+    return m_listDeviceTargets;
+}
+
+//++
+// Details: Builds a text report of all the SYCL platforms, and the devices of
+//          each platform, discovered on the system. Nothing is printed here.
+// Type:    Static method.
+// Args:    vrstrPlatformAndDevices - (W) A report of found devices.
+// Return:  FnResult - Status of the function's operational success.
+// Throws:  SYCL implementation may throw.
+//--
+inline FnResult CUtilDeviceTargets::DiscoverPlatformsDevicesAvailable( string &vrstrPlatformAndDevices )
+{
+    FnResult status;
+
+    vrstrPlatformAndDevices = "";
+    bool bFoundPlatforms = false;
+    bool bFoundDevices = false;
+    for( const auto &rPlatform : platform::get_platforms() )
+    {
+        bFoundPlatforms = true;
+        vrstrPlatformAndDevices += "Platform: ";
+        vrstrPlatformAndDevices += rPlatform.get_info< info::platform::name >();
+        vrstrPlatformAndDevices += "\n";
+
+        for( const auto &rDevice : rPlatform.get_devices() )
+        {
+            bFoundDevices = true;
+            vrstrPlatformAndDevices += "\tDevice: ";
+            vrstrPlatformAndDevices += rDevice.get_info< info::device::name >();
+            vrstrPlatformAndDevices += "\n";
+        }
+    }
+    if( !bFoundPlatforms && !bFoundDevices )
+    {
+        vrstrPlatformAndDevices = "No SYCL targeted platforms or devices found.";
+    }
+
+    return status;
+}
+
+//++
+// Details: Checks the user's input is valid. If not, a help message is formed
+//          and returned. If valid, the matching proxy device object
+//          discovered earlier is made active for use by the program.
+// Type:    Function (inline, the header may be included by several files).
+// Args:    vrDevList - (RW) Utility object managing proxy device objects.
+//          argc      - (R) Program's input arguments count.
+//          argv      - (R) Program's list of input arguments.
+// Return:  FnResult - Status of the function's operational success.
+// Throws:  None.
+//--
+inline FnResult UserCheckTheirInput( CUtilDeviceTargets &vrDevList, int argc, char* argv[] )
+{
+    FnResult status;
+
+    const CUtilDeviceTargets::ListDevicesFound_t &rDevs = vrDevList.GetListDevs();
+    string strListDevsOptionsToUser;
+    for( const SDeviceFoundProxy &rDev : rDevs )
+    {
+        strListDevsOptionsToUser += rDev.strDeviceName + "|";
+    }
+    strListDevsOptionsToUser += CUtilDeviceTargets::GetInputOptionDiscoverDevice();
+
+    if( argc < 2 )
+    {
+        status.bSuccess = false;
+        status.strErrMsg = "Usage: " + string( argv[ 0 ] ) + " <";
+        status.strErrMsg += strListDevsOptionsToUser;
+        status.strErrMsg += ">";
+        return status;
+    }
+
+    bool bTargetDevMatch = false;
+    const string strArg{ argv[ 1 ] };
+    for( const SDeviceFoundProxy &rDev : rDevs )
+    {
+        if( strArg == rDev.strDeviceName )
+        {
+            bTargetDevMatch = true;
+            status = vrDevList.SetDevToActive( strArg, true );
+            break;
+        }
+    }
+    if( status.bSuccess && !bTargetDevMatch &&
+        (strArg != CUtilDeviceTargets::GetInputOptionDiscoverDevice() ) )
+    {
+        status.bSuccess = false;
+        status.strErrMsg = "The device type cannot be found. Please enter a device type name from the list: ";
+        status.strErrMsg += strListDevsOptionsToUser;
+    }
+
+    return status;
+}
+
+//++
+// Details: Checks if the user's input is the 'discover devices' option and,
+//          if so, prints the report of platforms and devices to std out.
+// Type:    Function (inline, the header may be included by several files).
+// Args:    argv          - (R) Program's list of input arguments.
+//          rbDoDiscovery - (W) True = yes, the discovery option chosen.
+// Return:  FnResult - Status of the function's operational success.
+// Throws:  SYCL implementation may throw.
+//--
+inline FnResult UserWantsToDiscoverPossibleTargets( char* argv[], bool &rbDoDiscovery )
+{
+    FnResult status;
+
+    rbDoDiscovery = false;
+    // argv[ argc ] is a null pointer, so this detects a missing first argument
+    if( ( argv == nullptr ) || ( argv[ 0 ] == nullptr ) || ( argv[ 1 ] == nullptr ) ) return status;
+    const string strArg{ argv[ 1 ] };
+    if( strArg == CUtilDeviceTargets::GetInputOptionDiscoverDevice() )
+    {
+        string strPlatformAndDevicesReport;
+        status = CUtilDeviceTargets::DiscoverPlatformsDevicesAvailable( strPlatformAndDevicesReport );
+        if( status.bSuccess )
+        {
+            rbDoDiscovery = true;
+            cout << strPlatformAndDevicesReport << std::endl;
+        }
+    }
+    return status;
+}
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/c_cpp_properties.json b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/c_cpp_properties.json
new file mode 100644
index 0000000000..2b8e56ba4b
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/c_cpp_properties.json
@@ -0,0 +1,17 @@
+{
+ "configurations": [
+ {
+ "name": "Linux",
+ "includePath": [
+ "${workspaceFolder}/**"
+ ],
+ "defines": [],
+ "compilerPath": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx",
+ "compilerArgs": [ "-fsycl" ],
+ "cStandard": "gnu17",
+ "cppStandard": "gnu++17",
+ "intelliSenseMode": "linux-gcc-x64"
+ }
+ ],
+ "version": 4
+}
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/launch.json b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/launch.json
new file mode 100644
index 0000000000..325c8901a1
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/launch.json
@@ -0,0 +1,50 @@
+{
+ "configurations": [
+ {
+ "name": "C/C++: dpc++ build and debug scan",
+ "type": "cppdbg",
+ "request": "launch",
+ "program": "${workspaceFolder}/bin/${config:programName}_d",
+ "args": [
+ "${input:args}"
+ ],
+ "stopAtEntry": true,
+ "cwd": "${fileDirname}",
+ "environment": [],
+ "externalConsole": false,
+ "MIMode": "gdb",
+ "setupCommands": [
+ {
+ "description": "Enable pretty-printing for gdb",
+ "text": "-enable-pretty-printing",
+ "ignoreFailures": true
+ },
+ {
+ "description": "Set Disassembly Flavor to Intel",
+ "text": "-gdb-set disassembly-flavor intel",
+ "ignoreFailures": true
+ },
+ {
+ "description": "Needed by Intel oneAPI: Disable target async",
+ "text": "set target-async off",
+ "ignoreFailures": true
+ }
+ ],
+ "preLaunchTask": "scan Debug C/C++: Intel icpx build active file",
+ "miDebuggerPath": "/opt/intel/oneapi/debugger/latest/gdb/intel64/bin/gdb-oneapi"
+ }
+ ],
+ "inputs" : [
+ {
+ "id": "args",
+ "type": "pickString",
+ "description": "Program args",
+ "default": "cpu",
+ "options": [
+ "cpu",
+ "gpu",
+ "accelerator"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/settings.json b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/settings.json
new file mode 100644
index 0000000000..150d2ba955
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/settings.json
@@ -0,0 +1,14 @@
+{
+ "programName": "scan",
+ "files.associations": {
+ "stdexcept": "cpp",
+ "array": "cpp",
+ "bitset": "cpp",
+ "string_view": "cpp",
+ "initializer_list": "cpp",
+ "regex": "cpp",
+ "utility": "cpp",
+ "algorithm": "cpp",
+ "iostream": "cpp"
+ }
+}
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/tasks.json b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/tasks.json
new file mode 100644
index 0000000000..0706d2eebb
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/.vscode/tasks.json
@@ -0,0 +1,49 @@
+{
+ "version": "2.0.0",
+ "tasks": [
+ {
+ "type": "cppbuild",
+ "label": "scan Debug C/C++: Intel icpx build active file",
+ "command": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx",
+ "args": [
+ "-fsycl",
+ "-fdiagnostics-color=always",
+ "-fno-limit-debug-info",
+ "-fsycl-device-code-split=per_kernel",
+ "-g",
+ "-O0",
+ "${workspaceFolder}/src/${config:programName}.cpp",
+ "-o",
+ "${workspaceFolder}/bin/${config:programName}_d"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "problemMatcher": [
+ "$gcc"
+ ],
+ "group": "build",
+ "detail": "compiler: /opt/intel/oneapi/compiler/latest/linux/bin/icpx"
+ },
+ {
+ "type": "cppbuild",
+ "label": "scan Release C/C++: Intel icpx build active file",
+ "command": "/opt/intel/oneapi/compiler/latest/linux/bin/icpx",
+ "args": [
+ "-fsycl",
+ "-DNDEBUG",
+ "${workspaceFolder}/src/${config:programName}.cpp",
+ "-o",
+ "${workspaceFolder}/bin/${config:programName}"
+ ],
+ "options": {
+ "cwd": "${workspaceFolder}"
+ },
+ "problemMatcher": [
+ "$gcc"
+ ],
+ "group": "build",
+ "detail": "compiler: /opt/intel/oneapi/compiler/latest/linux/bin/icpx"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/CMakeLists.txt
new file mode 100644
index 0000000000..24aa89e2f2
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/CMakeLists.txt
@@ -0,0 +1,16 @@
+cmake_minimum_required (VERSION 3.4) # must precede every other command
+if(WIN32)
+    set(CMAKE_CXX_COMPILER "dpcpp-cl")
+    set(CMAKE_C_COMPILER "dpcpp-cl")
+else()
+    set(CMAKE_CXX_COMPILER "icpx")
+endif()
+set(CMAKE_CXX_STANDARD 17)
+if(NOT CMAKE_BUILD_TYPE) # default only when the user did not choose a build type
+    set(CMAKE_BUILD_TYPE "RELEASE")
+endif()
+if( CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
+endif()
+project (scan)
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/README.md b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/README.md
new file mode 100644
index 0000000000..3f1e3f3720
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/README.md
@@ -0,0 +1,86 @@
+# Parallel Prefix Sum (Scan) program
+
+## Purpose
+A simple and common parallel algorithm building block is the all-prefix-sums operation. Blelloch (1990) describes all-prefix-sums as a good example of a computation that seems inherently sequential, but for which there is an efficient parallel algorithm.
+
+## Prerequisites
+
+| Minimum Requirements | Description
+|:--- |:---
+| OS | Linux* Ubuntu* 20.04.5 LTS
+| Hardware | Intel® 11th Gen Intel Core i7-1185G7 + Mesa Intel Xe Graphics
+| Compiler Toolchain | Visual Studio Code IDE, Intel oneAPI Base Toolkit (including its prerequisites)
+| Libraries | Install Intel oneAPI Base Toolkit
+| Tools | Visual Studio Code 1.73.1, VSCode Microsoft C/C++ extns
+
+## Build and Run using Visual Studio Code
+
+### Linux*
+
+Within a terminal window change directory to this project's folder. At the
+terminal prompt type:
+
+```
+cd ParallelPrefixSumScan
+code .
+```
+
+Visual Studio Code will open this project displaying its files in the Explorer
+pane.
+The project is already set up with build configurations to build either a
+debug build or a release build of the program. When a program is built, it is
+placed in the bin directory of this project's top folder.
+
+To build the program hit Ctrl+Shift+b and choose the type of program to build.
+The debug executable will have a '_d' appended to its name.
+
+To execute the program, type in the Visual Studio Code terminal window:
+```
+cd bin
+./scan_d cpu
+```
+
+## Build and Run using CMake
+### Linux*
+```
+mkdir build
+cd build
+cmake ..
+make
+```
+
+To execute the program, type in the terminal window:
+
+```
+cd build/src
+./scan cpu
+```
+
+
+## Debug the program using Visual Studio Code
+
+### Linux*
+
+To debug the program, either choose from the IDE's run menu
+'Start debugging' or hit F5 on the keyboard.
+The debug launch.json configuration file defines the debug session to:
+* Provide a list of device types to choose from. The device type picked is
+  passed as the first argument to the program.
+* Halt the program at the first line of code in main().
+Use the GUI debug panel's buttons to step over code (key F10) lines to see the
+program advance.
+Breakpoints can be set either in the main code or the kernel code.
+
+Note: Setting breakpoints in the kernel code does not present the normal
+ step through code behavior. Instead a breakpoint event is occurring
+ on each thread being executed and so switches to the context of
+ that thread. To step through the code of a single thread, use the
+ Intel gdb-oneapi command 'set scheduler-locking step' or 'on' in the
+ IDE's debug console prompt. As this is not the main thread, be sure
+ to revert this setting on returning to debug any host side code.
+ Use the command 'set scheduler-locking replay' or 'off'.
+
+## License
+
+Code samples are licensed under the MIT license. See
+[LICENSE.txt](LICENSE.txt) for details.
diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/CMakeLists.txt
new file mode 100644
index 0000000000..7dbc6eff3d
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/CMakeLists.txt
@@ -0,0 +1,6 @@
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fsycl") # optimize and enable SYCL compilation
+set(CMAKE_BUILD_TYPE "RelWithDebInfo") # NOTE(review): replaces the build type set by the top-level CMakeLists.txt - confirm intended
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") # re-assigns the current value unchanged
+add_executable (scan scan.cpp)
+target_link_libraries(scan OpenCL sycl)
+
diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/device_selector.hpp b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/device_selector.hpp
new file mode 100644
index 0000000000..a515c547be
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/device_selector.hpp
@@ -0,0 +1,421 @@
+//==============================================================
+// Copyright (C) Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#pragma once
+
+// SYCL or oneAPI toolkit headers:
+#include
+
+// Third party headers:
+#include
+
+using namespace std;
+using namespace sycl;
+
+//++
+//============================================================================
+// Details: Common code utility. User defined enumeration of the SYCL device
+// target types desired.
+// An 'eHost' is omitted because SYCL 2020 deprecates host device
+// selection.
+//--
+enum EDevsWeWant
+{
+ eNotValid = 0, // Default
+ eCPU = 1,
+ eGPU = 2,
+ eAccelerator = 3, // i.e. a FPGA type device
+ eCount = 4 // Always the last one
+};
+
+//++
+//============================================================================
+// Details: Common code utility. User defined target device proxy.
+// After the utility has discovered available devices on the system,
+// this structure holds/caches information about the device.
+// Forms a proxy device object representing an actual possible target
+// device found on the system.
+//--
+struct SDeviceFoundProxy final
+{
+ EDevsWeWant eDevice = eNotValid; // The type of real device we want to
+ // use to run kernels on.
+ string strDeviceName = ""; // The proxy label (ID) for a real
+ // device.
+ bool bAvailable = false; // True = can be used,
+ // False = not found on the system.
+ bool bActiveTarget = false; // True = use it, false = stop using.
+ int nScore = 0; // User defined score of the device.
+ sycl::device theDevice; // Copy of the real device found.
+};
+
+//++
+//============================================================================
+// Details: Common code utility. Rudimentary error reporting system. Used by
+// the utility class to explicitly inform the user or the programmer of any
+// issues that have occurred.
+//--
+struct FnResult final
+{
+ bool bSuccess = true;
+ string strErrMsg = "";
+};
+
+//++
+//============================================================================
+// Details: Common code utility. A basic utility class to wrap up functions
+// that can discover, then target the acceleration devices found on
+// a system.
+//
+// Docs: https://www.intel.com/content/www/us/en/developer/articles/
+// technical/device-discovery-with-sycl.html#gs.nhyd7s
+// https://registry.khronos.org/SYCL/specs/sycl-2020/html/
+// sycl-2020.html#sec:device-selection
+//
+// It can find all the available device targets on a system.
+// But it will only store a list of the first device found of each of the
+// following types: cpu, gpu and accelerator.
+// Each device in the list can be set to be an active target.
+// The last or latest call to function SetDevToActive() will change the
+// device proxy object returned by GetDevUsersFirstChoice() to be that device.
+//--
+class CUtilDeviceTargets final
+{
+ // Definitions:
+ public:
+ typedef std::vector< SDeviceFoundProxy > ListDevicesFound_t;
+
+ // Static methods:
+ public:
+ static FnResult DiscoverPlatformsDevicesAvailable( string &vrstrPlatformAndDevices );
+ static const string& GetInputOptionDiscoverDevice();
+ static FnResult GetQueuesCurrentDevice( const queue &vrQ, string &vrstr );
+
+ // Methods:
+ public:
+ CUtilDeviceTargets();
+ ~CUtilDeviceTargets();
+
+ FnResult DiscoverDevsWeWant();
+ const ListDevicesFound_t &GetListDevs() const;
+ const SDeviceFoundProxy *GetDevUsersFirstChoice() const;
+ FnResult SetDevToActive( const string &rvDeviceName, const bool vbActive );
+
+ // Attributes:
+ private:
+ ListDevicesFound_t m_listDeviceTargets;
+ static string m_strDiscoverDeviceInputOption;
+ SDeviceFoundProxy *m_pDeviceUserFirstChoice; // Element of m_listDeviceTargets, NULL = a choice has not been made
+};
+
+// Instantiations (inline: this header may be included by several source files):
+inline string CUtilDeviceTargets::m_strDiscoverDeviceInputOption = "discover_devices";
+
+//++
+// Details: CUtilDeviceTargets constructor. No user device choice exists yet.
+// Type:    Method (inline, the header may be included by several source files).
+// Args:    None.
+// Return:  None.
+// Throws:  None.
+//--
+inline CUtilDeviceTargets::CUtilDeviceTargets()
+: m_pDeviceUserFirstChoice( nullptr )
+{}
+
+//++
+// Details: CUtilDeviceTargets destructor.
+// Type:    Method (inline, the header may be included by several source files).
+// Args:    None.
+// Return:  None.
+// Throws:  None.
+//--
+inline CUtilDeviceTargets::~CUtilDeviceTargets()
+{
+    // Non-owning pointer to an element of the device list: reset, nothing to free
+    m_pDeviceUserFirstChoice = nullptr;
+}
+
+//++
+// Details: Return a report on the specified SYCL queue stating its current
+//          real target device and the device platform.
+//
+// Type:    Static method.
+// Args:    vrQ         - (R) The queue to query.
+//          vrstrReport - (W) The report text.
+// Return:  FnResult - Status of the function's operational success.
+// Throws:  SYCL implementation may throw.
+//--
+inline FnResult CUtilDeviceTargets::GetQueuesCurrentDevice( const queue &vrQ, string &vrstrReport )
+{
+    FnResult status;
+
+    vrstrReport = "[SYCL] Using device: [";
+    vrstrReport += vrQ.get_device().get_info< info::device::name >();
+    vrstrReport += "] from [";
+    vrstrReport += vrQ.get_device().get_platform().get_info< info::platform::name >();
+    vrstrReport += "]";
+
+    return status;
+}
+
+//++
+// Details: Returns the program's text label for the user input option
+//          to choose to discover all the available devices on a system.
+// Type:    Static method.
+// Args:    None.
+// Return:  string& - Text label.
+// Throws:  None.
+//--
+inline const string& CUtilDeviceTargets::GetInputOptionDiscoverDevice()
+{
+    return m_strDiscoverDeviceInputOption;
+}
+
+//++
+// Details: Returns the pointer to the proxy object, in the list of discovered
+//          device proxies, that was last chosen with SetDevToActive().
+// Type:    Method.
+// Args:    None.
+// Return:  SDeviceFoundProxy* - pointer to object, NULL = no choice made yet.
+// Throws:  None.
+//--
+inline const SDeviceFoundProxy * CUtilDeviceTargets::GetDevUsersFirstChoice() const
+{
+    return m_pDeviceUserFirstChoice;
+}
+
+//++
+// Details: A pointer to the current user's choice of target is made on this
+//          function being successful, whatever the value of vbActive.
+//
+//          If this function fails, the pointer retains the last choice or
+//          remains NULL (a choice was never made).
+//
+// Type:    Method.
+// Args:    rvDeviceName - (R) Proxy's label or ID text.
+//          vbActive     - (R) True = Use the device, False = disable use.
+// Return:  FnResult - Status of the function's operational success.
+// Throws:  None.
+//--
+inline FnResult CUtilDeviceTargets::SetDevToActive( const string &rvDeviceName, const bool vbActive )
+{
+    FnResult status;
+
+    bool bFoundDevice = false;
+    for( SDeviceFoundProxy &rDev : m_listDeviceTargets )
+    {
+        if( rDev.strDeviceName == rvDeviceName )
+        {
+            bFoundDevice = true;
+            rDev.bActiveTarget = vbActive;
+            m_pDeviceUserFirstChoice = &rDev; // Refers to the list's own element
+            break;
+        }
+    }
+    if( !bFoundDevice )
+    {
+        status.bSuccess = false;
+        status.strErrMsg = "Device '" + rvDeviceName;
+        status.strErrMsg += "' not found in list of available device targets";
+    }
+
+    return status;
+}
+
+//++
+// Details: Discovers all the SYCL target devices available and assigns them
+//          to a target device proxy object. A device proxy object holds
+//          the criteria for a real device. All proxies created are
+//          disabled until a real device is found to match it. A programmer
+//          still has to set the proxy device as active to target the
+//          device it represents.
+//          Call this function at the earliest opportunity and before other
+//          functions in this class as it makes the list of the target
+//          devices we are aiming to use. Calling it again rebuilds the list
+//          and forgets any earlier user choice of device.
+//          Limitation: only the first real device matching each proxy's
+//          criteria is assigned. Any subsequent similar devices are ignored.
+// Type:    Method.
+// Args:    None.
+// Return:  FnResult - Status of the function's operational success.
+// Throws:  SYCL implementation may throw.
+//--
+inline FnResult CUtilDeviceTargets::DiscoverDevsWeWant()
+{
+    FnResult status;
+    // Start afresh: the choice pointer refers to an element of the list
+    m_pDeviceUserFirstChoice = nullptr;
+    m_listDeviceTargets.clear();
+
+    SDeviceFoundProxy accelerator{ eAccelerator, "accelerator", false };
+    SDeviceFoundProxy cpu{ eCPU, "cpu", false };
+    SDeviceFoundProxy gpu{ eGPU, "gpu", false };
+
+    for( const auto &rPlatform : platform::get_platforms() )
+    {
+        for( const auto &rDevice : rPlatform.get_devices() )
+        {
+            // Get first available device of each type
+            if( !accelerator.bAvailable && rDevice.is_accelerator() )
+            {
+                accelerator.bAvailable = true;
+                accelerator.theDevice = rDevice;
+            }
+            else if( !cpu.bAvailable && rDevice.is_cpu() )
+            {
+                cpu.bAvailable = true;
+                cpu.theDevice = rDevice;
+            }
+            else if( !gpu.bAvailable && rDevice.is_gpu() )
+            {
+                gpu.bAvailable = true;
+                gpu.theDevice = rDevice;
+            }
+        }
+    }
+
+    m_listDeviceTargets.push_back( accelerator );
+    m_listDeviceTargets.push_back( cpu );
+    m_listDeviceTargets.push_back( gpu );
+
+    return status;
+}
+
+//++
+// Details: Returns the list of proxy device objects the programmer has
+//          defined and wants found on the system. Some proxy objects
+//          may be set to not available (and inactive) if no matching
+//          device has been found on the system.
+// Type:    Method.
+// Args:    None.
+// Return:  ListDevicesFound_t& - List of proxy device objects.
+// Throws:  None.
+//--
+inline const CUtilDeviceTargets::ListDevicesFound_t & CUtilDeviceTargets::GetListDevs() const
+{
+    return m_listDeviceTargets;
+}
+
+//++
+// Details: Builds a text report of all the SYCL platforms, and the devices of
+//          each platform, discovered on the system. Nothing is printed here.
+// Type:    Static method.
+// Args:    vrstrPlatformAndDevices - (W) A report of found devices.
+// Return:  FnResult - Status of the function's operational success.
+// Throws:  SYCL implementation may throw.
+//--
+inline FnResult CUtilDeviceTargets::DiscoverPlatformsDevicesAvailable( string &vrstrPlatformAndDevices )
+{
+    FnResult status;
+
+    vrstrPlatformAndDevices = "";
+    bool bFoundPlatforms = false;
+    bool bFoundDevices = false;
+    for( const auto &rPlatform : platform::get_platforms() )
+    {
+        bFoundPlatforms = true;
+        vrstrPlatformAndDevices += "Platform: ";
+        vrstrPlatformAndDevices += rPlatform.get_info< info::platform::name >();
+        vrstrPlatformAndDevices += "\n";
+
+        for( const auto &rDevice : rPlatform.get_devices() )
+        {
+            bFoundDevices = true;
+            vrstrPlatformAndDevices += "\tDevice: ";
+            vrstrPlatformAndDevices += rDevice.get_info< info::device::name >();
+            vrstrPlatformAndDevices += "\n";
+        }
+    }
+    if( !bFoundPlatforms && !bFoundDevices )
+    {
+        vrstrPlatformAndDevices = "No SYCL targeted platforms or devices found.";
+    }
+
+    return status;
+}
+
+//++
+// Details: Checks the user's input is valid. If not, a help message is formed
+//          and returned. If valid, the matching proxy device object
+//          discovered earlier is made active for use by the program.
+// Type:    Function (inline, the header may be included by several files).
+// Args:    vrDevList - (RW) Utility object managing proxy device objects.
+//          argc      - (R) Program's input arguments count.
+//          argv      - (R) Program's list of input arguments.
+// Return:  FnResult - Status of the function's operational success.
+// Throws:  None.
+//--
+inline FnResult UserCheckTheirInput( CUtilDeviceTargets &vrDevList, int argc, char* argv[] )
+{
+    FnResult status;
+
+    const CUtilDeviceTargets::ListDevicesFound_t &rDevs = vrDevList.GetListDevs();
+    string strListDevsOptionsToUser;
+    for( const SDeviceFoundProxy &rDev : rDevs )
+    {
+        strListDevsOptionsToUser += rDev.strDeviceName + "|";
+    }
+    strListDevsOptionsToUser += CUtilDeviceTargets::GetInputOptionDiscoverDevice();
+
+    if( argc < 2 )
+    {
+        status.bSuccess = false;
+        status.strErrMsg = "Usage: " + string( argv[ 0 ] ) + " <";
+        status.strErrMsg += strListDevsOptionsToUser;
+        status.strErrMsg += ">";
+        return status;
+    }
+
+    bool bTargetDevMatch = false;
+    const string strArg{ argv[ 1 ] };
+    for( const SDeviceFoundProxy &rDev : rDevs )
+    {
+        if( strArg == rDev.strDeviceName )
+        {
+            bTargetDevMatch = true;
+            status = vrDevList.SetDevToActive( strArg, true );
+            break;
+        }
+    }
+    if( status.bSuccess && !bTargetDevMatch &&
+        (strArg != CUtilDeviceTargets::GetInputOptionDiscoverDevice() ) )
+    {
+        status.bSuccess = false;
+        status.strErrMsg = "The device type cannot be found. Please enter a device type name from the list: ";
+        status.strErrMsg += strListDevsOptionsToUser;
+    }
+
+    return status;
+}
+
+//++
+// Details: Checks if the user's input is the 'discover devices' option and,
+//          if so, prints the report of platforms and devices to std out.
+// Type:    Function (inline, the header may be included by several files).
+// Args:    argv          - (R) Program's list of input arguments.
+//          rbDoDiscovery - (W) True = yes, the discovery option chosen.
+// Return:  FnResult - Status of the function's operational success.
+// Throws:  SYCL implementation may throw.
+//--
+inline FnResult UserWantsToDiscoverPossibleTargets( char* argv[], bool &rbDoDiscovery )
+{
+    FnResult status;
+
+    rbDoDiscovery = false;
+    // argv[ argc ] is a null pointer, so this detects a missing first argument
+    if( ( argv == nullptr ) || ( argv[ 0 ] == nullptr ) || ( argv[ 1 ] == nullptr ) ) return status;
+    const string strArg{ argv[ 1 ] };
+    if( strArg == CUtilDeviceTargets::GetInputOptionDiscoverDevice() )
+    {
+        string strPlatformAndDevicesReport;
+        status = CUtilDeviceTargets::DiscoverPlatformsDevicesAvailable( strPlatformAndDevicesReport );
+        if( status.bSuccess )
+        {
+            rbDoDiscovery = true;
+            cout << strPlatformAndDevicesReport << std::endl;
+        }
+    }
+    return status;
+}
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/scan.cpp b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/scan.cpp
new file mode 100644
index 0000000000..d179444ee4
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/ParallelPrefixSumScan/src/scan.cpp
@@ -0,0 +1,520 @@
+//============================================================================
+// Copyright © 2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// ===========================================================================
+
+//****************************************************************************
+//
+// Description:
+// Example of a parallel inclusive scan in SYCL. Based on the two-phase
+// exclusive scan algorithm paper by Guy E. Blelloch titled "Prefix Sums and
+// Their Applications", 1990.
+//
+// Usage:
+// The program takes one argument: host / cpu / gpu / accelerator.
+//
+//*****************************************************************************
+
+// SYCL / Intel oneAPI files:
+#include
+#include "dpc_common.hpp"
+
+// Third party files:
+#include
+#include
+#include
+#include
+
+// This project's files:
+#include "device_selector.hpp"
+
+using namespace sycl;
+using namespace std;
+
+// Handler for asynchronous errors: faults raised by asynchronously executed
+// code inside a command group or a kernel. Such faults can surface in a
+// different stack frame, so they cannot be propagated up the caller's stack and
+// are considered 'lost' by default. Supplying this function to the queue is the
+// way to retrieve and report them.
+auto exception_handler = []( sycl::exception_list exceptions )
+{
+    for( const std::exception_ptr &pPending : exceptions )
+    {
+        try
+        {
+            std::rethrow_exception( pPending );
+        }
+        catch( const sycl::exception &syclErr )
+        {
+            std::cout << "Queue handler caught asynchronous SYCL exception:\n" << syclErr.what() << std::endl;
+        }
+    }
+};
+
+// Forward declarations of functions
+template< typename T, typename OP >
+void ParallelScan( sycl::buffer< T, 1 > &bufIn, sycl::queue &q );
+int TestSum( sycl::queue &q );
+int TestFactorial( sycl::queue &q );
+
+// Program entry point: selects the target device named by the user's first
+// argument, runs the sum and factorial scan tests on it and reports the result.
+// Returns 0 on success, 1 on bad input / discovery report / wrong results, and
+// -1 on an internal failure or a caught exception.
+int main( int argc, char *argv[] )
+{
+    CUtilDeviceTargets utilsDev;
+    FnResult fnResult = utilsDev.DiscoverDevsWeWant();
+    if( !fnResult.bSuccess )
+    {
+        cerr << "Program failure: Unable to discover target devices on this platform.\n";
+        return -1;
+    }
+
+    fnResult = UserCheckTheirInput( utilsDev, argc, argv );
+    if( !fnResult.bSuccess )
+    {
+        cerr << fnResult.strErrMsg << "\n";
+        return 1;
+    }
+
+    bool bDoDevDiscovery = false;
+    fnResult = UserWantsToDiscoverPossibleTargets( argv, bDoDevDiscovery );
+    if( !fnResult.bSuccess )
+    {
+        cerr << fnResult.strErrMsg << "\n";
+        return -1;
+    }
+    // NOTE(review): a completed discovery report still ends with status 1 - confirm intended.
+    if( bDoDevDiscovery ) return 1;
+
+    const SDeviceFoundProxy *pUsersChosenDevice = utilsDev.GetDevUsersFirstChoice();
+    if( pUsersChosenDevice == nullptr )
+    {
+        cerr << "Program failure: Did not create a valid target device object.\n";
+        return -1;
+    }
+
+    bool bResultsCorrect = false;
+    try
+    {
+        queue myQueue( pUsersChosenDevice->theDevice, exception_handler );
+
+        string strTheDeviceBeingUsed;
+        fnResult = CUtilDeviceTargets::GetQueuesCurrentDevice( myQueue, strTheDeviceBeingUsed );
+        if( !fnResult.bSuccess )
+        {
+            cerr << fnResult.strErrMsg << "\n";
+            return -1;
+        }
+        cout << strTheDeviceBeingUsed << "\n";
+
+        // The factorial test only runs when the sum test has passed.
+        bResultsCorrect = (TestSum( myQueue ) == 0) && (TestFactorial( myQueue ) == 0);
+    }
+    catch( sycl::exception const &e )
+    {
+        cout << "Fail; SYCL synchronous exception occurred: " << e.what() << "\n";
+        return -1;
+    }
+    catch( std::exception const &e )
+    {
+        cout << "Fail; Runtime synchronous exception occurred: " << e.what() << "\n";
+        return -1;
+    }
+
+    if( !bResultsCorrect ) return 1;
+
+    cout << "Results are correct." << std::endl;
+
+    return 0;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+// Maps an operation type onto its identity element for values of a given type.
+template< typename TElem, typename TOp >
+struct SIdentity {};  // No `value` member here: only the operations below are supported.
+
+template< typename TElem >
+struct SIdentity< TElem, std::plus< TElem > >
+{
+    static constexpr TElem value{ 0 };  // Adding 0 changes nothing.
+};
+
+template< typename TElem >
+struct SIdentity< TElem, std::multiplies< TElem > >
+{
+    static constexpr TElem value{ 1 };  // Multiplying by 1 changes nothing.
+};
+
+template< typename TElem >
+struct SIdentity< TElem, std::logical_or< TElem > >
+{
+    static constexpr TElem value{ false };  // OR-ing with false changes nothing.
+};
+
+template< typename TElem >
+struct SIdentity< TElem, std::logical_and< TElem > >
+{
+    static constexpr TElem value{ true };  // AND-ing with true changes nothing.
+};
+
+// Empty tag type: every distinct combination of arguments is a distinct kernel name.
+template< typename TData, typename TOp, typename TTag >
+struct SKernelNameType {};
+
+
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+// Performs an inclusive scan with the given associative binary operation `OP`
+// on the data in the `bufIn` buffer. Runs in parallel on the provided
+// accelerated hardware queue. Modifies the input buffer to contain the
+// results of the scan.
+// Input size has to be a power of two. If the size isn't so, the input can
+// easily be padded to the nearest power of two with any values, and the scan
+// on the meaningful part of the data will stay the same.
+// Throws std::runtime_error when the size is not a power of two, or when the
+// device's local memory, global memory or work-group limits cannot be met.
+template< typename T, typename OP >
+void ParallelScan( sycl::buffer< T, 1 > &bufIn, sycl::queue &q )
+{
+    // Retrieve the device associated with the given queue.
+    const sycl::device dev = q.get_device();
+    const bool bHwIsCpu = dev.is_cpu();
+
+    // Check if local memory is available. A CPU device is exempt from the check.
+    if( !bHwIsCpu && (dev.get_info< sycl::info::device::local_mem_type >() ==
+                      sycl::info::local_mem_type::none) )
+    {
+        throw std::runtime_error( "Non host device does not have local memory." );
+    }
+
+    const size_t bufSize = bufIn.size();
+    if( ((bufSize & (bufSize - 1)) != 0) || (bufSize == 0) )
+    {
+        throw std::runtime_error( "Given input buffer size is not a power of two." );
+    }
+
+    // A single element is already its own inclusive scan, and the kernels below
+    // need at least one pair of elements to work on.
+    if( bufSize == 1 )
+    {
+        return;
+    }
+
+    // Check if there is enough global memory: the input's bytes must fit in half of it.
+    const size_t globalMemSize = dev.get_info< sycl::info::device::global_mem_size >();
+    if( !bHwIsCpu && (bufSize > ((globalMemSize / 2) / sizeof( T ))) )
+    {
+        throw std::runtime_error( "Non host device input size exceeds device global memory size." );
+    }
+
+    // Obtain device limits.
+    const size_t maxWgroupSize = dev.get_info< sycl::info::device::max_work_group_size >();
+    const size_t localMemSize = dev.get_info< sycl::info::device::local_mem_size >();
+
+    // Find a work-group size that is guaranteed to fit in local memory and is
+    // below the maximum work-group size of the device.
+    const size_t wgroupSizeLim = sycl::min( maxWgroupSize, localMemSize / (2 * sizeof( T )) );
+
+    // Every work-item processes two elements, so the work-group size has to
+    // divide this number evenly.
+    const size_t halfInBufSize = bufSize / 2;
+
+    // Find the largest power of two that divides halfInBufSize and is within the
+    // device limit.
+    size_t wgroupSize = 0;
+    size_t pow = size_t( 1 ) << (sizeof( size_t ) * 8 - 1);
+    for( ; pow > 0; pow >>= 1 )
+    {
+        if( ((halfInBufSize % pow) == 0) && (pow <= wgroupSizeLim) )
+        {
+            wgroupSize = pow;
+            break;
+        }
+    }
+    if( wgroupSize == 0 )
+    {
+        throw std::runtime_error( "Could not find an appropriate work-group size for the given input." );
+    }
+    const size_t dblWgrpSize = wgroupSize * 2;
+
+    q.submit( [&]( sycl::handler &cgh )
+    {
+        const auto ptrData =
+            bufIn.template get_access< sycl::access::mode::read_write >( cgh );
+
+        // Using scratch/local memory (to a work group) for faster memory
+        // access to compute the results
+        sycl::accessor< T, 1, sycl::access::mode::read_write,
+                        sycl::access::target::local >
+            scratch( dblWgrpSize, cgh );
+
+        // Use dummy struct as the unique kernel name.
+        cgh.parallel_for< SKernelNameType< T, OP, class CScanSegments > >(
+            sycl::nd_range< 1 >( halfInBufSize, wgroupSize ),
+            [=]( sycl::nd_item< 1 > item )
+            {
+                const size_t gid = item.get_global_linear_id();
+                const size_t lid = item.get_local_linear_id();
+
+                // Read data into local memory.
+                scratch[ 2 * lid ] = ptrData[ 2 * gid ];
+                scratch[ 2 * lid + 1 ] = ptrData[ 2 * gid + 1 ];
+
+                // Preserve the second input element to add at the end.
+                const auto secondInput = scratch[ 2 * lid + 1 ];
+
+                // Perform partial reduction (up-sweep) on the data. The `off`
+                // variable is 2 to the power of the current depth of the
+                // reduction tree. In the paper, this corresponds to 2^d.
+                for( size_t off = 1; off < dblWgrpSize; off *= 2 )
+                {
+                    // Synchronize local memory to observe the previous writes.
+                    item.barrier( sycl::access::fence_space::local_space );
+
+                    const size_t i = lid * off * 2;
+                    if( i < dblWgrpSize )
+                    {
+                        const size_t index = i + (off * 2) - 1;
+                        scratch[ index ] =
+                            OP{}( scratch[ index ], scratch[ i + off - 1 ] );
+                    }
+                }
+
+                // Clear the last element to the identity before down-sweeping.
+                if( lid == 0 )
+                {
+                    scratch[ dblWgrpSize - 1 ] = SIdentity< T, OP >::value;
+                }
+
+                // Perform down-sweep on the tree to compute the whole scan.
+                // Again, `off` is 2^d.
+                for( size_t off = wgroupSize; off > 0; off >>= 1 )
+                {
+                    item.barrier( sycl::access::fence_space::local_space );
+
+                    const size_t i = lid * off * 2;
+                    if( i < dblWgrpSize )
+                    {
+                        const size_t indexT = i + off - 1;
+                        const size_t indexU = i + (off * 2) - 1;
+                        const auto t = scratch[ indexT ];
+                        const auto u = scratch[ indexU ];
+                        scratch[ indexT ] = u;
+                        scratch[ indexU ] = OP{}( t, u );
+                    }
+                }
+
+                // Synchronize again to observe results.
+                item.barrier( sycl::access::fence_space::local_space );
+
+                // To return an inclusive rather than exclusive scan result, shift
+                // each element left by 1 when writing back into global memory. If
+                // we are the last work-item, also add on the final element.
+                const size_t indexL1 = 2 * lid + 1;
+                const size_t indexL2 = 2 * lid + 2;
+                const size_t indexG1 = 2 * gid;
+                const size_t indexG2 = 2 * gid + 1;
+                ptrData[ indexG1 ] = scratch[ indexL1 ];
+                if( lid == wgroupSize - 1 )
+                {
+                    ptrData[ indexG2 ] = OP{}( scratch[ indexL1 ], secondInput );
+                }
+                else
+                {
+                    ptrData[ indexG2 ] = scratch[ indexL2 ];
+                }
+            } // [=]( sycl::nd_item< 1 > item )
+        ); // cgh.parallel_for< SKernelNameType< T, OP, class CScanSegments > >(
+    }); // q.submit( [&]( sycl::handler &cgh )
+
+    // At this point we have computed the inclusive scans of this many segments.
+    const size_t nSegments = halfInBufSize / wgroupSize;
+
+    if( nSegments == 1 )
+    {
+        // If all of the data is in one segment, we're done.
+        return;
+    }
+    // Otherwise we have to propagate the scan results forward into later
+    // segments.
+
+    // Allocate space for one (last) element per segment.
+    sycl::buffer< T, 1 > bufEndSegment{ sycl::range< 1 >( nSegments ) };
+
+    // Store the elements in this space.
+    q.submit( [&]( sycl::handler &cgh )
+    {
+        const auto ptrScans = bufIn.template get_access<
+            sycl::access::mode::read >( cgh );
+        const auto ptrElems = bufEndSegment.template get_access<
+            sycl::access::mode::discard_write >( cgh );
+
+        cgh.parallel_for< SKernelNameType< T, OP, class CCopyEndSeg > >(
+            sycl::range< 1 >( nSegments ),
+            [=]( sycl::item< 1 > item )
+            {
+                const size_t id = item.get_linear_id();
+                // Offset into the last element of each segment.
+                ptrElems[ item ] = ptrScans[ (id + 1) * 2 * wgroupSize - 1 ];
+            });
+    });
+
+    // Recursively scan the array of last elements.
+    ParallelScan< T, OP >( bufEndSegment, q );
+
+    // Add the results of the scan to each segment.
+    q.submit( [&]( sycl::handler &cgh )
+    {
+        const auto ptrEndSegScan = bufEndSegment.template get_access<
+            sycl::access::mode::read >( cgh );
+        const auto ptrDataIn = bufIn.template get_access<
+            sycl::access::mode::read_write >( cgh );
+
+        cgh.parallel_for< SKernelNameType< T, OP, class CAddEndSeg > >(
+            // Work with one less work-group, since the first segment is correct.
+            sycl::nd_range< 1 >( halfInBufSize - wgroupSize, wgroupSize ),
+            [=]( sycl::nd_item< 1 > item )
+            {
+                const size_t grpLinId = item.get_group_linear_id();
+
+                // Start with the second segment.
+                const size_t glbIdOff = item.get_global_linear_id() + wgroupSize;
+
+                // Each work-group adds the corresponding number in the
+                // "last element scan" array to every element in the group's
+                // segment.
+                ptrDataIn[ glbIdOff * 2 ] = OP{}( ptrDataIn[ glbIdOff * 2 ],
+                                                  ptrEndSegScan[ grpLinId ] );
+                ptrDataIn[ glbIdOff * 2 + 1 ] = OP{}( ptrDataIn[ glbIdOff * 2 + 1 ],
+                                                      ptrEndSegScan[ grpLinId ] );
+            });
+    });
+}
+
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+// Exercises the scan with addition, its most common use: the prefix sums of
+// the values 1..64 computed by ParallelScan are compared to std::partial_sum.
+// Returns 0 when the two agree, 1 otherwise.
+int TestSum( sycl::queue &q )
+{
+    using TElem = int32_t;
+    constexpr size_t nElems = 64;
+
+    // Writes every value of a sequence followed by a single space.
+    const auto printValues = []( const std::vector< TElem > &values )
+    {
+        for( const TElem v : values )
+        {
+            cout << v << " ";
+        }
+    };
+
+    // Host input: 1, 2, ..., nElems.
+    std::vector< TElem > hostIn( nElems );
+    std::iota( hostIn.begin(), hostIn.end(), 1 );
+
+    // Scan on the device; the buffer hands its contents to `syclScan` through
+    // set_final_data() rather than back to `hostIn`.
+    std::vector< TElem > syclScan( hostIn.size() );
+    {
+        buffer< TElem, 1 > bufScan( sycl::range< 1 >( hostIn.size() ) );
+        bufScan.set_final_data( syclScan.data() );
+
+        q.submit( [&]( sycl::handler &cgh )
+        {
+            const auto accDst = bufScan.get_access< sycl::access::mode::write >( cgh );
+            cgh.copy( hostIn.data(), accDst );
+        });
+
+        ParallelScan< TElem, std::plus< TElem > >( bufScan, q );
+    }
+
+    // Reference result from the standard library.
+    std::vector< TElem > refScan( hostIn.size() );
+    std::partial_sum( hostIn.begin(), hostIn.end(), refScan.begin() );
+
+    cout << "\nSYCL compute's sum results:\n";
+    printValues( syclScan );
+    cout << std::endl;
+
+    // Both vectors have the same length, so operator== compares element-wise.
+    if( syclScan == refScan )
+    {
+        return 0;
+    }
+
+    cout << "SYCL sum computation incorrect!\n";
+    cout << "std::partial_sum's results:\n";
+    printValues( refScan );
+
+    return 1;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+// Exercises the scan with multiplication, which turns 1..16 into a sequence of
+// factorials; ParallelScan's output is compared to std::partial_sum.
+// Returns 0 when the two agree, 1 otherwise.
+int TestFactorial( sycl::queue &q )
+{
+    using TElem = int64_t;
+    // 16 is the largest power of two (required by ParallelScan) whose factorial fits int64_t.
+    constexpr size_t nElems = 16;
+
+    // Writes every value of a sequence followed by a single space.
+    const auto printValues = []( const std::vector< TElem > &values )
+    {
+        for( const TElem v : values )
+        {
+            cout << v << " ";
+        }
+    };
+
+    // Host input: 1, 2, ..., nElems.
+    std::vector< TElem > hostIn( nElems );
+    std::iota( hostIn.begin(), hostIn.end(), 1 );
+
+    // Scan on the device; set_final_data() directs the buffer's contents into `syclFact`.
+    std::vector< TElem > syclFact( hostIn.size() );
+    {
+        sycl::buffer< TElem, 1 > bufScan( sycl::range< 1 >( hostIn.size() ) );
+        bufScan.set_final_data( syclFact.data() );
+        q.submit( [&]( sycl::handler &cgh )
+        {
+            const auto accDst = bufScan.get_access< sycl::access::mode::write >( cgh );
+            cgh.copy( hostIn.data(), accDst );
+        });
+
+        ParallelScan< TElem, std::multiplies< TElem > >( bufScan, q );
+    }
+
+    // Reference result from the standard library.
+    std::vector< TElem > refFact( hostIn.size() );
+    std::partial_sum( hostIn.begin(), hostIn.end(), refFact.begin(), std::multiplies< TElem >{} );
+
+    cout << "\nSYCL compute's factorial results:\n";
+    printValues( syclFact );
+    cout << std::endl;
+
+    // Both vectors have the same length, so operator== compares element-wise.
+    if( syclFact == refFact )
+    {
+        return 0;
+    }
+
+    cout << "SYCL factorial computation incorrect!\n";
+    cout << "std::partial_sum's results:\n";
+    printValues( refFact );
+
+    return 1;
+}
\ No newline at end of file