diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml new file mode 100644 index 0000000000..3e8e0b4dd0 --- /dev/null +++ b/.github/workflows/build-nabla.yml @@ -0,0 +1,269 @@ +name: Build Nabla Workflow + +on: + push: + pull_request: + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: push-lock-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-windows: + runs-on: windows-2022 + + env: + image: ghcr.io/devsh-graphics-programming/docker-nanoserver-msvc-winsdk + entry: pwsh.exe + cmd: -NoLogo -NoProfile -ExecutionPolicy Bypass + mount: C:\mount\nabla + binary: C:\mount\nabla\build-ct + install: build-ct\install + + strategy: + fail-fast: false + matrix: + # vendor: [msvc, clangcl] + # TODO: Yas please fix ClangCL, we have a few new compile errors + # if we build MSVC then build "run-compiler-explorer" target, for ClangCL build just "nsc" + vendor: [msvc] + config: [Release, Debug, RelWithDebInfo] + tag: ['17.13.6'] + + steps: + - name: Environment Setup + run: | + Add-MpPreference -ExclusionPath "${{ github.workspace }}" + Add-MpPreference -ExclusionExtension "*.*" + Add-MpPreference -ExclusionProcess "docker.exe" + Add-MpPreference -ExclusionProcess "dockerd.exe" + Set-MpPreference -RemediationScheduleDay 8 + Set-MpPreference -DisableRealtimeMonitoring $true + Set-MpPreference -DisableRemovableDriveScanning $true + Set-MpPreference -DisableArchiveScanning $true + Set-MpPreference -DisableScanningMappedNetworkDrivesForFullScan $true + + if (-not (docker network ls --format '{{.Name}}' | Where-Object { $_ -eq 'docker_default' })) { + docker network create --driver nat docker_default + if ($LASTEXITCODE -ne 0) { exit 1 } + } + + - name: Set prefix + id: set-prefix + shell: pwsh + run: | + $prefix = "run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" + $owner = "${{ github.repository_owner }}" + $package = "nabla-shader-compiler-godbolt" + $tag = "build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}" + $nscTargetTaggedImage = "ghcr.io/${owner}/${package}:${tag}".ToLower() + $nscTargetTaggedImageLatest = "ghcr.io/${owner}/${package}:latest".ToLower() + + $shouldPushImage = ( + "${{ github.ref }}" -eq "refs/heads/master" -and + "${{ matrix.vendor }}" -eq "msvc" -and + "${{ matrix.config }}" -eq "Release" + ) + + Write-Host "::notice::Should push image? 
$shouldPushImage" + + "prefix=$prefix" >> $env:GITHUB_OUTPUT + "nscTargetTaggedImage=$nscTargetTaggedImage" >> $env:GITHUB_OUTPUT + "nscTargetTaggedImageLatest=$nscTargetTaggedImageLatest" >> $env:GITHUB_OUTPUT + "shouldPushImage=$shouldPushImage" >> $env:GITHUB_OUTPUT + + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: 'recursive' + + - name: Pull Image + run: | + docker pull "${{ env.image }}:${{ matrix.tag }}" + + - name: Run Container + run: | + $ctx = docker context show + $dockerHost = (docker context inspect $ctx | ConvertFrom-Json).Endpoints.docker.Host + $pipeName = [regex]::Match($dockerHost, '/pipe/(?.+)$').Groups['n'].Value + $pipeHost = "\\.\pipe\$pipeName" + + docker run ` + --entrypoint ${{ env.entry }} -di --isolation process ` + --env-file .\docker\ci-windows.env ` + --env-file .\docker\ninja.env ` + --env "NSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" ` + --name orphan --network docker_default ` + -v "${{ github.workspace }}:${{ env.mount }}" ` + -v "${pipeHost}:\\.\pipe\dockerd" -e "DOCKER_HOST=npipe:////./pipe/dockerd" ` + -w "${{ env.mount }}" ` + "${{ env.image }}:${{ matrix.tag }}" ` + ${{ env.cmd }} + + - name: Inspect Container + run: | + docker inspect orphan + + - name: Container – Unpack Packages + run: | + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} C:\unpack.ps1 + + - name: Container – Configure Project with CMake + run: | + mkdir profiling + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} -Command cmake ` + --preset ci-configure-dynamic-${{ matrix.vendor }} ` + --profiling-output=profiling/cmake-profiling.json ` + --profiling-format=google-trace + + - name: Container – Build NSC + run: | + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} -Command cmake --build ` + --preset ci-build-dynamic-${{ matrix.vendor }} ` + -t run-compiler-explorer --config ${{ matrix.config }} + + - name: Container – Install NSC + run: | + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} -Command cmake --install ` + ${{ env.binary }} --config ${{ matrix.config }} ` + --component Runtimes --prefix ${{ env.install }} + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} -Command cmake --install ` + ${{ env.binary }} --config ${{ matrix.config }} ` + --component Executables --prefix ${{ env.install }} + + - name: Container – Save NSC Image + run: | + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} -Command docker ` + save ${{ steps.set-prefix.outputs.nscTargetTaggedImage }} | zstd -T0 -3 -f -o ${{ steps.set-prefix.outputs.prefix }}-nsc-godbolt-image.tar.zst + + - name: Package left workflow artifacts + run: | + tar -cvf "${{ steps.set-prefix.outputs.prefix }}-profiling.tar" profiling + tar -cvf "${{ steps.set-prefix.outputs.prefix }}-install.tar" ${{ env.install }} + + - name: Upload NSC Godbolt Image artifact + uses: actions/upload-artifact@v4 + with: + name: ${{ steps.set-prefix.outputs.prefix }}-nsc-godbolt-image + path: ${{ steps.set-prefix.outputs.prefix }}-nsc-godbolt-image.tar.zst + compression-level: 0 + + - name: Upload profiling artifacts + uses: actions/upload-artifact@v4 + with: + name: ${{ steps.set-prefix.outputs.prefix }}-profiling + path: ${{ steps.set-prefix.outputs.prefix }}-profiling.tar + + - name: Upload install artifacts + uses: actions/upload-artifact@v4 + with: + name: ${{ steps.set-prefix.outputs.prefix }}-install + path: ${{ steps.set-prefix.outputs.prefix }}-install.tar + + - name: Login to GHCR + if: steps.set-prefix.outputs.shouldPushImage == 'True' + run: echo "${{ 
secrets.CR_PAT }}" | docker login ghcr.io -u $env:GITHUB_ACTOR --password-stdin + + - name: Tag Latest image + if: steps.set-prefix.outputs.shouldPushImage == 'True' + run: | + docker tag ${{ steps.set-prefix.outputs.nscTargetTaggedImage }} ${{ steps.set-prefix.outputs.nscTargetTaggedImageLatest }} + + - name: Push images to GHCR + if: steps.set-prefix.outputs.shouldPushImage == 'True' + run: | + docker push ${{ steps.set-prefix.outputs.nscTargetTaggedImageLatest }} + + update-badges: + name: Update Build & Image Badges + if: ${{ always() && github.ref == 'refs/heads/master' }} + needs: build-windows + runs-on: windows-2022 + permissions: + contents: write + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Create Build Badge + run: | + $jobStatus = "${{ needs.build-windows.result }}" + $buildMsg = if ($jobStatus -eq "success") { "passing" } else { "failing" } + $buildColor = if ($jobStatus -eq "success") { "brightgreen" } else { "red" } + + $buildBadge = @{ + schemaVersion = 1 + label = "build" + message = $buildMsg + color = $buildColor + } | ConvertTo-Json -Depth 2 + + $buildPath = ".badge-public/nabla" + New-Item -ItemType Directory -Path $buildPath -Force | Out-Null + $buildBadge | Set-Content -Path "$buildPath/build.json" -Encoding utf8 + + - name: Create Image Size Badge + run: | + $owner = "${{ github.repository_owner }}" + $package = "nabla-shader-compiler-godbolt" + $image = "ghcr.io/${owner}/${package}:latest".ToLower() + $manifest = docker manifest inspect $image | ConvertFrom-Json + + if ($manifest.manifests) { + $totalSize = ($manifest.manifests | Measure-Object -Property size -Sum).Sum + } elseif ($manifest.layers) { + $totalSize = ($manifest.layers | Measure-Object -Property size -Sum).Sum + } else { + Write-Error "No valid size information found in manifest." 
+ exit 1 + } + + $sizeMB = [Math]::Round($totalSize / 1MB, 2) + $size = "$sizeMB MB" + + $imageBadge = @{ + schemaVersion = 1 + label = $image + message = $size + color = "blue" + } | ConvertTo-Json -Depth 2 + + $imagePath = ".badge-public/packages/nabla-shader-compiler-nsc" + New-Item -ItemType Directory -Path $imagePath -Force | Out-Null + $imageBadge | Set-Content -Path "$imagePath/image-badge.json" -Encoding utf8 + + - name: Deploy Badges + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_branch: badges + publish_dir: .badge-public + keep_files: true + commit_message: "[CI] badges update" + + deploy-production: + name: Deploy to production host + if: ${{ always() && github.ref == 'refs/heads/master' }} + needs: build-windows + runs-on: ubuntu-latest + + steps: + - name: Pull latest images, re-run containers + uses: appleboy/ssh-action@v1 + with: + host: ${{ secrets.CE_HOST }} + username: ${{ secrets.CE_USER }} + key: ${{ secrets.CE_KEY }} + script: | + powershell -NoLogo -NoProfile -ExecutionPolicy Bypass -NoExit -File C:\Scripts\startup-docker.ps1 diff --git a/.github/workflows/run-nsc.yml b/.github/workflows/run-nsc.yml new file mode 100644 index 0000000000..d5f9f74c2b --- /dev/null +++ b/.github/workflows/run-nsc.yml @@ -0,0 +1,264 @@ +name: Run NSC Godbolt Container + +on: + workflow_dispatch: + inputs: + run_id: + description: "The id of the workflow run where the desired download artifact was uploaded from" + required: true + build_config: + description: "Build configuration (Release / RelWithDebInfo / Debug)" + required: true + default: "Release" + type: choice + options: + - Release + - RelWithDebInfo + - Debug + tunnelDurationHours: + description: "Hours amount the restricted tunnel should stay up" + required: true + default: "1" + type: choice + options: + - "1" + - "2" + - "3" + - "4" + - "5" + withDiscordMSG: + description: "Send Discord message after tunnel is up" + required: true + default: true + type: boolean + +jobs: + run-container: + runs-on: windows-2022 + env: + DISCORD_WEBHOOK: ${{ secrets.DC_ACTIONS_WEBHOOK }} + + steps: + - name: Environment Setup + run: | + Add-MpPreference -ExclusionPath "${{ github.workspace }}" + Add-MpPreference -ExclusionExtension "*.*" + Add-MpPreference -ExclusionProcess "docker.exe" + Add-MpPreference -ExclusionProcess "dockerd.exe" + Set-MpPreference -RemediationScheduleDay 8 + Set-MpPreference -DisableRealtimeMonitoring $true + Set-MpPreference -DisableRemovableDriveScanning $true + Set-MpPreference -DisableArchiveScanning $true + Set-MpPreference -DisableScanningMappedNetworkDrivesForFullScan $true + + if (-not (docker network ls --format '{{.Name}}' | Where-Object { $_ -eq 'docker_default' })) { + docker network create --driver nat docker_default + if ($LASTEXITCODE -ne 0) { exit 1 } + } + + $sendDiscord = "${{ inputs.withDiscordMSG }}" -eq "true" + Write-Host "::notice::Should send discord message? 
$sendDiscord" + + - name: Download Restricted Reverse Proxy binaries, setup NGINX config + run: | + Invoke-WebRequest -Uri https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -OutFile cloudflared.exe + Invoke-WebRequest -Uri "https://nginx.org/download/nginx-1.24.0.zip" -OutFile nginx.zip + Expand-Archive nginx.zip -DestinationPath nginx + + Remove-Item -Recurse -Force "nginx/nginx-1.24.0/conf" + New-Item -ItemType Directory -Path "nginx/nginx-1.24.0/conf" -Force | Out-Null + + '${{ secrets.NSC_BASIC_AUTH_HTPASSWD }}' | Out-File nginx/nginx-1.24.0/conf/.htpasswd -Encoding ascii + $htpasswdPath = (Resolve-Path "nginx/nginx-1.24.0/conf/.htpasswd").Path -replace '\\', '/' + + @" + events {} + + http { + server { + listen 10241; + + location / { + auth_basic "Restricted Compiler Explorer access for Development & NSC Artifact Tests, downloaded from Nabla actions pipeline"; + auth_basic_user_file "$htpasswdPath"; + + proxy_pass http://127.0.0.1:10240; + proxy_set_header Host `$host; + proxy_set_header X-Real-IP `$remote_addr; + } + } + } + "@ | Out-File nginx/nginx-1.24.0/conf/nginx.conf -Encoding ascii + + Write-Host "::group::Generated nginx.conf" + Get-Content nginx/nginx-1.24.0/conf/nginx.conf + Write-Host "::endgroup::" + + & "nginx/nginx-1.24.0/nginx.exe" -t -p "nginx/nginx-1.24.0" -c "conf/nginx.conf" + + - name: Download NSC Godbolt artifact + uses: actions/download-artifact@v4 + with: + run-id: ${{ inputs.run_id }} + pattern: run-windows-*-msvc-${{ inputs.build_config }}-nsc-godbolt-image + path: artifact + merge-multiple: true + github-token: ${{ secrets.READ_PAT }} + repository: Devsh-Graphics-Programming/Nabla + + - name: Decompress .tar.zst + run: | + Get-ChildItem artifact -Filter *.tar.zst | ForEach-Object { + $output = $_.FullName -replace '\.zst$', '' + zstd -d "$($_.FullName)" -o "$output" + } + + - name: Load Docker image + run: | + $image = Get-ChildItem artifact -Filter *.tar | Select-Object -First 1 + docker load -i $image.FullName + + - name: Generate and run Docker Compose with matched image + run: | + $imageName = docker image ls --format "{{.Repository}}:{{.Tag}}" | + Where-Object { $_ -like "ghcr.io/devsh-graphics-programming/nabla-shader-compiler-godbolt:build-*" } | + Select-Object -First 1 + + if (-not $imageName) { + Write-Error "Could not find image with tag matching ghcr.io/devsh-graphics-programming/nabla-shader-compiler-godbolt:build-*" + exit 1 + } + + Write-Host "Found image: $imageName" + + @" + services: + nsc: + container_name: nsc-godbolt + image: $imageName + isolation: process + ports: + - "10240:10240" + volumes: + - type: bind + source: C:\Windows\Globalization\ICU + target: C:\Windows\Globalization\ICU + read_only: true + - type: bind + source: C:\Windows\System32 + target: C:\mount\Windows\System32 + read_only: true + networks: + - docker_default + + networks: + docker_default: + external: true + "@ | Set-Content compose.generated.yml + + docker compose -f compose.generated.yml up -d + + - name: Wait for NSC container response on port + run: | + $maxRetries = 24 + $retryDelay = 5 + $success = $false + + for ($i = 0; $i -lt $maxRetries; $i++) { + try { + $response = Invoke-WebRequest -Uri "http://localhost:10240" -UseBasicParsing -TimeoutSec 5 + if ($response.StatusCode -eq 200) { + Write-Host "NSC container is up listening on port 10240 and responding." + $success = $true + break + } else { + Write-Host "Received HTTP $($response.StatusCode), retrying..." 
+ } + } catch { + Write-Host "NSC container is not responding on port 10240, retrying..." + } + Start-Sleep -Seconds $retryDelay + } + + if (-not $success) { + Write-Error "No response from NSC container on port 10240, timeout." + exit 1 + } + + - name: Print NSC container logs + run: | + docker logs nsc-godbolt + + - name: Start Restricted Tunnel + env: + DISCORD_ENABLED: ${{ inputs.withDiscordMSG }} + TUNNEL_DURATION_HOURS: ${{ inputs.tunnelDurationHours }} + run: | + Start-Process -NoNewWindow -FilePath .\nginx\nginx-1.24.0\nginx.exe -ArgumentList '-p', (Join-Path $PWD 'nginx/nginx-1.24.0'), '-c', 'conf/nginx.conf' + Start-Process -NoNewWindow -FilePath .\cloudflared.exe -ArgumentList "tunnel", "--url", "http://localhost:10241", "--logfile", "cf.log" + netstat -an | findstr 10241 + + $tries = 60 + $url = $null + + while ($tries -gt 0) { + if (Test-Path cf.log) { + $log = Get-Content cf.log + foreach ($line in $log) { + if ($line -match 'https:\/\/[a-zA-Z0-9\-]+\.trycloudflare\.com') { + $url = $Matches[0] + Write-Host "::notice title=Tunnel URL::$url" + break + } + } + if ($url) { break } + } + Start-Sleep -Seconds 1 + $tries -= 1 + } + + if (-not $url) { + Write-Error "Could not get tunnel URL from cloudflared log" + exit 1 + } + + $webhookUrl = "$env:DISCORD_WEBHOOK" + $thisWorkflowRunID = "${{ github.run_id }}" + $artifactWorkflowRunID = "${{ inputs.run_id }}" + $thisWorkflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/$thisWorkflowRunID" + $artifactWorkflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/$artifactWorkflowRunID" + $actor = "$env:GITHUB_ACTOR" + $sendDiscord = "$env:DISCORD_ENABLED" -eq "true" + $hours = [int]$env:TUNNEL_DURATION_HOURS + $duration = $hours * 3600 + + Write-Host "Blocking job for $hours hours" + + $description = @" + - tunnel opened for $hours hours, click [here](<$url>) to connect + - requires authentication + - workflow [logs #$thisWorkflowRunID](<$thisWorkflowRunURL>) + - image downloaded from [run #$artifactWorkflowRunID](<$artifactWorkflowRunURL>) + - dispatched by $actor + "@ + + $payload = @{ + embeds = @( + @{ + title = "Running NSC Godbolt Container" + description = $description + color = 15844367 + footer = @{ text = "sent from GitHub Actions runner" } + timestamp = (Get-Date).ToString("o") + } + ) + } | ConvertTo-Json -Depth 10 + + if ($sendDiscord) { + Write-Host "Sending Discord webhook..." 
+ Invoke-RestMethod -Uri $webhookUrl -Method Post -ContentType 'application/json' -Body $payload + } else { + Write-Host "Discord webhook disabled" + } + + Start-Sleep -Seconds $duration diff --git a/.gitmodules b/.gitmodules index 8edc1cead9..00482441de 100644 --- a/.gitmodules +++ b/.gitmodules @@ -27,9 +27,6 @@ path = 3rdparty/libexpat url = git@github.com:Devsh-Graphics-Programming/libexpat.git branch = master -[submodule "3rdparty/glm"] - path = 3rdparty/glm - url = git@github.com:AnastaZIuk/glm.git [submodule "3rdparty/freetype2"] path = 3rdparty/freetype2 url = git@github.com:Devsh-Graphics-Programming/freetype.git @@ -54,6 +51,7 @@ [submodule "3rdparty/glTFSampleModels"] path = 3rdparty/glTFSampleModels url = git@github.com:Devsh-Graphics-Programming/glTF-Sample-Models.git + update = none [submodule "3rdparty/nbl_spirv_cross"] path = 3rdparty/nbl_spirv_cross url = git@github.com:devshgraphicsprogramming/SPIRV-Cross.git @@ -89,7 +87,7 @@ url = git@github.com:Devsh-Graphics-Programming/Nabla-Continous-Integration-Python-Framework.git [submodule "3rdparty/boost/superproject"] path = 3rdparty/boost/superproject - url = git@github.com:boostorg/boost.git + url = ../boost.git [submodule "3rdparty/argparse"] path = 3rdparty/argparse url = git@github.com:p-ranav/argparse.git @@ -117,3 +115,9 @@ [submodule "docker/compiler-explorer"] path = docker/compiler-explorer url = git@github.com:Devsh-Graphics-Programming/Compiler-Explorer-Docker.git +[submodule "3rdparty/glm"] + path = 3rdparty/glm + url = git@github.com:Devsh-Graphics-Programming/glm.git +[submodule "docker/msvc-winsdk"] + path = docker/msvc-winsdk + url = ../docker-nanoserver-msvc-winsdk diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 56752880ae..5bd2d6859f 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -4,6 +4,9 @@ include(../cmake/common.cmake) +project(Nabla-3rdparty LANGUAGES CXX C) +enable_language(C CXX ASM ASM_NASM) + option(NBL_FORCE_RELEASE_3RDPARTY "Force map 3rdaprty's configuration regardless Nabla configuration to Release" OFF) option(NBL_FORCE_RELWITHDEBINFO_3RDPARTY "Force map 3rdaprty's configuration regardless Nabla configuration to RelWithDebInfo" OFF) @@ -231,7 +234,7 @@ if(_NBL_COMPILE_WITH_OPEN_EXR_) endif() -#gli +# gli option(_NBL_COMPILE_WITH_GLI_ "Build with GLI library" ON) if(_NBL_COMPILE_WITH_GLI_) set(_OLD_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) @@ -240,12 +243,23 @@ if(_NBL_COMPILE_WITH_GLI_) set(BUILD_SHARED_LIBS OFF) set(BUILD_STATIC_LIBS OFF) set(BUILD_TESTING OFF) + set(GLI_GLM_LOCATION "${CMAKE_CURRENT_SOURCE_DIR}/glm") add_subdirectory(gli gli EXCLUDE_FROM_ALL) set(BUILD_SHARED_LIBS ${_OLD_BUILD_SHARED_LIBS}) set(BUILD_STATIC_LIBS ${_OLD_BUILD_STATIC_LIBS}) set(BUILD_TESTING ${_OLD_BUILD_TESTING}) endif() +set(ENABLE_STATIC_LIB ON) +set(ENABLE_SHARED_LIB OFF) +set(ENABLE_EXAMPLES OFF) +set(ENABLE_DOCS OFF) +set(ENABLE_APP OFF) +set(ENABLE_LIB_ONLY ON) +set(ENABLE_TESTS OFF) +set(ENABLE_SUMMARY OFF) +add_subdirectory(bzip2 bzip2 EXCLUDE_FROM_ALL) + add_library(lzma OBJECT lzma/C/Alloc.c lzma/C/LzFind.c @@ -262,17 +276,6 @@ add_library(lz4 OBJECT lz4/lib/xxhash.c ) - -add_library(bzip2 OBJECT - bzip2/blocksort.c - bzip2/bzlib.c - bzip2/compress.c - bzip2/crctable.c - bzip2/decompress.c - bzip2/huffman.c - bzip2/randtable.c -) - add_library(spirv_cross OBJECT nbl_spirv_cross/spirv_cfg.cpp nbl_spirv_cross/spirv_cross.cpp @@ -419,12 +422,6 @@ add_library(aesGladman OBJECT add_subdirectory(argparse argparse EXCLUDE_FROM_ALL) 
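
A minimal sketch of how the consolidated MSVC runtime-library selection used further down in this file (and mirrored in 3rdparty/dxc/CMakeLists.txt) reads in full, assuming the new NBL_COMPILER_DYNAMIC_RUNTIME option is what gates the DLL suffix:

# sketch, not the verbatim hunk: one generator expression replaces the old
# NBL_DYNAMIC_MSVC_RUNTIME if()/else() branches; ${trgt} is the loop variable
# iterating NBL_3RDPARTY_TARGETS in the hunk below
set_property(TARGET ${trgt} PROPERTY MSVC_RUNTIME_LIBRARY
  "MultiThreaded$<$<CONFIG:Debug>:Debug>$<$<BOOL:${NBL_COMPILER_DYNAMIC_RUNTIME}>:DLL>")
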
-option(GLM_TEST_ENABLE_SIMD_SSE4_2 "Enable SSE 4.2 optimizations" ON) -option(GLM_TEST_ENABLE "Build unit tests" OFF) -#add_subdirectory(glm EXCLUDE_FROM_ALL) -set(BUILD_SHARED_LIBS ${_OLD_BUILD_SHARED_LIBS}) -set(BUILD_STATIC_LIBS ${_OLD_BUILD_STATIC_LIBS}) - if (NBL_BUILD_MITSUBA_LOADER) option(BUILD_tools "EXPAT: build the xmlwf tool for expat library" OFF) option(BUILD_examples "EXPAT: build the examples for expat library" OFF) @@ -465,7 +462,7 @@ set(NBL_3RDPARTY_TARGETS shaderc_util shaderc jpeg-static - bzip2 + bz2_static simdjson nlohmann_json glslang @@ -496,11 +493,7 @@ if (NBL_BUILD_IMGUI) endif() foreach(trgt IN LISTS NBL_3RDPARTY_TARGETS) - if(NBL_DYNAMIC_MSVC_RUNTIME) - set_property(TARGET ${trgt} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") - else() - set_property(TARGET ${trgt} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") - endif() + set_property(TARGET ${trgt} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>$<$:DLL>") if(MSVC AND NBL_SANITIZE_ADDRESS) set_property(TARGET ${trgt} PROPERTY COMPILE_OPTIONS /fsanitize=address) diff --git a/3rdparty/boost/CMakeLists.txt b/3rdparty/boost/CMakeLists.txt index f3460fe8d6..3c95234b8e 100644 --- a/3rdparty/boost/CMakeLists.txt +++ b/3rdparty/boost/CMakeLists.txt @@ -1,3 +1,81 @@ +get_filename_component(NBL_BOOST_WAVE_DEP_FILE "${CMAKE_CURRENT_SOURCE_DIR}/dep/wave.cmake" ABSOLUTE) + +# Boost uses it's own tool for generating dependency list for targets, therefore we +# can make sure manually added dependency subdirectories for a library are valid +# https://www.boost.org/doc/libs/1_83_0/tools/boostdep/doc/html/index.html#boostdep.introduction.building_boostdep + +if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs + if(NOT WIN32) + message(FATAL_ERROR "NBL_BOOST_GENERATE_DEP_LIST only for Windows host!") + endif() + + macro(NBL_BOOST_EXECUTE) + execute_process(COMMAND ${ARGV} WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/superproject") + endmacro() + + NBL_BOOST_EXECUTE(git config --file .gitmodules --get-regexp path OUTPUT_VARIABLE NBL_OUTPUT_VARIABLE) + string(REGEX REPLACE "\n" ";" NBL_SUBMODULE_CONFIG_LIST "${NBL_OUTPUT_VARIABLE}") + + foreach(NBL_SUBMODULE_NAME ${NBL_SUBMODULE_CONFIG_LIST}) + string(REGEX MATCH "submodule\\.(.*)\\.path" NBL_SUBMODULE_NAME "${NBL_SUBMODULE_NAME}") + list(APPEND BOOST_SUBMODULES "${CMAKE_MATCH_1}") + endforeach() + + # sync & force update of all boost modules first for the tool purpose (sry guys who use the tool, you need to clone all, I want to keep it simple) + NBL_BOOST_EXECUTE(git submodule sync) + list(APPEND BOOST_FORCE_ALL_CONFIG -c url.https://github.com/.insteadOf=git@github.com:) + foreach(SUBMODULE ${BOOST_SUBMODULES}) + list(APPEND BOOST_FORCE_ALL_CONFIG -c submodule.${SUBMODULE}.update=checkout) + endforeach() + + NBL_BOOST_EXECUTE(git ${BOOST_FORCE_ALL_CONFIG} submodule update --init --recursive -f) + + # build boost dep executable + set(NBL_BOOSTDEP_EXE "boostdep.exe") + set(NBL_BOOSTDEP_EXE_FILEPATH "${CMAKE_CURRENT_BINARY_DIR}/superproject/tools/boostdep/bin/${NBL_BOOSTDEP_EXE}") + if(NOT EXISTS "${NBL_BOOSTDEP_EXE_FILEPATH}") + NBL_BOOST_EXECUTE(cmd /C bootstrap.bat) + NBL_BOOST_EXECUTE(cmd /C b2.exe tools/boostdep/build) + NBL_BOOST_EXECUTE("${CMAKE_COMMAND}" -E copy "./dist/bin/${NBL_BOOSTDEP_EXE}" "${NBL_BOOSTDEP_EXE_FILEPATH}") + NBL_BOOST_EXECUTE(git clean -fdx) + NBL_BOOST_EXECUTE(git reset --hard) + endif() + + # get wave dependency info + NBL_BOOST_EXECUTE("${NBL_BOOSTDEP_EXE_FILEPATH}" --boost-root 
"${CMAKE_CURRENT_SOURCE_DIR}/superproject" --brief wave + OUTPUT_VARIABLE NBL_OUTPUT_VAR + ) + + file(WRITE "${NBL_BOOST_WAVE_DEP_FILE}" "${NBL_OUTPUT_VAR}") + + file(STRINGS "${NBL_BOOST_WAVE_DEP_FILE}" NBL_BOOST_LIBS) + set(NBL_BOOST_LIBS ${NBL_BOOST_LIBS}) + list(POP_FRONT NBL_BOOST_LIBS) + list(FILTER NBL_BOOST_LIBS EXCLUDE REGEX "#") + list(FILTER NBL_BOOST_LIBS EXCLUDE REGEX "(unknown)") + string(REPLACE "~" "/" NBL_BOOST_LIBS "${NBL_BOOST_LIBS}") + + # update boost .gitmodules configuration, discard all but modules reported by wave + # NOTE: you commit this file to version control AND boost's .gitmodules *if got changed*, + # use when updating boost to more recent version + file(WRITE "${NBL_BOOST_WAVE_DEP_FILE}" "set(NBL_BOOST_LIBS ${NBL_BOOST_LIBS})") + + message(STATUS "Updating boost .gitmodules") + foreach(SUBMODULE ${BOOST_SUBMODULES}) + # 1) fallback, ignore all + NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.${SUBMODULE}.update none) + endforeach() + + foreach(NAME ${NBL_BOOST_LIBS}) + string(REPLACE "/" "_" SUBMODULE "${NAME}") + message(STATUS "WAVE BOOST DEP SUBMODULE = ${SUBMODULE}") + # 2) pick only submodules reported by wave + NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.${SUBMODULE}.update checkout) + endforeach() + # 3) and the top module itself + NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.wave.update checkout) +endif() + set(BOOST_PREPROCESSOR_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/superproject/libs/preprocessor/include" CACHE PATH "" FORCE) get_filename_component(_BOOST_PREPROCESSOR_BR_BUNDLE_SEARCH_DIRECTORY_ "${BOOST_PREPROCESSOR_INCLUDE}" ABSOLUTE) @@ -17,8 +95,6 @@ if(NBL_EMBED_BUILTIN_RESOURCES) ADD_CUSTOM_BUILTIN_RESOURCES(boostBuiltinResourceData BOOST_RESOURCES_TO_EMBED "${_BOOST_PREPROCESSOR_BR_BUNDLE_SEARCH_DIRECTORY_}" "boost" "boost::builtin" "${_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_HEADER_}" "${_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_HEADER_}" "STATIC" "INTERNAL") endif() -get_filename_component(NBL_BOOST_WAVE_DEP_FILE "${CMAKE_CURRENT_SOURCE_DIR}/dep/wave.cmake" ABSOLUTE) - if(NOT EXISTS "${NBL_BOOST_WAVE_DEP_FILE}") message(FATAL_ERROR "Internal error, generate NBL_BOOST_WAVE_DEP_FILE by enabling NBL_BOOST_GENERATE_DEP_LIST!") endif() @@ -41,47 +117,4 @@ endforeach() set(NBL_BOOST_TARGETS ${NBL_BOOST_TARGETS} -PARENT_SCOPE) - -# Boost uses it's own tool for generating dependency list for targets, therefore we -# can make sure manually added dependnecy subdirectories for a library are valid -# https://www.boost.org/doc/libs/1_83_0/tools/boostdep/doc/html/index.html#boostdep.introduction.building_boostdep - -if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs - if(WIN32) - set(NBL_BOOSTDEP_EXE "boostdep.exe") - else() - set(NBL_BOOSTDEP_EXE "boostdep") - endif() - - set(NBL_BOOSTDEP_EXE_FILEPATH "${CMAKE_CURRENT_BINARY_DIR}/superproject/tools/boostdep/bin/${NBL_BOOSTDEP_EXE}") - - if(NOT EXISTS "${NBL_BOOSTDEP_EXE_FILEPATH}") - macro(NBL_BOOST_EXECUTE) - execute_process(COMMAND ${ARGV} - WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/superproject" - ) - endmacro() - - NBL_BOOST_EXECUTE(cmd /C bootstrap.bat) - NBL_BOOST_EXECUTE(cmd /C b2.exe tools/boostdep/build) - NBL_BOOST_EXECUTE("${CMAKE_COMMAND}" -E copy "./dist/bin/${NBL_BOOSTDEP_EXE}" "${NBL_BOOSTDEP_EXE_FILEPATH}") - NBL_BOOST_EXECUTE(git clean -fdx) - NBL_BOOST_EXECUTE(git reset --hard) - endif() - - execute_process(COMMAND "${NBL_BOOSTDEP_EXE_FILEPATH}" --boost-root "${CMAKE_CURRENT_SOURCE_DIR}/superproject" --brief wave - OUTPUT_VARIABLE 
NBL_OUTPUT_VAR - ) - - file(WRITE "${NBL_BOOST_WAVE_DEP_FILE}" "${NBL_OUTPUT_VAR}") - - file(STRINGS "${NBL_BOOST_WAVE_DEP_FILE}" NBL_BOOST_LIBS) - set(NBL_BOOST_LIBS ${NBL_BOOST_LIBS}) - list(POP_FRONT NBL_BOOST_LIBS) - list(FILTER NBL_BOOST_LIBS EXCLUDE REGEX "#") - list(FILTER NBL_BOOST_LIBS EXCLUDE REGEX "(unknown)") - string(REPLACE "~" "/" NBL_BOOST_LIBS "${NBL_BOOST_LIBS}") - - file(WRITE "${NBL_BOOST_WAVE_DEP_FILE}" "set(NBL_BOOST_LIBS ${NBL_BOOST_LIBS})") -endif() +PARENT_SCOPE) \ No newline at end of file diff --git a/3rdparty/boost/superproject b/3rdparty/boost/superproject index 1c4d3531e4..3b9e116eee 160000 --- a/3rdparty/boost/superproject +++ b/3rdparty/boost/superproject @@ -1 +1 @@ -Subproject commit 1c4d3531e416a1f72b0e6a5e0f7173f93cf97e92 +Subproject commit 3b9e116eeee85ab8fd0d8e5a97364fff5f02eb86 diff --git a/3rdparty/bzip2 b/3rdparty/bzip2 index c4a14bb87e..f4301b0eac 160000 --- a/3rdparty/bzip2 +++ b/3rdparty/bzip2 @@ -1 +1 @@ -Subproject commit c4a14bb87ee395fb2c69ef5dbb50762fe862517e +Subproject commit f4301b0eac69eb109c5419813102be6f82d2b73a diff --git a/3rdparty/dxc/CMakeLists.txt b/3rdparty/dxc/CMakeLists.txt index 8b48c0e5a6..9432b4df07 100644 --- a/3rdparty/dxc/CMakeLists.txt +++ b/3rdparty/dxc/CMakeLists.txt @@ -41,6 +41,7 @@ list(APPEND NBL_DXC_CMAKE_OPTIONS "-DSPIRV_SKIP_EXECUTABLES:BOOL=ON") list(APPEND NBL_DXC_CMAKE_OPTIONS "-DHLSL_ENABLE_DEBUG_ITERATORS:BOOL=ON") list(APPEND NBL_DXC_CMAKE_OPTIONS "-DDXC_SPIRV_TOOLS_DIR=${DXC_SPIRV_TOOLS_DIR}") list(APPEND NBL_DXC_CMAKE_OPTIONS "-DDXC_SPIRV_HEADERS_DIR=${DXC_SPIRV_HEADERS_DIR}") +list(APPEND NBL_DXC_CMAKE_OPTIONS "-DDXC_ENABLE_ETW=OFF") if(NOT NBL_IS_MULTI_CONFIG) list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}") @@ -63,11 +64,7 @@ if(WIN32) endif() endif() -if(NBL_DYNAMIC_MSVC_RUNTIME) - list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_MSVC_RUNTIME_LIBRARY:STATIC=MultiThreaded$<$:Debug>DLL") -else() - list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_MSVC_RUNTIME_LIBRARY:STATIC=MultiThreaded$<$:Debug>") -endif() +list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_MSVC_RUNTIME_LIBRARY:STATIC=MultiThreaded$<$:Debug>$<$:DLL>") # perform DXC compile standard requirement test set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -89,18 +86,23 @@ endif() set(DXC_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/build" CACHE INTERNAL "") -if(MSVC AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja Multi-Config" AND NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja")) - execute_process(COMMAND "${CMAKE_COMMAND}" -C "${CMAKE_CURRENT_SOURCE_DIR}/dxc/cmake/caches/PredefinedParams.cmake" -S "${CMAKE_CURRENT_SOURCE_DIR}/dxc" -B "${DXC_BUILD_DIR}" -G "${CMAKE_GENERATOR}" "-Ax64" -T "${CMAKE_GENERATOR_TOOLSET}" ${NBL_DXC_CMAKE_OPTIONS} - RESULT_VARIABLE DXC_CMAKE_RESULT - OUTPUT_VARIABLE DXC_CMAKE_STREAM_PIPE - ) -else() - execute_process(COMMAND "${CMAKE_COMMAND}" -C "${CMAKE_CURRENT_SOURCE_DIR}/dxc/cmake/caches/PredefinedParams.cmake" -S "${CMAKE_CURRENT_SOURCE_DIR}/dxc" -B "${DXC_BUILD_DIR}" -G "${CMAKE_GENERATOR}" -T "${CMAKE_GENERATOR_TOOLSET}" ${NBL_DXC_CMAKE_OPTIONS} - RESULT_VARIABLE DXC_CMAKE_RESULT - OUTPUT_VARIABLE DXC_CMAKE_STREAM_PIPE - ) +if(NOT CMAKE_GENERATOR MATCHES "Ninja*") + list(APPEND NBL_DXC_CMAKE_OPTIONS -Ax64) +endif() + +if(CMAKE_GENERATOR_TOOLSET) + list(APPEND NBL_DXC_CMAKE_OPTIONS -T "${CMAKE_GENERATOR_TOOLSET}") endif() +if(CMAKE_TOOLCHAIN_FILE) + list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") +endif() + +execute_process(COMMAND "${CMAKE_COMMAND}" -C 
"${CMAKE_CURRENT_SOURCE_DIR}/dxc/cmake/caches/PredefinedParams.cmake" -S "${CMAKE_CURRENT_SOURCE_DIR}/dxc" -B "${DXC_BUILD_DIR}" -G "${CMAKE_GENERATOR}" ${NBL_DXC_CMAKE_OPTIONS} + RESULT_VARIABLE DXC_CMAKE_RESULT + OUTPUT_VARIABLE DXC_CMAKE_STREAM_PIPE +) + if(NOT "${DXC_CMAKE_RESULT}" STREQUAL "0") message(FATAL_ERROR "${DXC_CMAKE_STREAM_PIPE}") endif() diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 5ab4d368b6..71f2766da9 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 5ab4d368b666d365217c751f5610b496b828ff96 +Subproject commit 71f2766da918d33d34fefac270fdee983a06dd20 diff --git a/3rdparty/gli b/3rdparty/gli index 559cbe1ec3..c4e6446d3b 160000 --- a/3rdparty/gli +++ b/3rdparty/gli @@ -1 +1 @@ -Subproject commit 559cbe1ec38878e182507d331e0780fbae5baf15 +Subproject commit c4e6446d3b646538026fd5a95533daed952878d4 diff --git a/3rdparty/glm b/3rdparty/glm index d162eee1e6..2d4c4b4dd3 160000 --- a/3rdparty/glm +++ b/3rdparty/glm @@ -1 +1 @@ -Subproject commit d162eee1e6f7c317a09229fe6ceab8ec6ab9a4b4 +Subproject commit 2d4c4b4dd31fde06cfffad7915c2b3006402322f diff --git a/CMakeLists.txt b/CMakeLists.txt index c819c644eb..c6664f8085 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,11 +1,24 @@ -# Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +# Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O. # This file is part of the "Nabla Engine". # For conditions of distribution and use, see copyright notice in nabla.h.in or nabla.h +cmake_minimum_required(VERSION 3.31) -cmake_minimum_required(VERSION 3.29) -cmake_policy(SET CMP0112 NEW) -cmake_policy(SET CMP0141 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0141.html#policy:CMP0141 -cmake_policy(SET CMP0118 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0118.html#policy:CMP0118 +# TODO: Yas - once we deploy 4.x we will fire `cmake_policy(VERSION [...])` instead of manually picking policies +# https://cmake.org/cmake/help/latest/command/cmake_minimum_required.html#policy-version +# also we should update deps which throw warnings about < 3.10 compatibility + +macro(NBL_POLICY P S) +if(POLICY ${P}) + cmake_policy(SET ${P} ${S}) + set(CMAKE_POLICY_DEFAULT_${P} ${S}) +endif() +endmacro() + +NBL_POLICY(CMP0003 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0003.html#cmp0003 +NBL_POLICY(CMP0077 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0077.html#cmp0077 +NBL_POLICY(CMP0112 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0112.html#cmp0112 +NBL_POLICY(CMP0141 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0141.html#policy:CMP0141 +NBL_POLICY(CMP0118 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0118.html#policy:CMP0118 set(NBL_BUILD_ANDROID OFF) @@ -20,29 +33,19 @@ if(MSVC) link_libraries(delayimp) endif() +# TODO: TO BE KILLED, keep both in one tree option(NBL_STATIC_BUILD "" OFF) # ON for static builds, OFF for shared -option(NBL_DYNAMIC_MSVC_RUNTIME "" ON) + +option(NBL_COMPILER_DYNAMIC_RUNTIME "" ON) option(NBL_SANITIZE_ADDRESS OFF) -if(MSVC) - if(NBL_SANITIZE_ADDRESS) - set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:ProgramDatabase>") - else() - set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:EditAndContinue>$<$:ProgramDatabase>") - endif() -endif() +set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT $<$:ProgramDatabase>) # ignored on non xMSVC-ABI targets if(NBL_STATIC_BUILD) message(STATUS "Static Nabla build enabled!") else() - if(MSVC) - if(NBL_DYNAMIC_MSVC_RUNTIME) - message(STATUS "Shared Nabla build enabled!") - else() - message(FATAL_ERROR 
"Turn NBL_DYNAMIC_MSVC_RUNTIME on! For dynamic Nabla builds dynamic MSVC runtime is mandatory!") - endif() - else() - message(FATAL_ERROR "Nabla can't be built with shared libraries! Please make sure you are targetting Windows OS and MSVC compiler!") + if(NOT NBL_COMPILER_DYNAMIC_RUNTIME) + message(FATAL_ERROR "Turn NBL_COMPILER_DYNAMIC_RUNTIME on! For dynamic Nabla builds dynamic runtime is mandatory!") endif() endif() diff --git a/CMakePresets.json b/CMakePresets.json index 8d0b62367a..ae56cf1739 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -2,15 +2,14 @@ "version": 6, "cmakeMinimumRequired": { "major": 3, - "minor": 29, - "patch": 2 + "minor": 31, + "patch": 0 }, "configurePresets": [ { "name": "ci-configure-base", "hidden": true, "cacheVariables": { - "NBL_CI_MODE": "ON", "NBL_UPDATE_GIT_SUBMODULE": "OFF", "NBL_COMPILE_WITH_CUDA": "OFF", "NBL_BUILD_OPTIX": "OFF", @@ -19,8 +18,10 @@ "_NBL_COMPILE_WITH_OPEN_EXR_": "ON", "NBL_EXPLICIT_MODULE_LOAD_LOG": "ON", "NBL_CPACK_NO_BUILD_DIRECTORY_MODULES": "ON", - "NBL_RUN_TESTS": "ON", - "NBL_CPACK_CI": "ON" + "NBL_CPACK_CI": "ON", + "GIT_FAIL_IF_NONZERO_EXIT": "OFF", + "NBL_DOCKER_DIND_BUILD": "ON", + "NBL_CE_PUBLISH_PORT": "10240" } }, { @@ -46,7 +47,7 @@ "hidden": true, "inherits": "ci-configure-static-base", "cacheVariables": { - "NBL_DYNAMIC_MSVC_RUNTIME": "OFF" + "NBL_COMPILER_DYNAMIC_RUNTIME": "OFF" }, "condition": { "type": "allOf", @@ -69,7 +70,7 @@ "hidden": true, "inherits": "ci-configure-dynamic-base", "cacheVariables": { - "NBL_DYNAMIC_MSVC_RUNTIME": "ON" + "NBL_COMPILER_DYNAMIC_RUNTIME": "ON" }, "condition": { "type": "allOf", @@ -90,37 +91,35 @@ { "name": "ci-configure-static-msvc", "inherits": "ci-configure-static-windows-base", - "displayName": "[CI]: Static library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Configure as static library with Visual Studio 17 2022 generator and MSVC v143 toolset", - "generator": "Visual Studio 17 2022", - "toolset": "v143" + "generator": "Ninja Multi-Config", + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake", + "NBL_ENABLE_DOCKER_INTEGRATION": "ON" + } }, { "name": "ci-configure-dynamic-msvc", "inherits": "ci-configure-dynamic-windows-base", - "displayName": "[CI]: Dynamic library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Configure as dynamic library with Visual Studio 17 2022 generator and MSVC v143 toolset", - "generator": "Visual Studio 17 2022", - "toolset": "v143" + "generator": "Ninja Multi-Config", + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake", + "NBL_ENABLE_DOCKER_INTEGRATION": "ON" + } }, { - "name": "ci-configure-static-ninja-multi", + "name": "ci-configure-static-clangcl", "inherits": "ci-configure-static-windows-base", - "displayName": "[CI]: Static library target, Ninja multi-config generator", - "description": "Configure as static library with Ninja multi-config generator", "generator": "Ninja Multi-Config", "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-clangcl-toolchain.cmake" } }, { - "name": "ci-configure-dynamic-ninja-multi", + "name": "ci-configure-dynamic-clangcl", "inherits": "ci-configure-dynamic-windows-base", - "displayName": "[CI]: Dynamic library target, Ninja multi-config generator", - "description": "Configure as dynamic library with Ninja multi-config generator", 
"generator": "Ninja Multi-Config", "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-clangcl-toolchain.cmake" } }, { @@ -156,7 +155,7 @@ "hidden": true, "inherits": "user-configure-static-base", "cacheVariables": { - "NBL_DYNAMIC_MSVC_RUNTIME": "OFF" + "NBL_COMPILER_DYNAMIC_RUNTIME": "OFF" }, "condition": { "type": "equals", @@ -169,7 +168,7 @@ "hidden": true, "inherits": "user-configure-dynamic-base", "cacheVariables": { - "NBL_DYNAMIC_MSVC_RUNTIME": "ON" + "NBL_COMPILER_DYNAMIC_RUNTIME": "ON" }, "condition": { "type": "equals", @@ -193,6 +192,22 @@ "generator": "Visual Studio 17 2022", "toolset": "v143" }, + { + "name": "user-configure-static-clangcl", + "inherits": "user-configure-static-windows-base", + "displayName": "[USER]: Static library target, Visual Studio 17 2022 generator, ClangCL toolset", + "description": "Configure as static library with Visual Studio 17 2022 generator and ClangCL toolset", + "generator": "Visual Studio 17 2022", + "toolset": "ClangCL" + }, + { + "name": "user-configure-dynamic-clangcl", + "inherits": "user-configure-dynamic-windows-base", + "displayName": "[USER]: Dynamic library target, Visual Studio 17 2022 generator, ClangCL toolset", + "description": "Configure as dynamic library with Visual Studio 17 2022 generator and ClangCL toolset", + "generator": "Visual Studio 17 2022", + "toolset": "ClangCL" + }, { "name": "user-configure-static-ninja-multi", "inherits": "user-configure-static-windows-base", @@ -303,8 +318,6 @@ "configurePreset": "ci-configure-static-msvc", "inheritConfigureEnvironment": true, "inherits": "build-windows-base", - "displayName": "[CI]: Static library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Build Nabla as static library with Visual Studio 17 2022 generator and MSVC v143 toolset", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", @@ -316,21 +329,17 @@ "configurePreset": "ci-configure-dynamic-msvc", "inheritConfigureEnvironment": true, "inherits": "build-windows-base", - "displayName": "[CI]: Dynamic library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Build Nabla as dynamic library with Visual Studio 17 2022 generator and MSVC v143 toolset", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", "rhs": "ON" } }, - { - "name": "ci-build-static-ninja-multi", - "configurePreset": "ci-configure-static-ninja-multi", + { + "name": "ci-build-static-clangcl", + "configurePreset": "ci-configure-static-clangcl", "inheritConfigureEnvironment": true, "inherits": "build-windows-base", - "displayName": "[CI]: Static library target, Ninja multi-config generator", - "description": "Build Nabla as static library with Ninja multi-config generator", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", @@ -338,12 +347,10 @@ } }, { - "name": "ci-build-dynamic-ninja-multi", - "configurePreset": "ci-configure-dynamic-ninja-multi", + "name": "ci-build-dynamic-clangcl", + "configurePreset": "ci-configure-dynamic-clangcl", "inheritConfigureEnvironment": true, "inherits": "build-windows-base", - "displayName": "[CI]: Dynamic library target, Ninja multi-config generator", - "description": "Build Nabla as dynamic library with Ninja multi-config generator", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", @@ -450,8 +457,6 @@ "name": "ci-package-static-msvc", "inherits": "ci-package-windows-base", "configurePreset": "ci-configure-static-msvc", - "displayName": "[CI]: Static 
library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Package Nabla as static library compiled with Visual Studio 17 2022 generator and MSVC v143 toolset", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", @@ -462,32 +467,6 @@ "name": "ci-package-dynamic-msvc", "inherits": "ci-package-windows-base", "configurePreset": "ci-configure-dynamic-msvc", - "displayName": "[CI]: Dynamic library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Package Nabla as dynamic library compiled with Visual Studio 17 2022 generator and MSVC v143 toolset", - "condition": { - "type": "equals", - "lhs": "$env{NBL_CI_MODE}", - "rhs": "ON" - } - }, - { - "name": "ci-package-static-ninja-multi", - "inherits": "ci-package-windows-base", - "configurePreset": "ci-configure-static-ninja-multi", - "displayName": "[CI]: Static library target, Ninja multi-config generator", - "description": "Package Nabla as static library compiled with Ninja multi-config generator", - "condition": { - "type": "equals", - "lhs": "$env{NBL_CI_MODE}", - "rhs": "ON" - } - }, - { - "name": "ci-package-dynamic-ninja-multi", - "inherits": "ci-package-windows-base", - "configurePreset": "ci-configure-dynamic-ninja-multi", - "displayName": "[CI]: Dynamic library target, Ninja multi-config generator", - "description": "Package Nabla as dynamic library compiled with Ninja multi-config generator", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", diff --git a/README.md b/README.md index 2b85c9c460..a696846b30 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,15 @@
 Click to see the source
- Click to see the source
+ Build Status
+ License: Apache 2.0
+ Join our Discord
(badge/anchor HTML markup in this README hunk is not recoverable from this view)
# Table of Contents diff --git a/cmake/adjust/flags.cmake b/cmake/adjust/flags.cmake index 8bf2a77893..1e67914ae0 100644 --- a/cmake/adjust/flags.cmake +++ b/cmake/adjust/flags.cmake @@ -12,45 +12,173 @@ define_property(TARGET PROPERTY NBL_CONFIGURATION_MAP BRIEF_DOCS "Stores configuration map for a target, it will evaluate to the configuration it's mapped to" ) -function(NBL_REQUEST_COMPILE_OPTION_SUPPORT _NBL_COMPILE_OPTION_) - set(NBL_COMPILE_OPTION "${_NBL_COMPILE_OPTION_}") +# https://github.com/Kitware/CMake/blob/05e77b8a27033e6fd086456bd6cef28338ff1474/Modules/Internal/CheckCompilerFlag.cmake#L26C7-L26C42 +# must be cached because parse utility clears locals in the CheckCompilerFlag module +set(CHECK_COMPILER_FLAG_OUTPUT_VARIABLE NBL_COMPILER_FLAG_OUTPUT CACHE INTERNAL "") - foreach(COMPILER IN ITEMS c cxx) +# Usage: NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG CONFIG COMPILE_OPTIONS LINK_OPTIONS ) +function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) + cmake_parse_arguments(IMPL "REQUIRED" "REQUEST_VAR" "LANG;CONFIG;COMPILE_OPTIONS;LINK_OPTIONS" ${ARGN}) + + set(DEFAULT_COMPILERS c cxx) + set(REQUEST_ALL_OPTIONS_PRESENT True) + + if(NOT IMPL_LANG) + list(APPEND IMPL_LANG ${DEFAULT_COMPILERS}) + endif() + + foreach(COMPILER IN ITEMS ${IMPL_LANG}) string(TOUPPER "${COMPILER}" COMPILER_UPPER) - string(REGEX REPLACE "[-=:;/.]" "_" flag_signature "${NBL_COMPILE_OPTION}") - set(flag_var "__${COMPILER_UPPER}_Flag_${flag_signature}") - - if(COMPILER STREQUAL "c") - check_c_compiler_flag("${NBL_COMPILE_OPTION}" ${flag_var}) - elseif(COMPILER STREQUAL "cxx") - check_cxx_compiler_flag("${NBL_COMPILE_OPTION}" ${flag_var}) - endif() - - if(${flag_var}) - message(STATUS "Enabled \"${NBL_COMPILE_OPTION}\" ${COMPILER_UPPER} compile option for Nabla projects!") - set(NBL_${COMPILER_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_COMPILE_OPTIONS};${NBL_COMPILE_OPTION}" PARENT_SCOPE) - else() - message(STATUS "Disabled \"${NBL_COMPILE_OPTION}\" ${COMPILER_UPPER} compile option for Nabla projects! 
(no support)") - endif() + foreach(WHAT_OPTIONS IN ITEMS IMPL_COMPILE_OPTIONS IMPL_LINK_OPTIONS) + if(NOT ${WHAT_OPTIONS}) + continue() + endif() + + set(IMPL_OPTIONS ${${WHAT_OPTIONS}}) + string(REPLACE IMPL_ "" WHAT_OPTIONS "${WHAT_OPTIONS}") + + foreach(COMPILE_OPTION ${IMPL_OPTIONS}) + if(IMPL_CONFIG) + foreach(CONFIG ${IMPL_CONFIG}) + # TODO: validate (${CONFIG} \in ${CMAKE_CONFIGURATION_TYPES}) + string(TOUPPER "${CONFIG}" CONFIG_UPPER) + set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_${WHAT_OPTIONS} "${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_${WHAT_OPTIONS}};${COMPILE_OPTION}") + endforeach() + else() + set(NBL_${COMPILER_UPPER}_${WHAT_OPTIONS} "${NBL_${COMPILER_UPPER}_${WHAT_OPTIONS}};${COMPILE_OPTION}") + endif() + endforeach() + + if(IMPL_CONFIG) + foreach(CONFIG ${IMPL_CONFIG}) + string(TOUPPER "${CONFIG}" CONFIG_UPPER) + set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_${WHAT_OPTIONS} ${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_${WHAT_OPTIONS}} PARENT_SCOPE) + endforeach() + else() + set(NBL_${COMPILER_UPPER}_${WHAT_OPTIONS} ${NBL_${COMPILER_UPPER}_${WHAT_OPTIONS}} PARENT_SCOPE) + endif() + endforeach() endforeach() endfunction() option(NBL_REQUEST_SSE_4_2 "Request compilation with SSE 4.2 instruction set enabled for Nabla projects" ON) -option(NBL_REQUEST_SSE_AXV2 "Request compilation with SSE Intel Advanced Vector Extensions 2 for Nabla projects" ON) +option(NBL_REQUEST_SSE_AVX2 "Request compilation with SSE Intel Advanced Vector Extensions 2 for Nabla projects" ON) # profiles -if(MSVC) - include("${CMAKE_CURRENT_LIST_DIR}/template/windows/msvc.cmake") -elseif(ANDROID) - include("${CMAKE_CURRENT_LIST_DIR}/template/unix/android.cmake") -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - include("${CMAKE_CURRENT_LIST_DIR}/template/unix/gnu.cmake") -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - include("${CMAKE_CURRENT_LIST_DIR}/template/unix/clang.cmake") -else() - message(WARNING "UNTESTED COMPILER DETECTED, EXPECT WRONG OPTIMIZATION FLAGS! SUBMIT ISSUE ON GITHUB https://github.com/Devsh-Graphics-Programming/Nabla/issues") -endif() +foreach(NBL_COMPILER_LANGUAGE IN ITEMS C CXX) + # all list of all known by CMake vendors: + # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html + set(NBL_COMPILER_VENDOR "${CMAKE_${NBL_COMPILER_LANGUAGE}_COMPILER_ID}") + set(NBL_PROFILE_NAME "${NBL_COMPILER_LANGUAGE}_${NBL_COMPILER_VENDOR}") # eg. "cxx_MSVC.cmake" + set(NBL_PROFILE_PATH "${CMAKE_CURRENT_LIST_DIR}/template/vendor/${NBL_PROFILE_NAME}.cmake") + + include("${NBL_PROFILE_PATH}" RESULT_VARIABLE _NBL_FOUND_) + + if(NOT _NBL_FOUND_) + message(WARNING "UNSUPPORTED \"${NBL_COMPILER_LANGUAGE}\" COMPILER LANGUAGE FOR \"${NBL_COMPILER_VENDOR}\" DETECTED, CMAKE CONFIGURATION OR BUILD MAY FAIL AND COMPILE OPTIONS FLAGS WILL NOT BE SET! 
SUBMIT ISSUE ON GITHUB https://github.com/Devsh-Graphics-Programming/Nabla/issues") + continue() + endif() + + # a profile MUST define + + # - "NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_${WHAT}_OPTIONS" (configuration dependent) + # - "NBL_${NBL_COMPILER_LANGUAGE}_${WHAT}_OPTIONS" (global) + + # a profile MUST NOT define + # - NBL_${WHAT}_OPTIONS + + # note: + # - use NBL_REQUEST_COMPILE_OPTION_SUPPORT in profile to creates those vars + # - include reset utility in profiles to init vars with empty lists + + # TODO: DEFINITIONS for WHAT to unify the API + + foreach(WHAT COMPILE LINK) + set(NBL_OPTIONS_VAR_NAME NBL_${NBL_COMPILER_LANGUAGE}_${WHAT}_OPTIONS) + set(NBL_OPTIONS_VAR_VALUE ${${NBL_OPTIONS_VAR_NAME}}) + + if(NOT DEFINED ${NBL_OPTIONS_VAR_NAME}) + message(FATAL_ERROR "\"${NBL_PROFILE_PATH}\" did not define \"${NBL_OPTIONS_VAR_NAME}\"!") + endif() + + # update map with configuration dependent compile options + foreach(CONFIGURATION IN ITEMS RELEASE RELWITHDEBINFO DEBUG) + set(NBL_CONFIGURATION_OPTIONS_VAR_NAME NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_${WHAT}_OPTIONS) + set(NBL_CONFIGURATION_OPTIONS_VAR_VALUE ${${NBL_CONFIGURATION_OPTIONS_VAR_NAME}}) + + if(NOT DEFINED ${NBL_CONFIGURATION_OPTIONS_VAR_NAME}) + message(FATAL_ERROR "\"${NBL_PROFILE_PATH}\" did not define \"${NBL_CONFIGURATION_OPTIONS_VAR_NAME}\"!") + endif() + + set(NBL_${CONFIGURATION}_${WHAT}_OPTIONS ${NBL_${CONFIGURATION}_${WHAT}_OPTIONS} + # note that "${NBL_CONFIGURATION_OPTIONS_VAR_VALUE}" MUST NOT contain ANY + # $<$> generator expression in order to support our configuration mapping features + $<$<${WHAT}_LANGUAGE:${NBL_COMPILER_LANGUAGE}>:${NBL_CONFIGURATION_OPTIONS_VAR_VALUE}> + ) + endforeach() + + # update map with global compile options + set(NBL_${WHAT}_OPTIONS ${NBL_${WHAT}_OPTIONS} + $<$<${WHAT}_LANGUAGE:${NBL_COMPILER_LANGUAGE}>:${NBL_${NBL_COMPILER_LANGUAGE}_${WHAT}_OPTIONS}> + ) + endforeach() + + block() + # validate build with a vendor profile, any warning diagnostic = error + # if you hit error it means the profile generates diagnostics due to: + # - an option (compile or link) which doesn't exist (typo? check vendor docs) + # - a set of options which invalidates an option (eg. 
MSVC's /INCREMENTAL with /LTCG:incremental is invalid, however linker will emit a warning by default + do a fall-back) + # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_FLAGS.html#variable:CMAKE_%3CLANG%3E_FLAGS + # https://cmake.org/cmake/help/latest/module/CheckCompilerFlag.html#command:check_compiler_flag + + set(CMAKE_${NBL_COMPILER_LANGUAGE}_FLAGS) + + foreach(CONFIGURATION IN ITEMS Release RelWithDebInfo Debug) + set(CMAKE_TRY_COMPILE_CONFIGURATION ${CONFIGURATION}) + string(TOUPPER "${CONFIGURATION}" CONFIGURATION) + + set(TEST_NAME "NBL_${NBL_COMPILER_LANGUAGE}_LANG_${CONFIGURATION}_BUILD_OPTIONS_SUPPORT") + set(CMAKE_${NBL_COMPILER_LANGUAGE}_FLAGS_${CONFIGURATION}) + + set(COMPILE_OPTIONS ${NBL_${NBL_COMPILER_LANGUAGE}_COMPILE_OPTIONS} ${NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_COMPILE_OPTIONS}) + set(LINK_OPTIONS ${NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_LINK_OPTIONS}) + set(COMBINED ${COMPILE_OPTIONS} ${LINK_OPTIONS}) + + set(NBL_OUTPUT_FILE "${CMAKE_BINARY_DIR}/.nbl/try-compile/${TEST_NAME}.output") # no hash in output diagnostic file, desired + + string(SHA1 OPTIONS_HASH "${COMBINED}") + string(APPEND TEST_NAME "_HASH_${OPTIONS_HASH}") + + set(FLAG_VAR ${TEST_NAME}) + set(CMAKE_REQUIRED_LINK_OPTIONS ${LINK_OPTIONS}) + string(REPLACE ";" " " CLI_COMPILE_OPTIONS "${COMPILE_OPTIONS}") + + if(NBL_COMPILER_LANGUAGE STREQUAL C) + check_c_compiler_flag("${CLI_COMPILE_OPTIONS}" "${FLAG_VAR}") + elseif(NBL_COMPILER_LANGUAGE STREQUAL CXX) + check_cxx_compiler_flag("${CLI_COMPILE_OPTIONS}" "${FLAG_VAR}") + endif() + + if(NOT ${FLAG_VAR}) + if(NOT "${NBL_COMPILER_FLAG_OUTPUT}" STREQUAL "") + file(WRITE "${NBL_OUTPUT_FILE}" "${NBL_COMPILER_FLAG_OUTPUT}") # lock into file, do not cache, must read from the file because of NBL_COMPILER_FLAG_OUTPUT availability (CMake module writes an output only once before a signature flag status is created) + endif() + + if(EXISTS "${NBL_OUTPUT_FILE}") + file(READ "${NBL_OUTPUT_FILE}" NBL_DIAGNOSTICS) + set(NBL_DIAGNOSTICS "Diagnostics:\n${NBL_DIAGNOSTICS}") + else() + set(NBL_DIAGNOSTICS) + endif() + + if(NOT DEFINED NBL_SKIP_BUILD_OPTIONS_VALIDATION) + message(FATAL_ERROR "${TEST_NAME} failed! To skip the validation define \"NBL_SKIP_BUILD_OPTIONS_VALIDATION\". 
${NBL_DIAGNOSTICS}") + endif() + endif() + endforeach() + endblock() +endforeach() function(NBL_EXT_P_APPEND_COMPILE_OPTIONS NBL_LIST_NAME MAP_RELEASE MAP_RELWITHDEBINFO MAP_DEBUG) macro(NBL_MAP_CONFIGURATION NBL_CONFIG_FROM NBL_CONFIG_TO) @@ -153,37 +281,34 @@ function(nbl_adjust_flags) # global compile options list(APPEND _D_NBL_COMPILE_OPTIONS_ ${NBL_COMPILE_OPTIONS}) - - # per configuration compile options with mapping - list(APPEND _D_NBL_COMPILE_OPTIONS_ $<$:${NBL_${NBL_MAP_DEBUG_ITEM_U}_COMPILE_OPTIONS}>) - list(APPEND _D_NBL_COMPILE_OPTIONS_ $<$:${NBL_${NBL_MAP_RELEASE_ITEM_U}_COMPILE_OPTIONS}>) - list(APPEND _D_NBL_COMPILE_OPTIONS_ $<$:${NBL_${NBL_MAP_RELWITHDEBINFO_ITEM_U}_COMPILE_OPTIONS}>) - - # configuration mapping properties - string(APPEND _D_NBL_CONFIGURATION_MAP_ $<$:${NBL_MAP_DEBUG_ITEM_U}>) - string(APPEND _D_NBL_CONFIGURATION_MAP_ $<$:${NBL_MAP_RELEASE_ITEM_U}>) - string(APPEND _D_NBL_CONFIGURATION_MAP_ $<$:${NBL_MAP_RELWITHDEBINFO_ITEM_U}>) + + foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES}) + string(TOUPPER "${CONFIG}" CONFIG_U) + + # per configuration options with mapping + foreach(WHAT COMPILE LINK) + list(APPEND _D_NBL_${WHAT}_OPTIONS_ $<$:${NBL_${NBL_MAP_${CONFIG_U}_ITEM_U}_${WHAT}_OPTIONS}>) + endforeach() + + # configuration mapping properties + string(APPEND _D_NBL_CONFIGURATION_MAP_ $<$:${NBL_MAP_${CONFIG_U}_ITEM_U}>) + endforeach() set_target_properties(${NBL_TARGET_ITEM} PROPERTIES NBL_CONFIGURATION_MAP "${_D_NBL_CONFIGURATION_MAP_}" COMPILE_OPTIONS "${_D_NBL_COMPILE_OPTIONS_}" + LINK_OPTIONS "${_D_NBL_LINK_OPTIONS_}" ) unset(_D_NBL_CONFIGURATION_MAP_) unset(_D_NBL_COMPILE_OPTIONS_) + unset(_D_NBL_LINK_OPTIONS_) set(MAPPED_CONFIG $>) - if(MSVC) - if(NBL_SANITIZE_ADDRESS) - set(NBL_TARGET_MSVC_DEBUG_INFORMATION_FORMAT "$<$,$>:ProgramDatabase>") - else() - set(NBL_TARGET_MSVC_DEBUG_INFORMATION_FORMAT "$<$:EditAndContinue>$<$:ProgramDatabase>") - endif() - endif() - set_target_properties(${NBL_TARGET_ITEM} PROPERTIES - MSVC_DEBUG_INFORMATION_FORMAT "${NBL_TARGET_MSVC_DEBUG_INFORMATION_FORMAT}" - ) + MSVC_DEBUG_INFORMATION_FORMAT $<$,$>:ProgramDatabase> # ignored on non xMSVC-ABI targets + ) + math(EXPR _NBL_ARG_I_ "${_NBL_ARG_I_} + 1") endwhile() else() # DIRECTORY mode diff --git a/cmake/adjust/template/vendor/CXX_Clang.cmake b/cmake/adjust/template/vendor/CXX_Clang.cmake new file mode 100644 index 0000000000..2cc877c028 --- /dev/null +++ b/cmake/adjust/template/vendor/CXX_Clang.cmake @@ -0,0 +1,5 @@ +include_guard(GLOBAL) + +set(LANG CXX) +include("${CMAKE_CURRENT_LIST_DIR}/impl/Clang.cmake") +# append unique CXX options here \ No newline at end of file diff --git a/cmake/adjust/template/vendor/CXX_MSVC.cmake b/cmake/adjust/template/vendor/CXX_MSVC.cmake new file mode 100644 index 0000000000..59f4e59cdd --- /dev/null +++ b/cmake/adjust/template/vendor/CXX_MSVC.cmake @@ -0,0 +1,5 @@ +include_guard(GLOBAL) + +set(LANG CXX) +include("${CMAKE_CURRENT_LIST_DIR}/impl/MSVC.cmake") +# append unique CXX options here \ No newline at end of file diff --git a/cmake/adjust/template/vendor/C_Clang.cmake b/cmake/adjust/template/vendor/C_Clang.cmake new file mode 100644 index 0000000000..046ccaa902 --- /dev/null +++ b/cmake/adjust/template/vendor/C_Clang.cmake @@ -0,0 +1,5 @@ +include_guard(GLOBAL) + +set(LANG C) +include("${CMAKE_CURRENT_LIST_DIR}/impl/Clang.cmake") +# append unique C options here \ No newline at end of file diff --git a/cmake/adjust/template/vendor/C_MSVC.cmake b/cmake/adjust/template/vendor/C_MSVC.cmake new file mode 100644 index 0000000000..f9aca4a5b7 --- 
/dev/null +++ b/cmake/adjust/template/vendor/C_MSVC.cmake @@ -0,0 +1,5 @@ +include_guard(GLOBAL) + +set(LANG C) +include("${CMAKE_CURRENT_LIST_DIR}/impl/MSVC.cmake") +# append unique C options here \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake new file mode 100644 index 0000000000..0b00294411 --- /dev/null +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -0,0 +1,109 @@ +include("${CMAKE_CURRENT_LIST_DIR}/reset.cmake") + +# vendor template with options fitting for both C and CXX LANGs + +if(NOT DEFINED LANG) + message(FATAL_ERROR "LANG must be defined!") +endif() + +if(NBL_WITH_COMPILER_CRASH_DIAGNOSTICS) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + # use it to make a repro and attach to an issue if you Clang crashes + # - it outputs preprocessed cpp files with sh script for compilation + -fcrash-diagnostics=compiler + -fcrash-diagnostics-dir=${NBL_ROOT_PATH_BINARY}/.crash-report + ) +endif() + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + -Xclang=-fconstexpr-backtrace-limit=696969 + -Xclang=-fconstexpr-depth=696969 + -Xclang=-fconstexpr-steps=696969 + -Xclang=-ftemplate-backtrace-limit=0 # no limit + -Xclang=-ftemplate-depth=696969 + -Xclang=-fmacro-backtrace-limit=0 # no limit + -Xclang=-fspell-checking-limit=0 # no limit + -Xclang=-fcaret-diagnostics-max-lines=0 # no limit + + # latest Clang(CL) 19.1.1 shipped with VS seems to require explicitly features to be listed (simdjson) + # TODO: Yas, we should first do independent check if host has the flags, if the request fail then + # do not promote simdjson to build with HASWELL implementation because those flags + avx2 compose + # subset it wants in this case + + ################ + # TODO: (****) -> + -mbmi # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mbmi + -mlzcnt # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mlzcnt + -mpclmul # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mpclmul + ################ <- + + -Wextra # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning + -maes # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-maes + -mfpmath=sse # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mfpmath + + # TODO: Yas, eliminate all below + -fno-strict-aliasing + -Wno-sequence-point + -Wno-c++98-compat + -Wno-c++98-compat-pedantic + -Wno-padded + -Wno-unsafe-buffer-usage + -Wno-switch-enum + -Wno-error=ignored-attributes + -Wno-unused-parameter + -Wno-unused-but-set-parameter + -Wno-error=unused-function + -Wno-error=unused-variable + -Wno-error=unused-parameter + -Wno-error=ignored-attributes + -Wno-error=non-pod-varargs +) + +if(NBL_REQUEST_SSE_4_2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + -msse4.2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang1-msse4.2 +) # TODO: (****) optional but then adjust 3rdparty options on fail +endif() + +if(CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES MSVC) + # ClangCL with MSVC frontend (most of the options are compatible but eg /arch:SSE4.2 seems to be not) + include("${CMAKE_CURRENT_LIST_DIR}/frontend/MSVC.cmake") + return() +else() + if(NBL_REQUEST_SSE_AVX2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + -mavx2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mavx2 + ) # TODO: (****) + 
endif() + + if(NBL_SANITIZE_ADDRESS) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS -fsanitize=address) + endif() + + if(NBL_SANITIZE_THREAD) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS -fsanitize=thread) + endif() + + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS + -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g + -mincremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + -Wall # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning + -gline-tables-only # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-gline-tables-only + -Xclang=-fno-inline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + ) + + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE COMPILE_OPTIONS + -O2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg + -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + -DNDEBUG + ) + + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OPTIONS + -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g + -O1 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg + -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + -DNDEBUG + ) +endif() \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/MSVC.cmake b/cmake/adjust/template/vendor/impl/MSVC.cmake new file mode 100644 index 0000000000..803adb1754 --- /dev/null +++ b/cmake/adjust/template/vendor/impl/MSVC.cmake @@ -0,0 +1,10 @@ +include("${CMAKE_CURRENT_LIST_DIR}/reset.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/frontend/MSVC.cmake") + +# vendor template with options fitting for both C and CXX LANGs + +if(NBL_REQUEST_SSE_4_2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + /arch:SSE4.2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 +) # TODO: (****) should be (?) optional but then adjust 3rdparty options on fail +endif() \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/frontend/MSVC.cmake b/cmake/adjust/template/vendor/impl/frontend/MSVC.cmake new file mode 100644 index 0000000000..06ab606104 --- /dev/null +++ b/cmake/adjust/template/vendor/impl/frontend/MSVC.cmake @@ -0,0 +1,68 @@ +# https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_FRONTEND_VARIANT.html#variable:CMAKE_%3CLANG%3E_COMPILER_FRONTEND_VARIANT +# vendor frontend template with options fitting for both C and CXX LANGs + +if(NOT DEFINED LANG) + message(FATAL_ERROR "LANG must be defined!") +endif() + +if(NBL_REQUEST_SSE_AVX2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + /arch:AVX2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 +) # TODO: (****) should be (?) 
optional but then adjust 3rdparty options on fail +endif() + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + /Zc:preprocessor # https://learn.microsoft.com/en-us/cpp/build/reference/zc-preprocessor?view=msvc-170 + /Zc:__cplusplus # https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus?view=msvc-170 + /Zc:wchar_t # https://learn.microsoft.com/en-us/cpp/build/reference/zc-wchar-t-wchar-t-is-native-type?view=msvc-170 + /fp:fast # https://learn.microsoft.com/en-us/cpp/build/reference/fp-specify-floating-point-behavior?view=msvc-170 + /MP${_NBL_JOBS_AMOUNT_} # https://learn.microsoft.com/en-us/cpp/build/reference/mp-build-with-multiple-processes?view=msvc-170 +) + +if(NBL_SANITIZE_ADDRESS) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + /fsanitize=address # https://learn.microsoft.com/en-us/cpp/build/reference/fsanitize?view=msvc-170 + ) + + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS + /RTC1 # https://learn.microsoft.com/en-us/cpp/build/reference/rtc-run-time-error-checks?view=msvc-170 + ) +endif() + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS + /Ob0 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /Od # https://learn.microsoft.com/en-us/cpp/build/reference/od-disable-debug?view=msvc-170 + /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 +) + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE COMPILE_OPTIONS + /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 + /Ob2 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 + /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 + /Gy- # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 + /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 + /GF # https://learn.microsoft.com/en-us/cpp/build/reference/gf-eliminate-duplicate-strings?view=msvc-170 + /GS- # https://learn.microsoft.com/en-us/cpp/build/reference/gs-buffer-security-check?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 + /LTCG # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 (note: /GL implies fallback with LTCG) +) + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OPTIONS + /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 + /Ob1 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 + /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 + /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 + /Gy # 
https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 + /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 (note: cannot use /INCREMENTAL with /LTCG:incremental, would cause fallback) + /LTCG:incremental # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 +) \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/reset.cmake b/cmake/adjust/template/vendor/impl/reset.cmake new file mode 100644 index 0000000000..fc1230f326 --- /dev/null +++ b/cmake/adjust/template/vendor/impl/reset.cmake @@ -0,0 +1,10 @@ +# init profiles vars by resetting required lists + +foreach(LANG CXX C) + foreach(WHAT COMPILE LINK DEFINITIONS) + set(NBL_${LANG}_${WHAT}_OPTIONS "") + foreach(CONFIG RELEASE RELWITHDEBINFO DEBUG) + set(NBL_${LANG}_${CONFIG}_${WHAT}_OPTIONS "") + endforeach() + endforeach() +endforeach() \ No newline at end of file diff --git a/cmake/adjust/template/windows/msvc.cmake b/cmake/adjust/template/windows/msvc.cmake deleted file mode 100644 index 0f9fe365ee..0000000000 --- a/cmake/adjust/template/windows/msvc.cmake +++ /dev/null @@ -1,75 +0,0 @@ -include_guard(GLOBAL) - -# https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 - -# The default instruction set is SSE2 if no /arch option is specified. -if(NBL_REQUEST_SSE_4_2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT("/arch:SSE4.2") -endif() - -# Enables Intel Advanced Vector Extensions 2. -if(NBL_REQUEST_SSE_AXV2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT("/arch:AVX2") -endif() - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(/Zc:preprocessor) - -# Debug -set(NBL_C_DEBUG_COMPILE_OPTIONS - /Ob0 /Od /MP${_NBL_JOBS_AMOUNT_} /fp:fast /Zc:wchar_t /INCREMENTAL -) - -if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_C_DEBUG_COMPILE_OPTIONS /RTC1) -endif() - -set(NBL_CXX_DEBUG_COMPILE_OPTIONS - /Zc:__cplusplus ${NBL_C_DEBUG_COMPILE_OPTIONS} -) - -set(NBL_DEBUG_COMPILE_OPTIONS - $<$:${NBL_CXX_DEBUG_COMPILE_OPTIONS}> - $<$:${NBL_C_DEBUG_COMPILE_OPTIONS}> -) - -# Release -set(NBL_C_RELEASE_COMPILE_OPTIONS - /O2 /Ob2 /DNDEBUG /GL /MP${_NBL_JOBS_AMOUNT_} /Gy- /Zc:wchar_t /sdl- /GF /GS- /fp:fast -) -set(NBL_CXX_RELEASE_COMPILE_OPTIONS - /Zc:__cplusplus ${NBL_C_RELEASE_COMPILE_OPTIONS} -) - -set(NBL_RELEASE_COMPILE_OPTIONS - $<$:${NBL_CXX_RELEASE_COMPILE_OPTIONS}> - $<$:${NBL_C_RELEASE_COMPILE_OPTIONS}> -) - -# RelWithDebInfo -set(NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS - /O2 /Ob1 /DNDEBUG /GL /Zc:wchar_t /MP${_NBL_JOBS_AMOUNT_} /Gy /sdl- /Oy- /fp:fast -) -set(NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS - /Zc:__cplusplus ${NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS} -) - -set(NBL_RELWITHDEBINFO_COMPILE_OPTIONS - $<$:${NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS}> - $<$:${NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS}> -) - -if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_C_COMPILE_OPTIONS /fsanitize=address) - list(APPEND NBL_CXX_COMPILE_OPTIONS ${NBL_C_COMPILE_OPTIONS}) -endif() - -set(NBL_COMPILE_OPTIONS - $<$:${NBL_CXX_COMPILE_OPTIONS}> - $<$:${NBL_C_COMPILE_OPTIONS}> -) - -# this should also be not part of profile, pasting from old flags-set function temporary -# TODO: use profile - -#reason for INCREMENTAL:NO: https://docs.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=vs-2019 /LTCG is not valid for use with /INCREMENTAL. 
-set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO} /INCREMENTAL:NO /LTCG:incremental") \ No newline at end of file diff --git a/cmake/common.cmake b/cmake/common.cmake index f0d3e27f36..69a0a5b980 100755 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -25,7 +25,7 @@ function(nbl_handle_dll_definitions _TARGET_ _SCOPE_) message(FATAL_ERROR "Internal error, requsted \"${_TARGET_}\" is not defined!") endif() - if(NBL_DYNAMIC_MSVC_RUNTIME) + if(NBL_COMPILER_DYNAMIC_RUNTIME) set(_NABLA_OUTPUT_DIR_ "${NBL_ROOT_PATH_BINARY}/src/nbl/$/devshgraphicsprogramming.nabla") target_compile_definitions(${_TARGET_} ${_SCOPE_} @@ -43,11 +43,7 @@ function(nbl_handle_runtime_lib_properties _TARGET_) message(FATAL_ERROR "Internal error, requsted \"${_TARGET_}\" is not defined!") endif() - if(NBL_DYNAMIC_MSVC_RUNTIME) - set_target_properties(${_TARGET_} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") - else() - set_target_properties(${_TARGET_} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") - endif() + set_target_properties(${_TARGET_} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>$<$:DLL>") endfunction() # Macro creating project for an executable @@ -73,14 +69,6 @@ macro(nbl_create_executable_project _EXTRA_SOURCES _EXTRA_OPTIONS _EXTRA_INCLUDE add_executable(${EXECUTABLE_NAME} ${NBL_EXECUTABLE_SOURCES}) nbl_handle_runtime_lib_properties(${EXECUTABLE_NAME}) - - if(WIN32 AND MSVC) - if(NBL_DYNAMIC_MSVC_RUNTIME) - target_link_options(${EXECUTABLE_NAME} PUBLIC "/DELAYLOAD:$") - endif() - - target_link_options(${EXECUTABLE_NAME} PUBLIC "/DELAYLOAD:dxcompiler.dll") - endif() endif() nbl_handle_dll_definitions(${EXECUTABLE_NAME} PUBLIC) diff --git a/cmake/submodules/update.cmake b/cmake/submodules/update.cmake index 76e3603980..412cdf04e0 100644 --- a/cmake/submodules/update.cmake +++ b/cmake/submodules/update.cmake @@ -1,208 +1,91 @@ -include(ProcessorCount) find_package(Git REQUIRED) -option(NBL_UPDATE_GIT_SUBMODULE "Turn this ON to let CMake update all public submodules for you" ON) -option(NBL_FORCE_ON_UPDATE_GIT_SUBMODULE "Submodules will be updated with --force flag if NBL_FORCE_UPDATE_GIT_SUBMODULE is turned ON, use with caution - if there are any uncommited files in submodules' working tree they will be removed!" OFF) -option(NBL_SYNC_ON_UPDATE_GIT_SUBMODULE "Sync initialized submodule paths if NBL_FORCE_UPDATE_GIT_SUBMODULE is turned ON, this is useful when any submodule remote path got modified and you want to apply this modification to your local repository. Turning NBL_FORCE_ON_UPDATE_GIT_SUBMODULE implies this option" OFF) -option(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE "Turn this ON to attempt to update private Nabla submodules" OFF) -option(NBL_UPDATE_GIT_SUBMODULE_NO_SEPARATE_SHELL "Turn this ON to prevent CMake from executing git submodules update or sync in a separate shell - be aware that the interaction with shell will be impossible in case of paraphrase prompt request of your key!" 
ON) -option(NBL_CI_GIT_SUBMODULES_SHALLOW "" OFF) +option(NBL_UPDATE_GIT_SUBMODULE "Turn ON to update submodules, only public by default" ON) +option(NBL_FORCE_ON_UPDATE_GIT_SUBMODULE "NBL_UPDATE_GIT_SUBMODULE logic with --force flag" OFF) +option(NBL_SYNC_ON_UPDATE_GIT_SUBMODULE "Sync submodule URLs" OFF) +option(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE "NBL_UPDATE_GIT_SUBMODULE logic but includes private submodules, for Nabla devs" OFF) +option(NBL_SUBMODULES_SHALLOW "NBL_UPDATE_GIT_SUBMODULE logic with --depth=1" OFF) -if(NOT DEFINED NBL_ROOT_PATH) +if(NBL_UPDATE_GIT_SUBMODULE) +block() get_filename_component(NBL_ROOT_PATH "${CMAKE_CURRENT_LIST_DIR}/../../" ABSOLUTE) -endif() - -if(NOT DEFINED THIRD_PARTY_SOURCE_DIR) set(THIRD_PARTY_SOURCE_DIR "${NBL_ROOT_PATH}/3rdparty") -endif() -if(NOT DEFINED NBL_ROOT_PATH_BINARY) - set(NBL_ROOT_PATH_BINARY "${NBL_ROOT_PATH}/build/.submodules") -endif() + if(NOT DEFINED NBL_ROOT_PATH_BINARY) + set(NBL_ROOT_PATH_BINARY "${NBL_ROOT_PATH}/build/.submodules") + endif() -if(NOT DEFINED NBL_BUILD_EXAMPLES) - set(NBL_BUILD_EXAMPLES ON) -endif() + if(NOT DEFINED NBL_BUILD_EXAMPLES) + set(NBL_BUILD_EXAMPLES ON) + endif() -function(NBL_UPDATE_SUBMODULES) - ProcessorCount(_GIT_SUBMODULES_JOBS_AMOUNT_) - - if(NBL_CI_GIT_SUBMODULES_SHALLOW) - set(NBL_SHALLOW "--depth=1") - else() - set(NBL_SHALLOW "") + # we force HTTPS traffic for all *public* submodules we update from CMake + # NOTE: it *doesn't* rewrite destination URLs after checkout, if you eg. + # clone with SSH you end up with it anyway, this way your private key + # is never involved during CMake configuration, unless you + # use NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE + + # Private refs (*), exclude from public update + list(APPEND NBL_CONFIG_SUBMODULE -c submodule.\"Ditt-Reference-Scenes\".update=none) + + unset(NBL_UPDATE_OPTIONS) + + if(NBL_SUBMODULES_SHALLOW) + list(APPEND NBL_UPDATE_OPTIONS --depth=1) endif() - + if(NBL_FORCE_ON_UPDATE_GIT_SUBMODULE) - set(NBL_FORCE "--force") - else() - set(NBL_FORCE "") + list(APPEND NBL_UPDATE_OPTIONS --force) + endif() + + if(NOT NBL_BUILD_EXAMPLES) + list(APPEND NBL_CONFIG_SUBMODULE -c submodule.\"examples_tests\".update=none) endif() - macro(NBL_WRAPPER_COMMAND_EXCLUSIVE GIT_RELATIVE_ENTRY GIT_SUBMODULE_PATH SHOULD_RECURSIVE EXCLUDE_SUBMODULE_PATHS) - set(EXCLUDE_SUBMODULE_PATHS ${EXCLUDE_SUBMODULE_PATHS}) - set(SHOULD_RECURSIVE ${SHOULD_RECURSIVE}) - - if("${EXCLUDE_SUBMODULE_PATHS}" STREQUAL "") - set(NBL_EXCLUDE "") - else() - foreach(EXCLUDE_SUBMODULE_PATH ${EXCLUDE_SUBMODULE_PATHS}) - string(APPEND NBL_EXCLUDE "-c submodule.\"${EXCLUDE_SUBMODULE_PATH}\".update=none ") - endforeach() - - string(STRIP "${NBL_EXCLUDE}" NBL_EXCLUDE) - endif() - - if(SHOULD_RECURSIVE) - set(_NBL_EXECUTE_COMMAND_ "\"${GIT_EXECUTABLE}\" -C \"${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}\" ${NBL_EXCLUDE} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} ${NBL_FORCE} --recursive ${NBL_SHALLOW} ${GIT_SUBMODULE_PATH}") - else() - set(_NBL_EXECUTE_COMMAND_ "\"${GIT_EXECUTABLE}\" -C \"${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}\" ${NBL_EXCLUDE} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} ${NBL_FORCE} ${NBL_SHALLOW} ${GIT_SUBMODULE_PATH}") - endif() - - string(APPEND _NBL_UPDATE_SUBMODULES_COMMANDS_ "${_NBL_EXECUTE_COMMAND_}\n") - - unset(NBL_EXCLUDE) + macro(NBL_GIT_COMMAND) + execute_process(COMMAND "${GIT_EXECUTABLE}" ${ARGV}) endmacro() - - set(_NBL_UPDATE_SUBMODULES_CMD_NAME_ "nbl-update-submodules") - set(_NBL_UPDATE_SUBMODULES_CMD_FILE_ 
"${NBL_ROOT_PATH_BINARY}/${_NBL_UPDATE_SUBMODULES_CMD_NAME_}.cmd") - get_filename_component(_NBL_UPDATE_IMPL_CMAKE_FILE_ "${NBL_ROOT_PATH_BINARY}/${_NBL_UPDATE_SUBMODULES_CMD_NAME_}.cmake" ABSOLUTE) - - # Proxy script for inclusive submodule updating - string(APPEND NBL_IMPL_SCRIPT "set(NBL_ROOT_PATH \"${NBL_ROOT_PATH}\")\nset(_GIT_SUBMODULES_JOBS_AMOUNT_ ${_GIT_SUBMODULES_JOBS_AMOUNT_})\nset(GIT_EXECUTABLE \"${GIT_EXECUTABLE}\")\nset(NBL_SHALLOW \"${NBL_SHALLOW}\")\nset(NBL_FORCE \"${NBL_FORCE}\")\n\n") - string(APPEND NBL_IMPL_SCRIPT -[=[ -if(NOT DEFINED GIT_RELATIVE_ENTRY) - message(FATAL_ERROR "GIT_RELATIVE_ENTRY must be defined to use this script!") -endif() -if(NOT DEFINED INCLUDE_SUBMODULE_PATHS) - message(FATAL_ERROR "INCLUDE_SUBMODULE_PATHS must be defined to use this script!") -endif() + if(NBL_SYNC_ON_UPDATE_GIT_SUBMODULE) + message(STATUS "Syncing Public submodules") + NBL_GIT_COMMAND(${NBL_CONFIG_SUBMODULE} submodule sync --recursive WORKING_DIRECTORY "${NBL_ROOT_PATH}") + endif() + + message(STATUS "Updating Public submodules") + NBL_GIT_COMMAND(-c fetch.parallel=0 -c url.https://github.com/.insteadOf=git@github.com: ${NBL_CONFIG_SUBMODULE} submodule update --init --recursive ${NBL_UPDATE_OPTIONS} WORKING_DIRECTORY "${NBL_ROOT_PATH}") -# update an inclusive submodule first -execute_process(COMMAND "${GIT_EXECUTABLE}" -C "${NBL_ROOT_PATH}" submodule update --init "${GIT_RELATIVE_ENTRY}") + if(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE) + # NOTE: your git must be installed with default Git Bash as shell + # otherwise it *may* fail, whether it works depends on your agent setup -if("${INCLUDE_SUBMODULE_PATHS}" STREQUAL "") - set(NBL_SUBMODULE_UPDATE_CONFIG_ENTRY "") -else() - execute_process(COMMAND "${GIT_EXECUTABLE}" -C "${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}" config --file .gitmodules --get-regexp path - OUTPUT_VARIABLE NBL_OUTPUT_VARIABLE - ) + find_package(GitBash REQUIRED) - string(REGEX REPLACE "\n" ";" NBL_SUBMODULE_CONFIG_LIST "${NBL_OUTPUT_VARIABLE}") - - foreach(NBL_SUBMODULE_NAME ${NBL_SUBMODULE_CONFIG_LIST}) - string(REGEX MATCH "submodule\\.(.*)\\.path" NBL_SUBMODULE_NAME "${NBL_SUBMODULE_NAME}") - list(APPEND NBL_ALL_SUBMODULES "${CMAKE_MATCH_1}") - endforeach() - - foreach(NBL_SUBMODULE_NAME ${NBL_ALL_SUBMODULES}) - list(FIND INCLUDE_SUBMODULE_PATHS "${NBL_SUBMODULE_NAME}" NBL_FOUND) - - if("${NBL_FOUND}" STREQUAL "-1") - list(APPEND NBL_CONFIG_SETUP_CMD "-c;submodule.${NBL_SUBMODULE_NAME}.update=none") # filter submodules - only those on the INCLUDE_SUBMODULE_PATHS list will be updated when recursive update is requested, all left will be skipped - endif() - endforeach() -endif() - -execute_process(COMMAND "${GIT_EXECUTABLE}" ${NBL_CONFIG_SETUP_CMD} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} --recursive ${NBL_SHALLOW} ${NBL_FORCE} - WORKING_DIRECTORY "${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}" -) -]=] -) - file(WRITE "${_NBL_UPDATE_IMPL_CMAKE_FILE_}" "${NBL_IMPL_SCRIPT}") - - macro(NBL_WRAPPER_COMMAND_INCLUSIVE GIT_RELATIVE_ENTRY INCLUDE_SUBMODULE_PATHS) - string(APPEND _NBL_UPDATE_SUBMODULES_COMMANDS_ "\"${CMAKE_COMMAND}\" \"-DGIT_RELATIVE_ENTRY=${GIT_RELATIVE_ENTRY}\" \"-DINCLUDE_SUBMODULE_PATHS=${INCLUDE_SUBMODULE_PATHS}\" -P \"${_NBL_UPDATE_IMPL_CMAKE_FILE_}\"\n") - endmacro() - - if(NBL_UPDATE_GIT_SUBMODULE) - execute_process(COMMAND ${CMAKE_COMMAND} -E echo "All submodules are about to get updated and initialized in repository because NBL_UPDATE_GIT_SUBMODULE is turned ON!") - - include("${THIRD_PARTY_SOURCE_DIR}/boost/dep/wave.cmake") - - 
macro(NBL_IMPL_INIT_COMMON_SUBMODULES) - # 3rdparty except boost & gltf - set(NBL_3RDPARTY_MODULES_TO_SKIP - 3rdparty/boost/superproject # a lot of submodules we don't use - 3rdparty/glTFSampleModels # more then 2GB waste of space (disk + .gitmodules data) - ) - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./3rdparty TRUE "${NBL_3RDPARTY_MODULES_TO_SKIP}") - - # boost's 3rdparties, special case - set(NBL_BOOST_LIBS_TO_INIT ${NBL_BOOST_LIBS} wave numeric_conversion) # wave and all of its deps, numeric_conversion is nested in conversion submodule (for some reason boostdep tool doesn't output it properly) - foreach(NBL_TARGET ${NBL_BOOST_LIBS_TO_INIT}) - list(APPEND NBL_BOOST_SUBMODULES_TO_INIT ${NBL_TARGET}) - endforeach() - NBL_WRAPPER_COMMAND_INCLUSIVE(3rdparty/boost/superproject "${NBL_BOOST_SUBMODULES_TO_INIT}") - - # tests - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./tests FALSE "") - - # docker - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./docker FALSE "") + macro(NBL_GIT_BASH_COMMAND) + execute_process(COMMAND "${GIT_BASH_EXECUTABLE}" "-c" ${ARGV}) endmacro() - - NBL_IMPL_INIT_COMMON_SUBMODULES() - - if(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE) - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./examples_tests TRUE "") - else() - # NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./ci TRUE "") TODO: enable it once we merge Ditt, etc - - # examples and their media - if(NBL_BUILD_EXAMPLES) - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./examples_tests FALSE "") - NBL_WRAPPER_COMMAND_EXCLUSIVE(examples_tests ./media FALSE "") - endif() - endif() - - file(WRITE "${_NBL_UPDATE_SUBMODULES_CMD_FILE_}" "${_NBL_UPDATE_SUBMODULES_COMMANDS_}") - - if(WIN32) - if(NBL_UPDATE_GIT_SUBMODULE_NO_SEPARATE_SHELL) - set(UPDATE_COMMAND - nbl-update-submodules.cmd - ) - - execute_process(COMMAND ${UPDATE_COMMAND} - WORKING_DIRECTORY "${NBL_ROOT_PATH_BINARY}" - RESULT_VARIABLE _NBL_TMP_RET_CODE_ - ) - else() - find_package(GitBash REQUIRED) - - execute_process(COMMAND "${GIT_BASH_EXECUTABLE}" "-c" + + message(STATUS "Updating Private submodules") + string(REPLACE ";" " " NBL_UPDATE_OPTIONS "${NBL_UPDATE_OPTIONS}") + set(LOG_FILE "${NBL_ROOT_PATH_BINARY}/nbl-update-private-submodules.log") + set(BASH_CMD [=[ >&2 echo "" clear -./nbl-update-submodules.cmd 2>&1 | tee nbl-update-submodules.log -sleep 1 +{ + echo "=== $(date) :: Starting private submodule update ===" + git -c submodule.Ditt-Reference-Scenes.update=checkout -C @NBL_ROOT_PATH@/examples_tests/media submodule update --init Ditt-Reference-Scenes @NBL_UPDATE_OPTIONS@ + # more private submodule here + + echo "=== $(date) :: Created @LOG_FILE@ in your build directory. ===" + echo "=== $(date) :: Finished private submodule update ===" +} 2>&1 | tee @LOG_FILE@ clear -tput setaf 2; echo -e "Submodules have been updated! -Created nbl-update-submodules.log in your build directory." 
]=] - WORKING_DIRECTORY ${NBL_ROOT_PATH_BINARY} - OUTPUT_VARIABLE _NBL_TMP_OUTPUT_ - RESULT_VARIABLE _NBL_TMP_RET_CODE_ - OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_STRIP_TRAILING_WHITESPACE - ) - - unset(_NBL_TMP_OUTPUT_) - unset(_NBL_TMP_RET_CODE_) - - message(STATUS "Generated \"${NBL_ROOT_PATH_BINARY}/nbl-update-submodules.log\"") - endif() - - message(STATUS "Submodules have been updated!") - else() - execute_process(COMMAND "${_NBL_UPDATE_SUBMODULES_CMD_FILE_}") - endif() - else() - execute_process(COMMAND ${CMAKE_COMMAND} -E echo "NBL_UPDATE_GIT_SUBMODULE is turned OFF therefore submodules won't get updated.") + ) + string(CONFIGURE "${BASH_CMD}" BASH_CMD) + NBL_GIT_BASH_COMMAND("${BASH_CMD}" OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_STRIP_TRAILING_WHITESPACE RESULT_VARIABLE RES) + file(READ "${LOG_FILE}" LOG_CONTENT) + message(STATUS "${LOG_CONTENT}") endif() -endfunction() - -NBL_UPDATE_SUBMODULES() \ No newline at end of file +endblock() +endif() \ No newline at end of file diff --git a/compose.yml b/compose.yml new file mode 100644 index 0000000000..c80bdb4319 --- /dev/null +++ b/compose.yml @@ -0,0 +1,22 @@ +services: + nsc: + container_name: nsc-godbolt + image: ghcr.io/devsh-graphics-programming/nabla-shader-compiler-godbolt:latest + isolation: process + ports: + - "80:10240" + volumes: + - type: bind + source: C:\Windows\Globalization\ICU + target: C:\Windows\Globalization\ICU + read_only: true + - type: bind + source: C:\Windows\System32 + target: C:\mount\Windows\System32 + read_only: true + restart: always + +networks: + default: + external: true + name: docker_default diff --git a/docker/.env b/docker/.env deleted file mode 100644 index 623184f422..0000000000 --- a/docker/.env +++ /dev/null @@ -1,2 +0,0 @@ -THIS_PROJECT_WORKING_DIRECTORY=C:\docker -THIS_PROJECT_NABLA_DIRECTORY=C:/Users/ContainerAdministrator/Nabla/bind \ No newline at end of file diff --git a/docker/ci-windows.env b/docker/ci-windows.env new file mode 100644 index 0000000000..ea89ce43c7 --- /dev/null +++ b/docker/ci-windows.env @@ -0,0 +1,2 @@ +NBL_CI_MODE=ON +NBL_CI_BUILD_DIRECTORY=C:\mount\nabla\build-ct \ No newline at end of file diff --git a/docker/compiler-explorer b/docker/compiler-explorer index e7d3e6ce85..45866dfa87 160000 --- a/docker/compiler-explorer +++ b/docker/compiler-explorer @@ -1 +1 @@ -Subproject commit e7d3e6ce85d4b87bd9afadc5b2ba8c268ccbeb51 +Subproject commit 45866dfa8782404fc121f25ce15ad0626b474db0 diff --git a/docker/msvc-winsdk b/docker/msvc-winsdk new file mode 160000 index 0000000000..d91a96faed --- /dev/null +++ b/docker/msvc-winsdk @@ -0,0 +1 @@ +Subproject commit d91a96faede2933ec02a18b94141fbed549929c0 diff --git a/docker/ninja.env b/docker/ninja.env new file mode 100644 index 0000000000..6d52cbd701 --- /dev/null +++ b/docker/ninja.env @@ -0,0 +1 @@ +NINJA_STATUS=[%r jobs, %f/%t edges, %oe/s, elapsed %ws]: \ No newline at end of file diff --git a/examples_tests b/examples_tests index 8c76367c1c..95d8f78465 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8c76367c1c226cce3d66f1c60f540e29a501a1cb +Subproject commit 95d8f78465e100bb3a926cea412c21891c800b9d diff --git a/include/nbl/asset/IAccelerationStructure.h b/include/nbl/asset/IAccelerationStructure.h index d251dd3077..829d10bcd8 100644 --- a/include/nbl/asset/IAccelerationStructure.h +++ b/include/nbl/asset/IAccelerationStructure.h @@ -59,11 +59,11 @@ class IBottomLevelAccelerationStructure : public IAccelerationStructure // build flags, we don't expose flags that don't make sense for certain 
levels enum class BUILD_FLAGS : uint16_t { - ALLOW_UPDATE_BIT = base_build_flags_t::ALLOW_UPDATE_BIT, - ALLOW_COMPACTION_BIT = base_build_flags_t::ALLOW_COMPACTION_BIT, - PREFER_FAST_TRACE_BIT = base_build_flags_t::PREFER_FAST_TRACE_BIT, - PREFER_FAST_BUILD_BIT = base_build_flags_t::PREFER_FAST_BUILD_BIT, - LOW_MEMORY_BIT = base_build_flags_t::LOW_MEMORY_BIT, + ALLOW_UPDATE_BIT = static_cast(base_build_flags_t::ALLOW_UPDATE_BIT), + ALLOW_COMPACTION_BIT = static_cast(base_build_flags_t::ALLOW_COMPACTION_BIT), + PREFER_FAST_TRACE_BIT = static_cast(base_build_flags_t::PREFER_FAST_TRACE_BIT), + PREFER_FAST_BUILD_BIT = static_cast(base_build_flags_t::PREFER_FAST_BUILD_BIT), + LOW_MEMORY_BIT = static_cast(base_build_flags_t::LOW_MEMORY_BIT), // Synthetic flag we use to indicate that the build data are AABBs instead of triangles, we've taken away the per-geometry choice thanks to: // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureBuildGeometryInfoKHR-type-03792 GEOMETRY_TYPE_IS_AABB_BIT = 0x1u<<5u, @@ -88,42 +88,62 @@ class IBottomLevelAccelerationStructure : public IAccelerationStructure NO_DUPLICATE_ANY_HIT_INVOCATION_BIT = 0x1u<<1u, }; + enum class GeometryType : uint8_t + { + Triangles = 0, + AABBs = 1, + // Later: LSS and friends + Count = 2 + }; + // Note that in Vulkan strides are 64-bit value but restricted to be 32-bit in range - template requires std::is_base_of_v + template requires (!std::is_const_v && std::is_base_of_v) struct Triangles { - using buffer_t = std::remove_const_t; - constexpr static inline bool Host = std::is_same_v; - // we make our life easier by not taking pointers to single matrix values - using transform_t = std::conditional_t>; - - inline bool hasTransform() const - { - if constexpr (Host) - return !core::isnan(transform[0][0]); - else - return bool(transform.buffer); - } - - // optional, only useful for baking model transforms of multiple meshes into one BLAS - transform_t transform = {}; - // vertexData[1] are the vertex positions at time 1.0, and only used for AccelerationStructures created with `MOTION_BIT` - asset::SBufferBinding vertexData[2] = {{},{}}; - asset::SBufferBinding indexData = {}; - uint32_t maxVertex = 0u; - // type implicitly satisfies: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureGeometryTrianglesDataKHR-vertexStride-03819 - uint32_t vertexStride = sizeof(float); - E_FORMAT vertexFormat = EF_R32G32B32_SFLOAT; - E_INDEX_TYPE indexType = EIT_UNKNOWN; - core::bitflag geometryFlags = GEOMETRY_FLAGS::NONE; - // TODO: opacity and displacement micromap buffers and shizz + public: + using buffer_t = BufferType; + constexpr static inline GeometryType Type = GeometryType::Triangles; + + constexpr static inline bool HostTransform = std::is_same_v; + // we make our life easier by not taking pointers to single matrix values + using transform_t = std::conditional_t>; + + inline bool hasTransform() const + { + if constexpr (HostTransform) + return !core::isnan(transform[0][0]); + else + return bool(transform.buffer); + } + + // optional, only useful for baking model transforms of multiple meshes into one BLAS + transform_t transform = __transform_initializer(); + // vertexData[1] are the vertex positions at time 1.0, and only used for AccelerationStructures created with `MOTION_BIT` + asset::SBufferBinding vertexData[2] = {{},{}}; + asset::SBufferBinding indexData = {}; + uint32_t maxVertex = 0u; + // type implicitly satisfies: 
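// Illustrative aside, not part of the patch: the static_casts introduced above are required because
// enumerators of a scoped enum cannot be initialized directly from another enum class' values;
// the underlying integer has to be produced explicitly. A minimal standalone reproduction:
#include <cstdint>
enum class base_flags_t : uint16_t { ALLOW_UPDATE_BIT = 0x1u<<0u };
enum class derived_flags_t : uint16_t
{
    // ALLOW_UPDATE_BIT = base_flags_t::ALLOW_UPDATE_BIT,                     // ill-formed: no implicit conversion
    ALLOW_UPDATE_BIT = static_cast<uint16_t>(base_flags_t::ALLOW_UPDATE_BIT), // OK
};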
https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureGeometryTrianglesDataKHR-vertexStride-03819 + uint32_t vertexStride = sizeof(float); + E_FORMAT vertexFormat = EF_R32G32B32_SFLOAT; + E_INDEX_TYPE indexType = EIT_UNKNOWN; + core::bitflag geometryFlags = GEOMETRY_FLAGS::NONE; + // TODO: opacity and displacement micromap buffers and shizz + + private: + constexpr static transform_t __transform_initializer() + { + if constexpr (HostTransform) + return hlsl::float32_t3x4(std::numeric_limits::quiet_NaN()); + return {}; + } }; // - template requires std::is_base_of_v + template requires (!std::is_const_v && std::is_base_of_v) struct AABBs { - using buffer_t = std::remove_const_t; + using buffer_t = BufferType; + constexpr static inline GeometryType Type = GeometryType::AABBs; // for `MOTION_BIT` you don't get a second buffer for AABBs at different times because linear interpolation of AABBs doesn't work asset::SBufferBinding data = {}; diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index fdb41ed298..aae73fac2a 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -94,6 +94,7 @@ class IAsset : virtual public core::IReferenceCounted ET_COMPUTE_PIPELINE = 1ull<<20, //!< asset::ICPUComputePipeline ET_PIPELINE_CACHE = 1ull<<21, //!< asset::ICPUPipelineCache ET_SCENE = 1ull<<22, //!< reserved, to implement later + ET_RAYTRACING_PIPELINE = 1ull << 23, //!< asset::ICPURayTracingPipeline ET_IMPLEMENTATION_SPECIFIC_METADATA = 1ull<<31u, //!< lights, etc. //! Reserved special value used for things like terminating lists of this enum @@ -155,30 +156,37 @@ class IAsset : virtual public core::IReferenceCounted //! inline bool isMutable() const {return m_mutable;} - //! - virtual size_t getDependantCount() const = 0; - inline IAsset* getDependant(const size_t ix) - { - if (ix(this)->getDependant(ix); - return retval; - } + inline void visitDependents(std::function visit) const + { + visitDependents_impl([&visit](const IAsset* dep)->bool + { + if (dep) + return visit(dep); + return true; + }); + } + + inline void visitDependents(std::function visit) + { + assert(isMutable()); + visitDependents([&](const IAsset* dependent) -> bool + { + return visit(const_cast(dependent)); + }); + } + + virtual bool valid() const = 0; protected: inline IAsset() = default; //! 
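// Illustrative sketch, not part of the patch: filling a host-side triangle geometry. The exact
// instantiation (Triangles<ICPUBuffer>) and the float32_t3x4 transform type are assumptions made
// here, since the template arguments are elided in the hunk above.
#include <cassert>
#include <cstdint>
using HostTriangles = nbl::asset::ICPUBottomLevelAccelerationStructure::Triangles<nbl::asset::ICPUBuffer>;

inline HostTriangles makeHostTriangles(const uint32_t vertexCount)
{
    HostTriangles geom = {};
    assert(!geom.hasTransform()); // the host transform defaults to the NaN sentinel, so nothing gets baked in
    geom.maxVertex = vertexCount - 1u;
    geom.vertexStride = 3u*sizeof(float);
    geom.vertexFormat = nbl::asset::EF_R32G32B32_SFLOAT;
    geom.indexType = nbl::asset::EIT_32BIT; // assumes the usual E_INDEX_TYPE enumerator
    // assigning a real matrix to geom.transform replaces the NaN sentinel and makes hasTransform() return true
    return geom;
}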
Pure virtual destructor to ensure no instantiation NBL_API2 virtual ~IAsset() = 0; - virtual IAsset* getDependant_impl(const size_t ix) = 0; - private: friend IAssetManager; bool m_mutable = true; + + virtual void visitDependents_impl(std::function visit) const = 0; }; template diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 9c9af32f7b..a6b148a891 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -135,12 +135,10 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo return cp; } - // Do not report anything as a dependant, we'll simply drop the data instead of discarding its contents - inline size_t getDependantCount() const override {return 0;} inline core::blake3_hash_t computeContentHash() const override { - if (!missingContent()) + if (missingContent()) return INVALID_HASH; const bool isAABB = m_buildFlags.hasFlags(BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT); core::blake3_hasher hasher; @@ -233,11 +231,36 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo return !m_geometryPrimitiveCount || !m_triangleGeoms && !m_AABBGeoms; } + inline bool valid() const override + { + if (!validBuildFlags(m_buildFlags)) return false; + + size_t geometryCount = 0; + if (m_buildFlags.hasFlags(BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + { + if (!m_AABBGeoms || m_triangleGeoms) return false; + geometryCount = m_AABBGeoms->size(); + } + else + { + if (!m_triangleGeoms || m_AABBGeoms) return false; + geometryCount = m_triangleGeoms->size(); + } + + // https://registry.khronos.org/vulkan/specs/latest/man/html/vkGetAccelerationStructureBuildSizesKHR.html#VUID-vkGetAccelerationStructureBuildSizesKHR-pBuildInfo-03619 + if (geometryCount == 0) { + if (m_geometryPrimitiveCount && m_geometryPrimitiveCount->size() > 0) return false; + } + else + { + if (!m_geometryPrimitiveCount || m_geometryPrimitiveCount->size() != geometryCount) return false; + } + return true; + } + protected: virtual ~ICPUBottomLevelAccelerationStructure() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} - inline void discardContent_impl() override { m_triangleGeoms = nullptr; @@ -251,6 +274,8 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo core::smart_refctd_dynamic_array> m_AABBGeoms = nullptr; core::smart_refctd_dynamic_array m_geometryPrimitiveCount = nullptr; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_TRACE_BIT; + + inline void visitDependents_impl(std::function visit) const override {} }; class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelAccelerationStructure @@ -263,9 +288,6 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA // ICPUTopLevelAccelerationStructure() = default; - // - inline size_t getDependantCount() const override {return m_instances->size();} - // inline auto& getBuildRangeInfo() { @@ -357,18 +379,32 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA return cp; } - protected: - virtual ~ICPUTopLevelAccelerationStructure() = default; - - inline IAsset* getDependant_impl(const size_t ix) override + inline bool valid() const override { - return m_instances->operator[](ix).getBase().blas.get(); + if (!validBuildFlags(m_buildFlags)) return false; + if (!m_instances) return false; + for (const auto& instance : *m_instances) + if (!instance.getBase().blas->valid()) 
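// Illustrative sketch, not part of the patch: the old getDependantCount()/getDependant() walk is
// now expressed through the visitor; returning false from the callback stops the traversal early,
// and null dependents are filtered out before the callback is invoked.
#include <cstddef>
inline size_t countDependents(const nbl::asset::IAsset* asset)
{
    size_t count = 0;
    asset->visitDependents([&count](const nbl::asset::IAsset* dep) -> bool
    {
        count++;     // `dep` is never null here
        return true; // keep visiting
    });
    return count;
}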
return false; + if (m_buildRangeInfo.instanceCount != m_instances->size()) return false; + // https://registry.khronos.org/vulkan/specs/latest/man/html/VkAccelerationStructureBuildRangeInfoKHR.html#VUID-VkAccelerationStructureBuildRangeInfoKHR-primitiveOffset-03660 + if (m_buildRangeInfo.instanceByteOffset % 16 != 0) return false; + return true; } + protected: + virtual ~ICPUTopLevelAccelerationStructure() = default; + private: core::smart_refctd_dynamic_array m_instances = nullptr; hlsl::acceleration_structures::top_level::BuildRangeInfo m_buildRangeInfo; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_BUILD_BIT; + + inline void visitDependents_impl(std::function visit) const override + { + if (!m_instances) return; + for (const auto& instance : *m_instances) + if (!visit(instance.getBase().blas.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index 1b02787597..321cefa33b 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -95,23 +95,16 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public constexpr static inline auto AssetType = ET_ANIMATION_LIBRARY; inline E_TYPE getAssetType() const override { return AssetType; } + inline bool valid() const override { return true; } - inline size_t getDependantCount() const override {return 3;} + private: - protected: - inline IAsset* getDependant_impl(const size_t ix) override - { - switch (ix) - { - case 0: - return m_keyframeStorageBinding.buffer.get(); - case 1: - return m_timestampStorageBinding.buffer.get(); - default: - break; - } - return m_animationStorageRange.buffer.get(); - } + inline void visitDependents_impl(std::function visit) const override + { + if (!visit(m_keyframeStorageBinding.buffer.get())) return; + if (!visit(m_timestampStorageBinding.buffer.get())) return; + if (!visit(m_animationStorageRange.buffer.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index 5bb16bd0ac..46105b3c0e 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -75,8 +75,6 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed constexpr static inline auto AssetType = ET_BUFFER; inline IAsset::E_TYPE getAssetType() const override final { return AssetType; } - inline size_t getDependantCount() const override { return 0; } - inline core::blake3_hash_t computeContentHash() const override { core::blake3_hasher hasher; @@ -112,12 +110,15 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed return true; } -protected: - inline IAsset* getDependant_impl(const size_t ix) override - { - return nullptr; - } + inline bool valid() const override + { + if (!m_data) return false; + if (!m_mem_resource) return false; + // check if alignment is power of two + return (m_alignment > 0 && !(m_alignment & (m_alignment - 1))); + } +protected: inline void discardContent_impl() override { if (m_data) @@ -136,6 +137,8 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed discardContent_impl(); } + inline void visitDependents_impl(std::function visit) const override {} + void* m_data; core::smart_refctd_ptr m_mem_resource; size_t m_alignment; diff --git a/include/nbl/asset/ICPUBufferView.h b/include/nbl/asset/ICPUBufferView.h index 3819136c98..8634fd8394 100644 --- a/include/nbl/asset/ICPUBufferView.h +++ b/include/nbl/asset/ICPUBufferView.h @@ -28,8 +28,6 @@ class ICPUBufferView : public IBufferView, public IAsset 
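// Illustrative aside, not part of the patch: the `m_alignment & (m_alignment - 1)` test above is
// the standard power-of-two check; a standalone equivalent with worked values:
#include <cstddef>
constexpr bool isPowerOfTwo(const size_t x) { return x > 0 && !(x & (x - 1)); }
static_assert(isPowerOfTwo(8));   // 0b1000 & 0b0111 == 0
static_assert(!isPowerOfTwo(12)); // 0b1100 & 0b1011 == 0b1000, not zero
static_assert(!isPowerOfTwo(0));  // zero alignment is rejected by the `> 0` term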
constexpr static inline auto AssetType = ET_BUFFER_VIEW; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - inline size_t getDependantCount() const override {return 1;} - ICPUBuffer* getUnderlyingBuffer() { assert(isMutable()); @@ -48,12 +46,24 @@ class ICPUBufferView : public IBufferView, public IAsset m_size = _size; } + inline bool valid() const override + { + if (!m_buffer->valid()) return false; + if (m_offset >= m_buffer->getSize()) return false; + if (m_size <= 0) return false; + if (m_offset >= m_buffer->getSize()) return false; + if (m_size > m_buffer->getSize() - m_offset) return false; + return true; + } + protected: virtual ~ICPUBufferView() = default; - inline IAsset* getDependant_impl(const size_t ix) override + private: + + inline void visitDependents_impl(std::function visit) const override { - return m_buffer.get(); + if (!visit(m_buffer.get())) return; } }; diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index b9b707d9fc..ffcf78e908 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -6,62 +6,92 @@ #include "nbl/asset/ICPUPipeline.h" +#include "nbl/asset/IComputePipeline.h" namespace nbl::asset { //! CPU Version of Compute Pipeline -class ICPUComputePipeline : public ICPUPipeline,1> +class ICPUComputePipeline final : public ICPUPipeline> { - using base_t = ICPUPipeline,1>; + using pipeline_base_t = IComputePipeline; + using base_t = ICPUPipeline>; public: - struct SCreationParams final : IPipeline::SCreationParams - { - SShaderSpecInfo shader; - }; - static core::smart_refctd_ptr create(const SCreationParams& params) + + static core::smart_refctd_ptr create(ICPUPipelineLayout* layout) { - if (!params.layout) - return nullptr; - auto retval = new ICPUComputePipeline(core::smart_refctd_ptr(params.layout)); - if (!retval->setSpecInfo(params.shader)) - { - retval->drop(); - return nullptr; - } + auto retval = new ICPUComputePipeline(layout); return core::smart_refctd_ptr(retval,core::dont_grab); } constexpr static inline auto AssetType = ET_COMPUTE_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } - - //! 
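// Illustrative aside, not part of the patch: the view validity test above amounts to the usual
// "offset + size fits inside the buffer" check, written so it cannot overflow; a standalone
// equivalent with worked values:
#include <cstddef>
constexpr bool viewFits(const size_t bufferSize, const size_t offset, const size_t size)
{
    return size > 0 && offset < bufferSize && size <= bufferSize - offset;
}
static_assert(viewFits(256, 192, 64));  // reaches exactly the end of the buffer
static_assert(!viewFits(256, 192, 96)); // 192 + 96 > 256, so ICPUBufferView::valid() would return false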
- inline size_t getDependantCount() const override {return 2;} - // provide default arg - inline IPipelineBase::SShaderSpecInfo getSpecInfo() const {return base_t::getSpecInfo(hlsl::ShaderStage::ESS_COMPUTE);} + inline std::span getSpecInfos(const hlsl::ShaderStage stage) const override + { + if (stage==hlsl::ShaderStage::ESS_COMPUTE) + return {&m_specInfo,1}; + return {}; + } + + inline std::span getSpecInfos(const hlsl::ShaderStage stage) + { + return base_t::getSpecInfos(stage); + } + + inline SShaderSpecInfo& getSpecInfo() + { + return m_specInfo; + } + + inline const SShaderSpecInfo& getSpecInfo() const + { + return m_specInfo; + } + + inline const SCachedCreationParams& getCachedCreationParams() const + { + return pipeline_base_t::getCachedCreationParams(); + } + + inline SCachedCreationParams& getCachedCreationParams() + { + assert(isMutable()); + return m_params; + } + + inline bool valid() const override + { + if (!m_layout) return false; + if (!m_layout->valid()) return false; + return m_specInfo.valid(); + } protected: using base_t::base_t; virtual ~ICPUComputePipeline() = default; - base_t* clone_impl(core::smart_refctd_ptr&& layout) const override - { - return new ICPUComputePipeline(std::move(layout)); - } - - inline IAsset* getDependant_impl(const size_t ix) override + + private: + SShaderSpecInfo m_specInfo; + + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { - if (ix!=0) - return m_stages[0].shader.get(); - return const_cast(m_layout.get()); + auto newPipeline = new ICPUComputePipeline(layout.get()); + newPipeline->m_specInfo = m_specInfo.clone(depth); + return core::smart_refctd_ptr(newPipeline, core::dont_grab); } - inline int8_t stageToIndex(const hlsl::ShaderStage stage) const override + explicit ICPUComputePipeline(ICPUPipelineLayout* layout): + base_t(layout, {}) + {} + + inline void visitDependents_impl(std::function visit) const override { - return stage!=hlsl::ShaderStage::ESS_COMPUTE ? 
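// Illustrative sketch, not part of the patch: the new creation path only takes the layout and the
// single compute stage is filled in afterwards through the mutable spec-info accessor. The shader
// pointer type is assumed from context since it is elided above.
#include <cassert>
#include <utility>
inline nbl::core::smart_refctd_ptr<nbl::asset::ICPUComputePipeline> createComputePipeline(
    nbl::asset::ICPUPipelineLayout* layout, nbl::core::smart_refctd_ptr<nbl::asset::IShader> shader)
{
    auto pipeline = nbl::asset::ICPUComputePipeline::create(layout);
    auto& specInfo = pipeline->getSpecInfo();
    specInfo.shader = std::move(shader);
    specInfo.entryPoint = "main";
    // valid() requires a valid layout and also checks the spec-info itself
    assert(pipeline->valid());
    return pipeline;
}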
(-1):0; + if (!visit(m_layout.get())) return; + if (!visit(m_specInfo.shader.get())) return; } }; diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 826c54cc39..4247283c0e 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -47,8 +47,6 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSetgetTotalBindingCount()+1;} - // inline ICPUDescriptorSetLayout* getLayout() { @@ -79,14 +77,74 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet clone(uint32_t _depth = ~0u) const override; + inline bool valid() const override { + if (!m_layout->valid()) return false; + for (auto type_i = 0u; type_i < static_cast(IDescriptor::E_TYPE::ET_COUNT); type_i++) + { + const auto descriptorType = static_cast(type_i); + const auto descriptorCategory = IDescriptor::GetTypeCategory(descriptorType); + const auto& descriptorRedirect = m_layout->getDescriptorRedirect(descriptorType); + const auto& descriptorInfoArr = m_descriptorInfos[type_i]; + + if (descriptorInfoArr->size() != descriptorRedirect.getTotalCount()) return false; + + auto offset = 0; + for (auto binding_i = 0; binding_i < descriptorRedirect.getBindingCount(); binding_i++) + { + const auto storageIndex = IDescriptorSetLayoutBase::CBindingRedirect::storage_range_index_t(binding_i); + const auto descriptorCount = descriptorRedirect.getCount(storageIndex); + const auto createFlags = descriptorRedirect.getCreateFlags(storageIndex); + const auto isPartiallyBound = !createFlags.hasFlags(IDescriptorSetLayoutBase::SBindingBase::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT); + for (auto descriptor_i = 0; descriptor_i < descriptorCount; descriptor_i++) + { + const auto& descriptorInfo = descriptorInfoArr->operator[](offset); + + // partiallyBound layout can have null descriptor, otherwise not + if (!isPartiallyBound && !descriptorInfo.desc) return false; + if (descriptorInfo.desc && descriptorInfo.desc->getTypeCategory() != descriptorCategory) return false; + } + } + } + + return true; + } + protected: virtual ~ICPUDescriptorSet() = default; - IAsset* getDependant_impl(size_t ix) override; private: core::smart_refctd_dynamic_array m_descriptorInfos[static_cast(IDescriptor::E_TYPE::ET_COUNT)]; + + inline void visitDependents_impl(std::function visit) const override + { + for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) + { + if (!m_descriptorInfos[i]) continue; + const auto size = m_descriptorInfos[i]->size(); + for (auto desc_i = 0u; desc_i < size; desc_i++) + { + auto* desc = m_descriptorInfos[i]->operator[](desc_i).desc.get(); + if (!desc) continue; + switch (IDescriptor::GetTypeCategory(static_cast(i))) + { + case IDescriptor::EC_BUFFER: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_SAMPLER: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_IMAGE: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_BUFFER_VIEW: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_ACCELERATION_STRUCTURE: + if (!visit(static_cast(desc))) return; + default: + break; + } + } + } + } }; } diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index 8f45a789ea..a46bb55808 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -56,16 +56,23 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public constexpr static inline auto AssetType = ET_DESCRIPTOR_SET_LAYOUT; inline E_TYPE 
getAssetType() const override { return AssetType; } - - inline size_t getDependantCount() const override {return m_immutableSamplers ? m_immutableSamplers->size():0;} + inline bool valid() const override + { + return true; // no modification is possible after creation + } protected: virtual ~ICPUDescriptorSetLayout() = default; - inline IAsset* getDependant_impl(const size_t ix) override - { - return m_immutableSamplers->operator[](ix).get(); - } + + private: + + inline void visitDependents_impl(std::function visit) const override + { + if (!m_immutableSamplers) return; + for (const auto& sampler : *m_immutableSamplers) + if (!visit(sampler.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 2643db7550..acc990f18c 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -13,91 +13,127 @@ namespace nbl::asset { -class ICPUGraphicsPipeline final : public ICPUPipeline,5u> +class ICPUGraphicsPipeline final : public ICPUPipeline> { - using pipeline_base_t = IGraphicsPipeline; - using base_t = ICPUPipeline; + using pipeline_base_t = IGraphicsPipeline; + using base_t = ICPUPipeline; public: - struct SCreationParams final : pipeline_base_t::SCreationParams - { - private: - friend class ICPUGraphicsPipeline; - template - inline bool impl_valid(ExtraLambda&& extra) const - { - return pipeline_base_t::SCreationParams::impl_valid(std::move(extra)); - } - }; - static core::smart_refctd_ptr create(const SCreationParams& params) - { - // we'll validate the specialization info later when attempting to set it - if (!params.impl_valid([](const IPipelineBase::SShaderSpecInfo& info)->bool{return true;})) - return nullptr; - auto retval = new ICPUGraphicsPipeline(params); - for (const auto spec : params.shaders) - if (spec.shader) - retval->setSpecInfo(spec); + + static core::smart_refctd_ptr create(ICPUPipelineLayout* layout, ICPURenderpass* renderpass = nullptr) + { + auto retval = new ICPUGraphicsPipeline(layout, renderpass); return core::smart_refctd_ptr(retval,core::dont_grab); - } - - constexpr static inline auto AssetType = ET_GRAPHICS_PIPELINE; - inline E_TYPE getAssetType() const override { return AssetType; } - - inline size_t getDependantCount() const override - { - auto stageCount = 2; // the layout and renderpass - for (const auto& stage : m_stages) - if (stage.shader) - stageCount++; - return stageCount; - } - - // extras for this class - inline const SCachedCreationParams& getCachedCreationParams() const {return base_t::getCachedCreationParams();} + } + + constexpr static inline auto AssetType = ET_GRAPHICS_PIPELINE; + inline E_TYPE getAssetType() const override { return AssetType; } + + inline const SCachedCreationParams& getCachedCreationParams() const + { + return pipeline_base_t::getCachedCreationParams(); + } + inline SCachedCreationParams& getCachedCreationParams() { assert(isMutable()); return m_params; } + inline std::span getSpecInfos(const hlsl::ShaderStage stage) const override final + { + const auto stageIndex = stageToIndex(stage); + if (stageIndex != -1) + return { &m_specInfos[stageIndex], 1 }; + return {}; + } + + inline std::span getSpecInfos(const hlsl::ShaderStage stage) + { + return base_t::getSpecInfos(stage); + } + + SShaderSpecInfo* getSpecInfo(const hlsl::ShaderStage stage) + { + if (!isMutable()) return nullptr; + const auto stageIndex = stageToIndex(stage); + if (stageIndex != -1) + return &m_specInfos[stageIndex]; + return nullptr; + } + + const 
SShaderSpecInfo* getSpecInfo(const hlsl::ShaderStage stage) const + { + const auto stageIndex = stageToIndex(stage); + if (stageIndex != -1) + return &m_specInfos[stageIndex]; + return nullptr; + } + + inline bool valid() const override + { + if (!m_layout) return false; + if (!m_layout->valid())return false; + + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-dynamicRendering-06576 + if (!m_renderpass || m_params.subpassIx >= m_renderpass->getSubpassCount()) return false; + + core::bitflag stagePresence = {}; + for (auto shader_i = 0u; shader_i < m_specInfos.size(); shader_i++) + { + const auto& info = m_specInfos[shader_i]; + if (info.shader) + stagePresence |= indexToStage(shader_i); + } + return hasRequiredStages(stagePresence, m_params.primitiveAssembly.primitiveType); + } + protected: - using base_t::base_t; - ~ICPUGraphicsPipeline() = default; - - base_t* clone_impl(core::smart_refctd_ptr&& layout) const override - { - std::array _shaders; - for (auto i=0; i(m_layout.get()); - if (ix==1) - return m_renderpass.get(); - size_t stageCount = 0; - for (auto& stage : m_stages) - if (stage.shader) - if ((stageCount++)==ix-2) - return stage.shader.get(); - return nullptr; - } - - inline int8_t stageToIndex(const hlsl::ShaderStage stage) const override - { - const auto stageIx = hlsl::findLSB(stage); - if (stageIx<0 || stageIx>=GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) - return -1; - return stageIx; - } + using base_t::base_t; + virtual ~ICPUGraphicsPipeline() override = default; + + std::array m_specInfos; + + private: + explicit ICPUGraphicsPipeline(ICPUPipelineLayout* layout, ICPURenderpass* renderpass) + : base_t(layout, {}, renderpass) + {} + + static inline int8_t stageToIndex(const hlsl::ShaderStage stage) + { + const auto stageIx = hlsl::findLSB(stage); + if (stageIx < 0 || stageIx >= GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) + return -1; + return stageIx; + } + + static inline hlsl::ShaderStage indexToStage(const int8_t index) + { + if (index < 0 || index > GRAPHICS_SHADER_STAGE_COUNT) + return hlsl::ShaderStage::ESS_UNKNOWN; + return static_cast(hlsl::ShaderStage::ESS_VERTEX + index); + } + + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + { + auto* newPipeline = new ICPUGraphicsPipeline(layout.get(), m_renderpass.get()); + newPipeline->m_params = m_params; + + for (auto specInfo_i = 0u; specInfo_i < m_specInfos.size(); specInfo_i++) + { + newPipeline->m_specInfos[specInfo_i] = m_specInfos[specInfo_i].clone(depth); + } + + return core::smart_refctd_ptr(newPipeline, core::dont_grab); + } + + inline void visitDependents_impl(std::function visit) const override + { + if (!visit(m_layout.get())) return; + if (!visit(m_renderpass.get())) return; + for (const auto& info : m_specInfos) + if (!visit(info.shader.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index c27cd21b86..fdbf640557 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -45,9 +45,6 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed constexpr static inline auto AssetType = ET_IMAGE; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - // Do not report buffer as dependant, as we will simply drop it instead of discarding its contents! 
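// Illustrative sketch, not part of the patch: per-stage shader infos are now owned by the pipeline
// and edited in place; stage enumerators and pointer types are assumed from context.
#include <utility>
inline nbl::core::smart_refctd_ptr<nbl::asset::ICPUGraphicsPipeline> createGraphicsPipeline(
    nbl::asset::ICPUPipelineLayout* layout, nbl::asset::ICPURenderpass* renderpass,
    nbl::core::smart_refctd_ptr<nbl::asset::IShader> vertexShader,
    nbl::core::smart_refctd_ptr<nbl::asset::IShader> fragmentShader)
{
    auto gfx = nbl::asset::ICPUGraphicsPipeline::create(layout, renderpass);
    gfx->getCachedCreationParams().subpassIx = 0u;
    if (auto* vs = gfx->getSpecInfo(nbl::hlsl::ShaderStage::ESS_VERTEX)) // nullptr on an immutable pipeline
    {
        vs->shader = std::move(vertexShader);
        vs->entryPoint = "main";
    }
    if (auto* fs = gfx->getSpecInfo(nbl::hlsl::ShaderStage::ESS_FRAGMENT))
    {
        fs->shader = std::move(fragmentShader);
        fs->entryPoint = "main";
    }
    // valid() checks the layout, that subpassIx is inside the renderpass, and that every stage
    // required by the primitive topology is present
    if (!gfx->valid())
        return nullptr;
    return gfx;
}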
- inline size_t getDependantCount() const override {return 0;} - core::blake3_hash_t computeContentHash() const override; // Having regions specififed to upload is optional! So to have content missing we must have regions but no buffer content @@ -198,12 +195,21 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed return true; } + inline bool valid() const override + { + if (!validateCreationParameters(m_creationParams)) return false; + if (info != m_creationParams.format) return false; + if (buffer && !buffer->valid()) return false; + if (regions) + for (const auto& region : *regions) + if (!region.isValid()) return false; + return true; + } + protected: inline ICPUImage(const SCreationParams& _params) : IImage(_params) {} virtual ~ICPUImage() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} - inline void discardContent_impl() override { buffer = nullptr; @@ -221,6 +227,10 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed return _a.imageSubresource.mipLevel < _b.imageSubresource.mipLevel; } }; + + inline void visitDependents_impl(std::function visit) const override + { + } }; } // end namespace nbl::asset diff --git a/include/nbl/asset/ICPUImageView.h b/include/nbl/asset/ICPUImageView.h index 87df463021..85a0629cc3 100644 --- a/include/nbl/asset/ICPUImageView.h +++ b/include/nbl/asset/ICPUImageView.h @@ -49,9 +49,6 @@ class ICPUImageView final : public IImageView, public IAsset constexpr static inline auto AssetType = ET_IMAGE_VIEW; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - //! - inline size_t getDependantCount() const override {return 1;} - //! const SComponentMapping& getComponents() const { return params.components; } SComponentMapping& getComponents() @@ -65,13 +62,26 @@ class ICPUImageView final : public IImageView, public IAsset params.subresourceRange.aspectMask = aspect.value; } + inline bool valid() const override + { + if (!validateCreationParameters(params)) return false; + + // image nullptr already checked in validateCreationParameters; + assert(params.image); + if (!params.image->valid()) return false; + + return true; + } + protected: virtual ~ICPUImageView() = default; - inline IAsset* getDependant_impl(const size_t ix) override - { - return params.image.get(); - } + private: + + inline void visitDependents_impl(std::function visit) const override + { + if (!visit(params.image.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUMesh.h b/include/nbl/asset/ICPUMesh.h index a21f5f3f02..df647b14a4 100644 --- a/include/nbl/asset/ICPUMesh.h +++ b/include/nbl/asset/ICPUMesh.h @@ -81,14 +81,24 @@ class ICPUMesh final : public IMesh, public IAsset return cp; } - //! 
CLASS IS DEPRECATED ANYWAY - inline size_t getDependantCount() const override {return 0;} + inline bool valid() const override + { + for (const auto& meshBuffer : m_meshBuffers) + { + if (!meshBuffer) return false; + if (!meshBuffer->valid()) return false; + } + return true; + } protected: - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} private: core::vector> m_meshBuffers; + + inline void visitDependents_impl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index 532b622090..aa6cbc9429 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -610,12 +610,15 @@ class ICPUMeshBuffer final : public IMeshBuffer(const_cast(this)->getJointAABBs()); } + inline bool valid() const override + { + return true; + } - //! CLASS IS DEPRECATED ANYWAY - inline size_t getDependantCount() const override {return 0;} - - protected: - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} + private: + inline void visitDependents_impl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index d1693f18eb..e9442e0b8c 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -13,38 +13,98 @@ namespace nbl::asset { -// Common Base class for pipelines -template -class ICPUPipeline : public IAsset, public PipelineNonAssetBase +class ICPUPipelineBase { - using this_t = ICPUPipeline; - public: - inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final + struct SShaderSpecInfo { - core::smart_refctd_ptr layout; - if (_depth>0u && PipelineNonAssetBase::m_layout) - layout = core::smart_refctd_ptr_static_cast(PipelineNonAssetBase::m_layout->clone(_depth-1u)); + //! Structure specifying a specialization map entry + /* + Note that if specialization constant ID is used + in a shader, \bsize\b and \boffset'b must match + to \isuch an ID\i accordingly. + + By design the API satisfies: + https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-offset-00773 + https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-pMapEntries-00774 + */ + //!< The ID of the specialization constant in SPIR-V. If it isn't used in the shader, the map entry does not affect the behavior of the pipeline. 
+ using spec_constant_id_t = uint32_t; + + using SSpecConstantValue = core::vector; + + inline SSpecConstantValue* getSpecializationByteValue(const spec_constant_id_t _specConstID) + { + const auto found = entries.find(_specConstID); + if (found != entries.end() && found->second.size()) return &found->second; + else return nullptr; + } + + static constexpr int32_t INVALID_SPEC_INFO = -1; + inline int32_t valid() const + { + if (!shader) return INVALID_SPEC_INFO; - auto cp = clone_impl(std::move(layout)); - for (auto i=0; i 0x7fffffff) return INVALID_SPEC_INFO; + return static_cast(specData); + } + + core::smart_refctd_ptr shader = nullptr; + std::string entryPoint = ""; + + IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize = IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement + + using spec_constant_map_t = core::unordered_map; + // Container choice implicitly satisfies: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 + spec_constant_map_t entries; + // By requiring Nabla Core Profile features we implicitly satisfy: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 + // Also because our API is sane, it satisfies the following by construction: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 + + SShaderSpecInfo clone(uint32_t depth) const { - const auto shader = m_stages[i].shader; - if (shader) + auto newSpecInfo = *this; + if (newSpecInfo.shader.get() != nullptr && depth > 0u) { - auto stageInfo = m_stages[i].info; - core::smart_refctd_ptr newShader; - if (_depth>0u) - { - newShader = core::smart_refctd_ptr_static_cast(shader->clone(_depth-1u)); - stageInfo.shader = newShader.get(); - } - cp->setSpecInfo(stageInfo); + newSpecInfo.shader = core::smart_refctd_ptr_static_cast(this->shader->clone(depth - 1u)); } + return newSpecInfo; } + }; - return core::smart_refctd_ptr(cp,core::dont_grab); - } + virtual std::span getSpecInfos(const hlsl::ShaderStage stage) const = 0; + +}; + +// Common Base class for pipelines +template + requires (std::is_base_of_v, PipelineNonAssetBase> && !std::is_base_of_v) +class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipelineBase +{ + using this_t = ICPUPipeline; + + public: // extras for this class ICPUPipelineLayout* getLayout() @@ -60,82 +120,34 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase PipelineNonAssetBase::m_layout = std::move(_layout); } - // The getters are weird because the shader pointer, spec constant map and entry point needs patching - inline IShader* getShader(const hlsl::ShaderStage stage) - { - assert(isMutable()); - return const_cast(getSpecInfo(stage).shader); - } - inline std::string* getEntryPoint(const hlsl::ShaderStage stage) - { - const auto stageIx = stageToIndex(stage); - if (stageIx<0) - return {}; - return &m_stages[stageIx].entryPoint; - } - inline IPipelineBase::SShaderSpecInfo::spec_constant_map_t* getSpecConstantMap(const hlsl::ShaderStage stage) + inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final { - assert(isMutable()); - return 
const_cast(getSpecInfo(stage).entries); + if (!getLayout()) return nullptr; + + core::smart_refctd_ptr layout; + if (_depth > 0u) + layout = core::smart_refctd_ptr_static_cast(getLayout()->clone(_depth - 1u)); + + return clone_impl(std::move(layout), _depth); } - // - inline IPipelineBase::SShaderSpecInfo getSpecInfo(const hlsl::ShaderStage stage) const - { - const auto stageIx = stageToIndex(stage); - if (stageIx<0) - return {}; - return m_stages[stageIx].info; - } - inline bool setSpecInfo(const IPipelineBase::SShaderSpecInfo& info) - { - assert(isMutable()); - const int64_t specSize = info.valid(); - if (specSize<0) - return false; - const auto stageIx = stageToIndex(info.stage); - if (stageIx<0) - return false; - auto& outStage = m_stages[stageIx]; - outStage.info = info; - outStage.entryPoint = info.entryPoint; - outStage.shader = core::smart_refctd_ptr(const_cast(info.shader)); - outStage.info.shader = outStage.shader.get(); - auto& outEntries = outStage.entries; - if (specSize>0) - { - outEntries = std::make_unique(); - outEntries->reserve(info.entries->size()); - std::copy(info.entries->begin(),info.entries->end(),std::insert_iterator(*outEntries,outEntries->begin())); - } - else - outEntries = nullptr; - outStage.info.entries = outEntries.get(); - return true; - } - inline bool clearStage(const hlsl::ShaderStage stage) + + // Note(kevinyu): For some reason overload resolution cannot find this function when I name it getSpecInfos. It always uses the const variant. Will check on it later. + inline std::span getSpecInfos(const hlsl::ShaderStage stage) { - assert(isMutable()); - const auto stageIx = stageToIndex(stage); - if (stageIx<0) - return false; - m_stages[stageIx] = {}; - return true; + if (!isMutable()) return {}; + const this_t* constPipeline = const_cast(this); + const ICPUPipelineBase* basePipeline = constPipeline; + const auto specInfo = basePipeline->getSpecInfos(stage); + return { const_cast(specInfo.data()), specInfo.size() }; } protected: + using PipelineNonAssetBase::PipelineNonAssetBase; virtual ~ICPUPipeline() = default; + + virtual core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const = 0; - virtual this_t* clone_impl(core::smart_refctd_ptr&& layout) const = 0; - virtual int8_t stageToIndex(const hlsl::ShaderStage stage) const = 0; - - struct ShaderStage - { - std::string entryPoint = {}; - core::smart_refctd_ptr shader = {}; - std::unique_ptr entries = {}; - IPipelineBase::SShaderSpecInfo info = {}; - } m_stages[MaxShaderStageCount] = {}; }; } diff --git a/include/nbl/asset/ICPUPipelineCache.h b/include/nbl/asset/ICPUPipelineCache.h index 0c1d8c17cf..c5511f39bb 100644 --- a/include/nbl/asset/ICPUPipelineCache.h +++ b/include/nbl/asset/ICPUPipelineCache.h @@ -60,8 +60,6 @@ class ICPUPipelineCache final : public IPreHashed return core::make_smart_refctd_ptr(std::move(cache_cp)); } - inline size_t getDependantCount() const override {return 0;} - // inline core::blake3_hash_t computeContentHash() const override { @@ -85,9 +83,12 @@ class ICPUPipelineCache final : public IPreHashed // const auto& getEntries() const {return m_cache;} - protected: - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} + inline bool valid() const override + { + return true; + } + protected: inline void discardContent_impl() override { for (auto& entry : m_cache) @@ -96,6 +97,10 @@ class ICPUPipelineCache final : public IPreHashed private: entries_map_t m_cache; + + inline void visitDependents_impl(std::function visit) const
override + { + } }; } diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index c4a76fdea9..b30ecc3e10 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -30,16 +30,6 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout&& _layout2, core::smart_refctd_ptr&& _layout3 ) : IPipelineLayout(_pcRanges,std::move(_layout0),std::move(_layout1),std::move(_layout2),std::move(_layout3)) {} - // - inline size_t getDependantCount() const override - { - size_t count = 0; - for (auto i=0; ivalid()) return false; + } + return true; + } + protected: virtual ~ICPUPipelineLayout() = default; - inline IAsset* getDependant_impl(const size_t ix) override - { - size_t count = 0; - for (auto i=0; i visit) const override + { + for (auto i = 0; i < m_descSetLayouts.size(); i++) + { + if (!m_descSetLayouts[i]) continue; + if (!visit(m_descSetLayouts[i].get())) return; + } + } + }; } diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h new file mode 100644 index 0000000000..17c53557e1 --- /dev/null +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -0,0 +1,153 @@ + +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_ASSET_I_CPU_RAY_TRACING_PIPELINE_H_INCLUDED_ +#define _NBL_ASSET_I_CPU_RAY_TRACING_PIPELINE_H_INCLUDED_ + +#include "nbl/asset/IRayTracingPipeline.h" +#include "nbl/asset/ICPUPipeline.h" + + +namespace nbl::asset +{ + +//! CPU Version of RayTracing Pipeline +class ICPURayTracingPipeline final : public ICPUPipeline> +{ + using pipeline_base_t = IRayTracingPipeline; + using base_t = ICPUPipeline; + + public: + struct SHitGroupSpecInfos { + core::vector closestHits; + core::vector anyHits; + core::vector intersections; + }; + + static core::smart_refctd_ptr create(const ICPUPipelineLayout* layout) + { + auto retval = new ICPURayTracingPipeline(layout); + return core::smart_refctd_ptr(retval,core::dont_grab); + } + + + + constexpr static inline auto AssetType = ET_RAYTRACING_PIPELINE; + inline E_TYPE getAssetType() const override { return AssetType; } + + inline std::span getSpecInfos(const hlsl::ShaderStage stage) const override + { + switch (stage) + { + case hlsl::ShaderStage::ESS_RAYGEN: + return { &m_raygen, 1 }; + case hlsl::ShaderStage::ESS_MISS: + return m_misses; + case hlsl::ShaderStage::ESS_ANY_HIT: + return m_hitGroups.anyHits; + case hlsl::ShaderStage::ESS_CLOSEST_HIT: + return m_hitGroups.closestHits; + case hlsl::ShaderStage::ESS_INTERSECTION: + return m_hitGroups.intersections; + case hlsl::ShaderStage::ESS_CALLABLE: + return m_callables; + + } + return {}; + } + + inline std::span getSpecInfos(const hlsl::ShaderStage stage) + { + return base_t::getSpecInfos(stage); + } + + inline core::vector* getSpecInfoVector(const hlsl::ShaderStage stage) + { + if (!isMutable()) return nullptr; + switch (stage) + { + // raygen is not stored as vector so we can't return it here. 
Use getSpecInfo + case hlsl::ShaderStage::ESS_MISS: + return &m_misses; + case hlsl::ShaderStage::ESS_ANY_HIT: + return &m_hitGroups.anyHits; + case hlsl::ShaderStage::ESS_CLOSEST_HIT: + return &m_hitGroups.closestHits; + case hlsl::ShaderStage::ESS_INTERSECTION: + return &m_hitGroups.intersections; + case hlsl::ShaderStage::ESS_CALLABLE: + return &m_callables; + + } + return nullptr; + } + + + inline bool valid() const override final + { + if (!m_layout) return false; + if (!m_layout->valid()) return false; + if (m_raygen.valid() == SShaderSpecInfo::INVALID_SPEC_INFO) return false; + return true; + } + + inline const SCachedCreationParams& getCachedCreationParams() const + { + return pipeline_base_t::getCachedCreationParams(); + } + + inline SCachedCreationParams& getCachedCreationParams() { + assert(isMutable()); + return m_params; + } + + protected: + virtual ~ICPURayTracingPipeline() = default; + + private: + + SShaderSpecInfo m_raygen; + core::vector m_misses; + SHitGroupSpecInfos m_hitGroups; + core::vector m_callables; + + explicit ICPURayTracingPipeline(const ICPUPipelineLayout* layout) + : base_t(layout, {}) + {} + + inline void visitDependents_impl(std::function visit) const override + { + if (!visit(m_raygen.shader.get())) return; + for (const auto& missInfo : m_misses) if (!visit(missInfo.shader.get())) return; + for (const auto& anyHitInfo : m_hitGroups.anyHits) if (!visit(anyHitInfo.shader.get())) return; + for (const auto& closestHitInfo : m_hitGroups.closestHits) if (!visit(closestHitInfo.shader.get())) return; + for (const auto& intersectionInfo : m_hitGroups.intersections) if (!visit(intersectionInfo.shader.get())) return; + for (const auto& callableInfo : m_callables) if (!visit(callableInfo.shader.get())) return; + } + + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + { + auto newPipeline = new ICPURayTracingPipeline(layout.get()); + newPipeline->m_raygen = m_raygen.clone(depth); + + auto cloneSpecInfos = [depth](const core::vector& specInfos) -> core::vector { + core::vector results; + results.resize(specInfos.size()); + for (auto specInfo_i = 0u; specInfo_i < specInfos.size(); specInfo_i++) + results[specInfo_i] = specInfos[specInfo_i].clone(depth); + return results; + }; + newPipeline->m_misses = cloneSpecInfos(m_misses); + newPipeline->m_hitGroups.anyHits = cloneSpecInfos(m_hitGroups.anyHits); + newPipeline->m_hitGroups.closestHits = cloneSpecInfos(m_hitGroups.closestHits); + newPipeline->m_hitGroups.intersections = cloneSpecInfos(m_hitGroups.intersections); + newPipeline->m_callables = cloneSpecInfos(m_callables); + + newPipeline->m_params = m_params; + return core::smart_refctd_ptr(newPipeline); + } +}; + +} +#endif diff --git a/include/nbl/asset/ICPURenderpass.h b/include/nbl/asset/ICPURenderpass.h index b9cf31d127..daaa5c62b0 100644 --- a/include/nbl/asset/ICPURenderpass.h +++ b/include/nbl/asset/ICPURenderpass.h @@ -38,13 +38,22 @@ class ICPURenderpass : public IRenderpass, public IAsset return ET_RENDERPASS; } - inline size_t getDependantCount() const override {return 0ull;} + inline bool valid() const override + { + // no modification is possible after creation.
parameter is validated when creating renderpass + return true; + } protected: inline ICPURenderpass(const SCreationParams& _params, const SCreationParamValidationResult& _validation) : IRenderpass(_params, _validation) {} inline ~ICPURenderpass() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} + private: + + inline void visitDependents_impl(std::function visit) const override + { + } + }; } diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index ed0171d11f..3d67af23d0 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -19,6 +19,12 @@ namespace nbl::asset class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, public IAsset { public: + struct SCreationParams + { + std::span shaders = {}; + SCachedCreationParams cached = {}; + }; + //(TODO) it is true however it causes DSs to not be cached when ECF_DONT_CACHE_TOP_LEVEL is set which isnt really intuitive constexpr static inline uint32_t DESC_SET_HIERARCHYLEVELS_BELOW = 0u; // TODO: @Crisspl HOW ON EARTH DOES THIS MAKE SENSE!? @@ -66,8 +72,6 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, _NBL_STATIC_INLINE_CONSTEXPR auto AssetType = ET_RENDERPASS_INDEPENDENT_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } - inline size_t getDependantCount() const override {return 0;} - // inline const SCachedCreationParams& getCachedCreationParams() const {return IRenderpassIndependentPipeline::getCachedCreationParams();} inline SCachedCreationParams& getCachedCreationParams() @@ -89,9 +93,14 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, m_layout = std::move(_layout); } + inline bool valid() const override + { + return m_layout && m_layout->valid(); + } + #if 0 // The getters are weird because the shader pointer needs patching - inline IShader::SSpecInfo getSpecInfo(const hlsl::ShaderStage stage) + inline IShader::SSpecInfo getSpecInfos(const hlsl::ShaderStage stage) { assert(isMutable()); const auto stageIx = hlsl::findLSB(stage); @@ -99,7 +108,7 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, return {}; return m_infos[stageIx]; } - inline IShader::SSpecInfo getSpecInfo(const hlsl::ShaderStage stage) const + inline IShader::SSpecInfo getSpecInfos(const hlsl::ShaderStage stage) const { const auto stageIx = hlsl::findLSB(stage); if (stageIx<0 || stageIx>=GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) @@ -137,14 +146,18 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, : IRenderpassIndependentPipeline(params), m_layout(std::move(_layout)) {} virtual ~ICPURenderpassIndependentPipeline() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} - core::smart_refctd_ptr m_layout; #if 0 std::array,GRAPHICS_SHADER_STAGE_COUNT> m_shaders = {}; std::array,GRAPHICS_SHADER_STAGE_COUNT> m_entries = {}; std::array m_infos = {}; #endif + + private: + + inline void visitDependents_impl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUSampler.h b/include/nbl/asset/ICPUSampler.h index 27a918afaa..6b2bea5219 100644 --- a/include/nbl/asset/ICPUSampler.h +++ b/include/nbl/asset/ICPUSampler.h @@ -17,8 +17,6 @@ class ICPUSampler : public ISampler, public IAsset protected: virtual ~ICPUSampler() = default; - inline IAsset* 
getDependant_impl(const size_t ix) override {return nullptr;} - public: ICPUSampler(const SParams& _params) : ISampler(_params), IAsset() {} @@ -70,8 +68,13 @@ class ICPUSampler : public ISampler, public IAsset constexpr static inline auto AssetType = ET_SAMPLER; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } + inline bool valid() const override { return true; } - inline size_t getDependantCount() const override {return 0;} + private: + + inline void visitDependents_impl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index 6f1c576ed8..1049798268 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -78,14 +78,14 @@ class ICPUSkeleton final : public ISkeleton, public IAsset constexpr static inline auto AssetType = ET_SKELETON; inline E_TYPE getAssetType() const override { return AssetType; } + inline bool valid() const override { return true; } - //! - inline size_t getDependantCount() const override {return 2;} + private: - protected: - inline IAsset* getDependant_impl(const size_t ix) override + inline void visitDependents_impl(std::function visit) const override { - return (ix!=0 ? m_defaultTransforms:m_parentJointIDs).buffer.get(); + if (!visit(m_defaultTransforms.buffer.get())) return; + if (!visit(m_parentJointIDs.buffer.get())) return; } }; diff --git a/include/nbl/asset/IComputePipeline.h b/include/nbl/asset/IComputePipeline.h new file mode 100644 index 0000000000..ba4d245473 --- /dev/null +++ b/include/nbl/asset/IComputePipeline.h @@ -0,0 +1,40 @@ +#ifndef _NBL_ASSET_I_COMPUTE_PIPELINE_H_INCLUDED_ +#define _NBL_ASSET_I_COMPUTE_PIPELINE_H_INCLUDED_ + +#include "nbl/asset/IPipeline.h" + +namespace nbl::asset +{ + +class IComputePipelineBase : public virtual core::IReferenceCounted +{ + public: + + struct SCachedCreationParams final + { + uint8_t requireFullSubgroups = false; + }; +}; + +template +class IComputePipeline : public IPipeline, public IComputePipelineBase +{ + using base_creation_params_t = IPipeline; + + public: + + inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } + + protected: + explicit IComputePipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : + IPipeline(core::smart_refctd_ptr(layout)), + m_params(cachedParams) + {} + + SCachedCreationParams m_params; + +}; + +} + +#endif diff --git a/include/nbl/asset/IDescriptorSetLayout.h b/include/nbl/asset/IDescriptorSetLayout.h index 140b8d7485..48d8abab9e 100644 --- a/include/nbl/asset/IDescriptorSetLayout.h +++ b/include/nbl/asset/IDescriptorSetLayout.h @@ -340,7 +340,8 @@ class IDescriptorSetLayout : public IDescriptorSetLayoutBase bindings[i].binding = i; bindings[i].type = type; bindings[i].createFlags = SBinding::E_CREATE_FLAGS::ECF_NONE; - bindings[i].stageFlags = stageAccessFlags ? stageAccessFlags[i]:asset::IShader::ESS_ALL_OR_LIBRARY; + + bindings[i].stageFlags = stageAccessFlags ? stageAccessFlags[i]:hlsl::ShaderStage::ESS_ALL_OR_LIBRARY; bindings[i].count = counts ? counts[i]:1u; bindings[i].samplers = nullptr; } @@ -364,7 +365,7 @@ class IDescriptorSetLayout : public IDescriptorSetLayoutBase for (uint32_t b = 0u; b < bindingCnt; ++b) { auto bindingNumber = m_descriptorRedirects[t].m_bindingNumbers[b]; - CBindingRedirect::template binding_number_t otherBindingNumber(CBindingRedirect::Invalid); + CBindingRedirect::binding_number_t otherBindingNumber(CBindingRedirect::Invalid); // TODO: std::find instead? 
for (uint32_t ob = 0u; ob < otherBindingCnt; ++ob) { diff --git a/include/nbl/asset/IFramebuffer.h b/include/nbl/asset/IFramebuffer.h index 9c78fe1e42..4f4abb89da 100644 --- a/include/nbl/asset/IFramebuffer.h +++ b/include/nbl/asset/IFramebuffer.h @@ -121,7 +121,7 @@ class IFramebuffer return true; // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkFramebufferCreateInfo.html#VUID-VkFramebufferCreateInfo-pAttachments-00884 - if (viewParams.components!=ImageViewType::SComponentMapping()) + if (viewParams.components!=typename ImageViewType::SComponentMapping()) return true; // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkFramebufferCreateInfo.html#VUID-VkFramebufferCreateInfo-flags-04533 diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index c59ad51ca9..5b445afae5 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -88,78 +88,34 @@ class IGraphicsPipeline : public IPipeline, public IGraphics using renderpass_t = RenderpassType; public: - struct SCreationParams : IPipeline::SCreationParams - { - protected: - using SpecInfo = IPipelineBase::SShaderSpecInfo; - template - inline bool impl_valid(ExtraLambda&& extra) const - { - if (!IPipeline::SCreationParams::layout) - return false; - - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-dynamicRendering-06576 - if (!renderpass || cached.subpassIx>=renderpass->getSubpassCount()) - return false; - - // TODO: check rasterization samples, etc. - //rp->getCreationParameters().subpasses[i] - - core::bitflag stagePresence = {}; - for (const auto info : shaders) - if (info.shader) - { - if (!extra(info)) - return false; - const auto stage = info.stage; - if (stage>hlsl::ShaderStage::ESS_FRAGMENT) - return false; - if (stagePresence.hasFlags(stage)) - return false; - stagePresence |= stage; - } - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-stage-02096 - if (!stagePresence.hasFlags(hlsl::ShaderStage::ESS_VERTEX)) - return false; - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00729 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00730 - if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_CONTROL)!=stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)) - return false; - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-08888 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-topology-08889 - if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)!=(cached.primitiveAssembly.primitiveType==EPT_PATCH_LIST)) - return false; - - return true; - } - - public: - inline bool valid() const - { - return impl_valid([](const SpecInfo& info)->bool - { - if (!info.valid()) - return false; - return false; - }); - } - - std::span shaders = {}; - SCachedCreationParams cached = {}; - renderpass_t* renderpass = nullptr; - }; - inline const SCachedCreationParams& getCachedCreationParams() const {return m_params;} - inline const renderpass_t* 
getRenderpass() const {return m_renderpass.get();} + + static inline bool hasRequiredStages(const core::bitflag& stagePresence, E_PRIMITIVE_TOPOLOGY primitiveType) + { + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-stage-02096 + if (!stagePresence.hasFlags(hlsl::ShaderStage::ESS_VERTEX)) + return false; + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00729 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00730 + if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_CONTROL)!=stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)) + return false; + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-08888 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-topology-08889 + if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)!=(primitiveType==asset::EPT_PATCH_LIST)) + return false; + return true; + } + protected: - explicit IGraphicsPipeline(const SCreationParams& _params) : - IPipeline(core::smart_refctd_ptr(_params.layout)), - m_params(_params.cached), m_renderpass(core::smart_refctd_ptr(_params.renderpass)) {} + explicit IGraphicsPipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams, renderpass_t* renderpass) : + IPipeline(core::smart_refctd_ptr(layout)), + m_params(cachedParams), m_renderpass(core::smart_refctd_ptr(renderpass)) + {} - SCachedCreationParams m_params; - core::smart_refctd_ptr m_renderpass; + SCachedCreationParams m_params = {}; + core::smart_refctd_ptr m_renderpass = nullptr; }; } diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index 036a684729..eb54542403 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -27,249 +27,113 @@ namespace nbl::asset */ class IPipelineBase { - public: - struct SCreationParams - { - protected: - // This is not public to make sure that different pipelines only get the enums they support - enum class FLAGS : uint64_t - { - NONE = 0, // disallowed in maintanance5 - DISABLE_OPTIMIZATIONS = 1<<0, - ALLOW_DERIVATIVES = 1<<1, - - // I can just derive this - //DERIVATIVE = 1<<2, + public: + enum class CreationFlags : uint64_t + { + NONE = 0, // disallowed in maintanance5 + DISABLE_OPTIMIZATIONS = 1 << 0, + ALLOW_DERIVATIVES = 1 << 1, + + // I can just derive this + //DERIVATIVE = 1<<2, + + // Graphics Pipelines only + //VIEW_INDEX_FROM_DEVICE_INDEX = 1<<3, + + // Compute Pipelines only + //DISPATCH_BASE = 1<<4, + + // This is for NV-raytracing extension. Now this is done via IDeferredOperation + //DEFER_COMPILE_NV = 1<<5, + + // We use Renderdoc to take care of this for us, + // we won't be parsing the statistics and internal representation ourselves. 
+ //CAPTURE_STATISTICS = 1<<6, + //CAPTURE_INTERNAL_REPRESENTATIONS = 1<<7, + + // Will soon be deprecated due to + // https://github.com/Devsh-Graphics-Programming/Nabla/issues/854 + FAIL_ON_PIPELINE_COMPILE_REQUIRED = 1 << 8, + EARLY_RETURN_ON_FAILURE = 1 << 9, + + // Will be exposed later with the IPipelineLibrary asset implementation + // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 + //LINK_TIME_OPTIMIZATION = 1<<10, + + // Won't be exposed because we'll introduce Libraries as a separate object/asset-type + // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 + //CREATE_LIBRARY = 1<<11, + + // Ray Tracing Pipelines only + //SKIP_BUILT_IN_PRIMITIVES = 1<<12, + //SKIP_AABBS = 1<<13, + //NO_NULL_ANY_HIT_SHADERS = 1<<14, + //NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, + //NO_NULL_MISS_SHADERS = 1<<16, + //NO_NULL_INTERSECTION_SHADERS = 1<<17, + + // There is a new Device Generated Commands extension with its own flag that will deprecate this + //INDIRECT_BINDABLE_NV = 1<<18, + + // Ray Tracing Pipelines only + // For debug tools + //RAY_TRACING_SHADER_GROUP_HANDLE_CAPTURE_REPLAY_BIT_KHR = 1<<19, + + // Ray Tracing Pipelines only + //ALLOW_MOTION = 1<<20, + + // Graphics Pipelineonly (we don't support subpass shading) + //RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR = 1<<21, + //RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT = 1<<22, + + // Will be exposed later with the IPipelineLibrary asset implementation + // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 + //RETAIN_LINK_TIME_OPTIMIZATION_INFO = 1<<23, + + // Ray Tracing Pipelines only + //RAY_TRACING_OPACITY_MICROMAP_BIT_EXT = 1<<24, + + // Not supported yet, and we will move to dynamic rendering, so this might never be supported + //COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT = 1<<25, + //DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT = 1<<26, + + // Not Supported Yet + //NO_PROTECTED_ACCESS=1<<27, + //RAY_TRACING_DISPLACEMENT_MICROMAP_BIT_NV = 1<<28, + //DESCRIPTOR_VUFFER_BIT=1<<29, + //PROTECTED_ACCESS_ONLY=1<<30, + }; + using FLAGS = CreationFlags; + + // Nabla requires device's reported subgroup size to be between 4 and 128 + enum class SUBGROUP_SIZE : uint8_t + { + // No constraint but probably means `gl_SubgroupSize` is Dynamically Uniform + UNKNOWN = 0, + // Allows the Subgroup Uniform `gl_SubgroupSize` to be non-Dynamically Uniform and vary between Device's min and max + VARYING = 1, + // The rest we encode as log2(x) of the required value + REQUIRE_4 = 2, + REQUIRE_8 = 3, + REQUIRE_16 = 4, + REQUIRE_32 = 5, + REQUIRE_64 = 6, + REQUIRE_128 = 7 + }; - // Graphics Pipelines only - //VIEW_INDEX_FROM_DEVICE_INDEX = 1<<3, - - // Compute Pipelines only - //DISPATCH_BASE = 1<<4, - - // This is for NV-raytracing extension. Now this is done via IDeferredOperation - //DEFER_COMPILE_NV = 1<<5, - - // We use Renderdoc to take care of this for us, - // we won't be parsing the statistics and internal representation ourselves. 
- //CAPTURE_STATISTICS = 1<<6, - //CAPTURE_INTERNAL_REPRESENTATIONS = 1<<7, - - // Will soon be deprecated due to - // https://github.com/Devsh-Graphics-Programming/Nabla/issues/854 - FAIL_ON_PIPELINE_COMPILE_REQUIRED = 1<<8, - EARLY_RETURN_ON_FAILURE = 1<<9, - - // Will be exposed later with the IPipelineLibrary asset implementation - // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 - //LINK_TIME_OPTIMIZATION = 1<<10, - - // Won't be exposed because we'll introduce Libraries as a separate object/asset-type - // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 - //CREATE_LIBRARY = 1<<11, - - // Ray Tracing Pipelines only - //SKIP_BUILT_IN_PRIMITIVES = 1<<12, - //SKIP_AABBS = 1<<13, - //NO_NULL_ANY_HIT_SHADERS = 1<<14, - //NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, - //NO_NULL_MISS_SHADERS = 1<<16, - //NO_NULL_INTERSECTION_SHADERS = 1<<17, - - // There is a new Device Generated Commands extension with its own flag that will deprecate this - //INDIRECT_BINDABLE_NV = 1<<18, - - // Ray Tracing Pipelines only - // For debug tools - //RAY_TRACING_SHADER_GROUP_HANDLE_CAPTURE_REPLAY_BIT_KHR = 1<<19, - - // Ray Tracing Pipelines only - //ALLOW_MOTION = 1<<20, - - // Graphics Pipelineonly (we don't support subpass shading) - //RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR = 1<<21, - //RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT = 1<<22, - - // Will be exposed later with the IPipelineLibrary asset implementation - // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 - //RETAIN_LINK_TIME_OPTIMIZATION_INFO = 1<<23, - - // Ray Tracing Pipelines only - //RAY_TRACING_OPACITY_MICROMAP_BIT_EXT = 1<<24, - - // Not supported yet, and we will move to dynamic rendering, so this might never be supported - //COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT = 1<<25, - //DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT = 1<<26, - - // Not Supported Yet - //NO_PROTECTED_ACCESS=1<<27, - //RAY_TRACING_DISPLACEMENT_MICROMAP_BIT_NV = 1<<28, - //DESCRIPTOR_VUFFER_BIT=1<<29, - //PROTECTED_ACCESS_ONLY=1<<30, - }; - }; - - /* - Specialization info contains things such as entry point to a shader, - specialization map entry, required subgroup size, etc. for a blob of SPIR-V - - It also handles Specialization Constants. - - In Vulkan, all shaders get halfway-compiled into SPIR-V and - then then lowered (compiled) into the HW ISA by the Vulkan driver. - Normally, the half-way compile folds all constant values - and optimizes the code that uses them. - - But, it would be nice every so often to have your Vulkan - program sneak into the halfway-compiled SPIR-V binary and - manipulate some constants at runtime. This is what - Specialization Constants are for. - - So A Specialization Constant is a way of injecting an integer - constant into a halfway-compiled version of a shader right - before the lowering and linking when creating a pipeline. - - Without Specialization Constants, you would have to commit - to a final value before the SPIR-V compilation - */ - struct SShaderSpecInfo final - { - //! Structure specifying a specialization map entry - /* - Note that if specialization constant ID is used - in a shader, \bsize\b and \boffset'b must match - to \isuch an ID\i accordingly. 
- - By design the API satisfies: - https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-offset-00773 - https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-pMapEntries-00774 - */ - //!< The ID of the specialization constant in SPIR-V. If it isn't used in the shader, the map entry does not affect the behavior of the pipeline. - using spec_constant_id_t = uint32_t; - struct SSpecConstantValue - { - const void* data = nullptr; - //!< The byte size of the specialization constant value within the supplied data buffer. - uint32_t size = 0; - - inline operator bool() const {return data&&size;} - - auto operator<=>(const SSpecConstantValue&) const = default; - }; - inline SSpecConstantValue getSpecializationByteValue(const spec_constant_id_t _specConstID) const - { - if (!entries) - return { nullptr,0u }; - - const auto found = entries->find(_specConstID); - if (found != entries->end() && bool(found->second)) - return found->second; - else - return { nullptr,0u }; - } - - // Nabla requires device's reported subgroup size to be between 4 and 128 - enum class SUBGROUP_SIZE : uint8_t - { - // No constraint but probably means `gl_SubgroupSize` is Dynamically Uniform - UNKNOWN = 0, - // Allows the Subgroup Uniform `gl_SubgroupSize` to be non-Dynamically Uniform and vary between Device's min and max - VARYING = 1, - // The rest we encode as log2(x) of the required value - REQUIRE_4 = 2, - REQUIRE_8 = 3, - REQUIRE_16 = 4, - REQUIRE_32 = 5, - REQUIRE_64 = 6, - REQUIRE_128 = 7 - }; - - // - static constexpr int32_t INVALID_SPEC_INFO = -1; - // Returns negative on failure, otherwise the size of the buffer required to reserve for the spec constant data - inline int32_t valid() const - { - if (!shader || hlsl::bitCount(stage)!=1) - return INVALID_SPEC_INFO; - - // Impossible to check: https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pName-00707 - if (entryPoint.empty()) - return INVALID_SPEC_INFO; - - // Shader stages already checked for validity w.r.t. 
features enabled, during unspec shader creation, only check: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-08988 - if (requireFullSubgroups) - switch (stage) - { - case hlsl::ShaderStage::ESS_COMPUTE: [[fallthrough]]; - case hlsl::ShaderStage::ESS_TASK: [[fallthrough]]; - case hlsl::ShaderStage::ESS_MESH: - break; - default: - return INVALID_SPEC_INFO; - break; - } - // Impossible to efficiently check anything from: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-maxClipDistances-00708 - // to: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-06686 - // and from: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02756 - // to: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-module-08987 - - int64_t specData = 0; - if (entries) - for (const auto& entry : *entries) - { - if (!entry.second) - return INVALID_SPEC_INFO; - specData += entry.second.size; - } - if (specData>0x7fffffff) - return INVALID_SPEC_INFO; - return static_cast(specData); - } - - using spec_constant_map_t = core::unordered_map; - - const IShader* shader = nullptr; - // A name of the function where the entry point of an shader executable begins. It's often "main" function. - std::string_view entryPoint = {}; - // stage must be set - hlsl::ShaderStage stage = hlsl::ShaderStage::ESS_UNKNOWN; - // there's some padding here - SUBGROUP_SIZE requiredSubgroupSize : 3 = SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement - // Valid only for Compute, Mesh and Task shaders - uint8_t requireFullSubgroups : 1 = false; - // Container choice implicitly satisfies: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 - const spec_constant_map_t* entries = nullptr; - // By requiring Nabla Core Profile features we implicitly satisfy: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 - // Also because our API is sane, it satisfies the following by construction: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 - }; }; template class IPipeline : public IPipelineBase { - public: - // For now, due to API design we implicitly satisfy a bunch of VUIDs - struct SCreationParams : protected IPipelineBase::SCreationParams - { - public: - const PipelineLayout* layout = nullptr; - }; + public: + inline const PipelineLayout* getLayout() const {return m_layout.get();} - inline const PipelineLayout* getLayout() const {return m_layout.get();} + protected: - protected: - inline IPipeline(core::smart_refctd_ptr&& _layout) - : m_layout(std::move(_layout)) {} + inline IPipeline(core::smart_refctd_ptr&& _layout) + : m_layout(std::move(_layout)) {} - core::smart_refctd_ptr m_layout; + core::smart_refctd_ptr 
m_layout; }; } diff --git a/include/nbl/asset/IPreHashed.h b/include/nbl/asset/IPreHashed.h index 4bc5ca5dcd..f7252211e1 100644 --- a/include/nbl/asset/IPreHashed.h +++ b/include/nbl/asset/IPreHashed.h @@ -39,84 +39,61 @@ class IPreHashed : public IAsset discardContent_impl(); } - static inline void discardDependantsContents(const std::span roots) - { - struct stack_entry_t - { - IAsset* asset; - size_t childCount = 0; - size_t childrenVisited = 0; - }; - core::stack stack; - core::unordered_set alreadyVisited; - auto push = [&stack,&alreadyVisited](IAsset* node) -> void - { - if (!node) - return; - const auto [dummy,inserted] = alreadyVisited.insert(node); - if (inserted) - stack.push({.asset=node,.childCount=node->getDependantCount()}); - }; - for (const auto& root : roots) - push(root); - while (!stack.empty()) - { - auto& entry = stack.top(); - if (entry.childrenVisitedgetDependant(entry.childrenVisited++); - push(dep); - } - else - { - // post order traversal does discard - auto* isPrehashed = dynamic_cast(entry.asset); - if (isPrehashed) - isPrehashed->discardContent(); - stack.pop(); - } - } - } - static inline bool anyDependantDiscardedContents(const IAsset* root) - { - struct stack_entry_t - { - const IAsset* asset; - size_t childCount = 0; - size_t childrenVisited = 0; - }; - core::stack stack; - core::unordered_set alreadyVisited; - auto push = [&stack,&alreadyVisited](const IAsset* node) -> bool - { - if (!node) - return false; - const auto [dummy,inserted] = alreadyVisited.insert(node); - if (inserted) - { - auto* isPrehashed = dynamic_cast(node); - if (isPrehashed && isPrehashed->missingContent()) - return true; - stack.push({.asset=node,.childCount=node->getDependantCount()}); - } - return false; - }; - if (push(root)) - return true; - while (!stack.empty()) - { - auto& entry = stack.top(); - if (entry.childrenVisitedgetDependant(entry.childrenVisited++); - if (push(dep)) - return true; - } - else - stack.pop(); - } - return false; - } + static inline void discardDependantsContents(const std::span roots) + { + core::vector stack; + core::unordered_set alreadyVisited; // whether we have push the node to the stack + auto push = [&stack,&alreadyVisited](IAsset* node) -> bool + { + const auto [dummy,inserted] = alreadyVisited.insert(node); + if (inserted) + stack.push_back(node); + return true; + }; + for (const auto& root : roots) + push(root); + while (!stack.empty()) + { + auto* entry = stack.back(); + stack.pop_back(); + entry->visitDependents(push); + // pre order traversal does discard + auto* isPrehashed = dynamic_cast(entry); + if (isPrehashed) + isPrehashed->discardContent(); + } + } + static inline bool anyDependantDiscardedContents(const IAsset* root) + { + core::vector stack; + core::unordered_set alreadyVisited; // whether we have push the node to the stack + bool result = false; + auto push = [&stack,&alreadyVisited,&result](const IAsset* node) -> bool + { + const auto [dummy,inserted] = alreadyVisited.insert(node); + if (inserted) + { + auto* isPrehashed = dynamic_cast(node); + if (isPrehashed && isPrehashed->missingContent()) + { + stack.clear(); + result = true; + return false; + } + stack.push_back(node); + } + return true; + }; + if (!push(root)) + return true; + while (!stack.empty()) + { + auto* entry = stack.back(); + stack.pop_back(); + entry->visitDependents(push); + } + return result; + } protected: inline IPreHashed() = default; diff --git a/include/nbl/asset/IRayTracingPipeline.h b/include/nbl/asset/IRayTracingPipeline.h index 0bc2d68653..b97d8d7002 
100644 --- a/include/nbl/asset/IRayTracingPipeline.h +++ b/include/nbl/asset/IRayTracingPipeline.h @@ -14,35 +14,6 @@ namespace nbl::asset class IRayTracingPipelineBase : public virtual core::IReferenceCounted { public: - struct SShaderGroupsParams - { - struct SIndex - { - constexpr static inline uint32_t Unused = 0xffFFffFFu; - uint32_t index = Unused; - }; - - struct SHitGroup - { - uint32_t closestHit = SIndex::Unused; - uint32_t anyHit = SIndex::Unused; - uint32_t intersection = SIndex::Unused; - }; - - SIndex raygen; - std::span misses; - std::span hits; - std::span callables; - - inline uint32_t getShaderGroupCount() const - { - return 1 + hits.size() + misses.size() + callables.size(); - } - - }; - using SGeneralShaderGroup = SShaderGroupsParams::SIndex; - using SHitShaderGroup = SShaderGroupsParams::SHitGroup; - struct SCachedCreationParams final { uint32_t maxRecursionDepth : 6 = 0; @@ -53,152 +24,36 @@ class IRayTracingPipelineBase : public virtual core::IReferenceCounted template class IRayTracingPipeline : public IPipeline, public IRayTracingPipelineBase { - using base_creation_params_t = IPipeline::SCreationParams; public: - using SGeneralShaderGroupContainer = core::smart_refctd_dynamic_array; - using SHitShaderGroupContainer = core::smart_refctd_dynamic_array; - - struct SCreationParams : base_creation_params_t + #define base_flag(F) static_cast(IPipelineBase::FLAGS::F) + enum class CreationFlags : uint64_t { - public: - #define base_flag(F) static_cast(base_creation_params_t::FLAGS::F) - enum class FLAGS : uint64_t - { - NONE = base_flag(NONE), - DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), - ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), - FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), - EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), - SKIP_BUILT_IN_PRIMITIVES = 1<<12, - SKIP_AABBS = 1<<13, - NO_NULL_ANY_HIT_SHADERS = 1<<14, - NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, - NO_NULL_MISS_SHADERS = 1<<16, - NO_NULL_INTERSECTION_SHADERS = 1<<17, - ALLOW_MOTION = 1<<20, - }; - #undef base_flag - - protected: - using SpecInfo = IPipelineBase::SShaderSpecInfo; - template - inline bool impl_valid(ExtraLambda&& extra) const - { - if (!IPipeline::SCreationParams::layout) - return false; - - for (const auto info : shaders) - { - if (info.shader) - { - if (!extra(info)) - return false; - const auto stage = info.stage; - if ((stage & ~IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING) != 0) - return false; - if (!std::has_single_bit>(stage)) - return false; - } - else - { - // every shader must not be null. use SIndex::Unused to represent unused shader. 
- return false; - } - } - - auto getShaderStage = [this](size_t index) -> IShader::E_SHADER_STAGE - { - return shaders[index].stage; - }; - - auto isValidShaderIndex = [this, getShaderStage](size_t index, IShader::E_SHADER_STAGE expectedStage, bool is_unused_shader_forbidden) -> bool - { - if (index == SShaderGroupsParams::SIndex::Unused) - return !is_unused_shader_forbidden; - if (index >= shaders.size()) - return false; - if (getShaderStage(index) != expectedStage) - return false; - return true; - }; - - if (!isValidShaderIndex(shaderGroups.raygen.index, IShader::E_SHADER_STAGE::ESS_RAYGEN, true)) - { - return false; - } - - for (const auto& shaderGroup : shaderGroups.hits) - { - // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03470 - if (!isValidShaderIndex(shaderGroup.anyHit, - IShader::E_SHADER_STAGE::ESS_ANY_HIT, - bool(flags & FLAGS::NO_NULL_ANY_HIT_SHADERS))) - return false; - - // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03471 - if (!isValidShaderIndex(shaderGroup.closestHit, - IShader::E_SHADER_STAGE::ESS_CLOSEST_HIT, - bool(flags & FLAGS::NO_NULL_CLOSEST_HIT_SHADERS))) - return false; - - if (!isValidShaderIndex(shaderGroup.intersection, - IShader::E_SHADER_STAGE::ESS_INTERSECTION, - false)) - return false; - } - - for (const auto& shaderGroup : shaderGroups.misses) - { - if (!isValidShaderIndex(shaderGroup.index, - IShader::E_SHADER_STAGE::ESS_MISS, - false)) - return false; - } - - for (const auto& shaderGroup : shaderGroups.callables) - { - if (!isValidShaderIndex(shaderGroup.index, IShader::E_SHADER_STAGE::ESS_CALLABLE, false)) - return false; - } - return true; - } - - public: - inline bool valid() const - { - return impl_valid([](const SpecInfo& info)->bool - { - if (!info.valid()) - return false; - return false; - }); - } - - std::span shaders = {}; - SShaderGroupsParams shaderGroups; - SCachedCreationParams cached = {}; - // TODO: Could guess the required flags from SPIR-V introspection of declared caps - core::bitflag flags = FLAGS::NONE; + NONE = base_flag(NONE), + DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), + ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), + FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), + EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), + SKIP_BUILT_IN_PRIMITIVES = 1<<12, + SKIP_AABBS = 1<<13, + NO_NULL_ANY_HIT_SHADERS = 1<<14, + NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, + NO_NULL_MISS_SHADERS = 1<<16, + NO_NULL_INTERSECTION_SHADERS = 1<<17, + ALLOW_MOTION = 1<<20, }; + #undef base_flag + using FLAGS = CreationFlags; inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } protected: - explicit IRayTracingPipeline(const SCreationParams& _params) : - IPipeline(core::smart_refctd_ptr(_params.layout)), - m_params(_params.cached), - m_raygenShaderGroup(_params.shaderGroups.raygen), - m_missShaderGroups(core::make_refctd_dynamic_array(_params.shaderGroups.misses)), - m_hitShaderGroups(core::make_refctd_dynamic_array(_params.shaderGroups.hits)), - m_callableShaderGroups(core::make_refctd_dynamic_array(_params.shaderGroups.callables)) + explicit IRayTracingPipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : + IPipeline(core::smart_refctd_ptr(layout)), + m_params(cachedParams) {} SCachedCreationParams m_params; - SGeneralShaderGroup m_raygenShaderGroup; - SGeneralShaderGroupContainer m_missShaderGroups; - 
SHitShaderGroupContainer m_hitShaderGroups; - SGeneralShaderGroupContainer m_callableShaderGroups; }; diff --git a/include/nbl/asset/IRenderpass.h b/include/nbl/asset/IRenderpass.h index 30be5c99e7..ce41e35573 100644 --- a/include/nbl/asset/IRenderpass.h +++ b/include/nbl/asset/IRenderpass.h @@ -81,11 +81,12 @@ class NBL_API2 IRenderpass { bool valid() const; }; + // The arrays pointed to by this array must be terminated by `DepthStencilAttachmentsEnd` value, which implicitly satisfies a few VUIDs - constexpr static inline SDepthStencilAttachmentDescription DepthStencilAttachmentsEnd = {}; + static const SDepthStencilAttachmentDescription DepthStencilAttachmentsEnd; // have to initialize out of line because of https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88165 const SDepthStencilAttachmentDescription* depthStencilAttachments = &DepthStencilAttachmentsEnd; // The arrays pointed to by this array must be terminated by `ColorAttachmentsEnd` value, which implicitly satisfies a few VUIDs - constexpr static inline SColorAttachmentDescription ColorAttachmentsEnd = {}; + static const SColorAttachmentDescription ColorAttachmentsEnd; // have to initialize out of line because of https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88165 const SColorAttachmentDescription* colorAttachments = &ColorAttachmentsEnd; struct SSubpassDescription final @@ -199,7 +200,7 @@ class NBL_API2 IRenderpass SColorAttachmentsRef colorAttachments[MaxColorAttachments] = {}; // The arrays pointed to by this array must be terminated by `InputAttachmentsEnd` value - constexpr static inline SInputAttachmentRef InputAttachmentsEnd = {}; + static const SInputAttachmentRef InputAttachmentsEnd; const SInputAttachmentRef* inputAttachments = &InputAttachmentsEnd; struct SPreserveAttachmentRef @@ -232,7 +233,7 @@ class NBL_API2 IRenderpass // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSubpassDescription2.html#VUID-VkSubpassDescription2-pipelineBindPoint-04953 //E_PIPELINE_BIND_POINT pipelineBindPoint : 2 = EPBP_GRAPHICS; }; - constexpr static inline SSubpassDescription SubpassesEnd = {}; + static const SSubpassDescription SubpassesEnd; const SSubpassDescription* subpasses = &SubpassesEnd; struct SSubpassDependency final @@ -258,7 +259,7 @@ class NBL_API2 IRenderpass bool valid() const; }; // The arrays pointed to by this array must be terminated by `DependenciesEnd` value - constexpr static inline SSubpassDependency DependenciesEnd = {}; + static const SSubpassDependency DependenciesEnd; const SSubpassDependency* dependencies = &DependenciesEnd; @@ -379,6 +380,12 @@ class NBL_API2 IRenderpass uint32_t m_loadOpColorAttachmentEnd = ~0u; }; +constexpr inline IRenderpass::SCreationParams::SDepthStencilAttachmentDescription IRenderpass::SCreationParams::DepthStencilAttachmentsEnd = {}; +constexpr inline IRenderpass::SCreationParams::SColorAttachmentDescription IRenderpass::SCreationParams::ColorAttachmentsEnd = {}; +constexpr inline IRenderpass::SCreationParams::SSubpassDescription::SInputAttachmentRef IRenderpass::SCreationParams::SSubpassDescription::InputAttachmentsEnd = {}; +constexpr inline IRenderpass::SCreationParams::SSubpassDescription IRenderpass::SCreationParams::SubpassesEnd = {}; +constexpr inline IRenderpass::SCreationParams::SSubpassDependency IRenderpass::SCreationParams::DependenciesEnd = {}; + inline bool IRenderpass::compatible(const IRenderpass* other) const { // If you find yourself spending a lot of time here in your profile, go ahead and implement a precomputed hash and store it in the renderpass 
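// --- Hedged usage sketch (editorial addition, not part of the PR's diff hunks) ---
// The hunks above turn the in-class `constexpr static inline` terminator sentinels of
// IRenderpass::SCreationParams into `static const` declarations with out-of-line
// `constexpr inline` definitions, working around GCC bug 88165. Their role is unchanged:
// every attachment/subpass/dependency array handed to SCreationParams must still end with
// the matching sentinel. A minimal illustration, assuming only the names visible in the
// hunks (`subpasses`, `SubpassesEnd`); the surrounding setup is hypothetical:
//
//   nbl::asset::IRenderpass::SCreationParams params = {};
//   nbl::asset::IRenderpass::SCreationParams::SSubpassDescription subpasses[2] = {};
//   // ... fill subpasses[0] with the real subpass description ...
//   subpasses[1] = nbl::asset::IRenderpass::SCreationParams::SubpassesEnd; // terminator sentinel
//   params.subpasses = subpasses;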
@@ -707,7 +714,7 @@ inline bool IRenderpass::SCreationParams::SSubpassDescription::SDepthStencilAtta template inline bool IRenderpass::SCreationParams::SSubpassDescription::SRenderAttachmentsRef::valid(const typename attachment_ref_t::description_t* descs, const uint32_t attachmentCount) const { - if (!render.valid(descs,attachmentCount) || !resolve.valid(descs,attachmentCount)) + if (!render.template valid(descs,attachmentCount) || !resolve.template valid(descs,attachmentCount)) return false; const bool renderUsed = render.used(); if (resolve.used()) diff --git a/include/nbl/asset/IRenderpassIndependentPipeline.h b/include/nbl/asset/IRenderpassIndependentPipeline.h index 7f33b6abc4..feeaff7c99 100644 --- a/include/nbl/asset/IRenderpassIndependentPipeline.h +++ b/include/nbl/asset/IRenderpassIndependentPipeline.h @@ -28,11 +28,6 @@ class IRenderpassIndependentPipeline SRasterizationParams rasterization = {}; SBlendParams blend = {}; }; - struct SCreationParams - { - std::span shaders = {}; - SCachedCreationParams cached = {}; - }; inline const SCachedCreationParams& getCachedCreationParams() const {return m_cachedParams;} diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index a6dab09b54..96ff73f3f0 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -27,7 +27,7 @@ namespace nbl::asset The purpose for the class is for storing raw HLSL code to be compiled or already compiled (but unspecialized) SPIR-V code. */ -class IShader : public IAsset +class IShader final : public IAsset { public: enum class E_CONTENT_TYPE : uint8_t @@ -50,9 +50,6 @@ class IShader : public IAsset constexpr static inline auto AssetType = ET_SHADER; inline E_TYPE getAssetType() const override { return AssetType; } - // - inline size_t getDependantCount() const override { return 1; } - // inline core::smart_refctd_ptr clone(uint32_t _depth=~0u) const override { @@ -90,17 +87,29 @@ class IShader : public IAsset // TODO: `void setContent(core::smart_refctd_ptr&&,const E_CONTENT_TYPE)` + inline bool valid() const override + { + if (!m_code) return false; + if (m_contentType == E_CONTENT_TYPE::ECT_UNKNOWN) return false; + return true; + } + // alias for legacy reasons using E_SHADER_STAGE = hlsl::ShaderStage; protected: virtual ~IShader() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return m_code.get();} - std::string m_filepathHint; core::smart_refctd_ptr m_code; E_CONTENT_TYPE m_contentType; + + private: + + inline void visitDependents_impl(std::function visit) const override + { + if (!visit(m_code.get())) return; + } }; } diff --git a/include/nbl/asset/filters/CBlitImageFilter.h b/include/nbl/asset/filters/CBlitImageFilter.h index 1dbc7809ba..f228fea325 100644 --- a/include/nbl/asset/filters/CBlitImageFilter.h +++ b/include/nbl/asset/filters/CBlitImageFilter.h @@ -464,7 +464,7 @@ class CBlitImageFilter : auto phaseCount = IBlitUtilities::getPhaseCount(inExtentLayerCount.xyz, outExtentLayerCount.xyz, inImageType); phaseCount = hlsl::max(phaseCount,hlsl::uint32_t3(1,1,1)); - const auto axisOffsets = blit_utils_t::template getScaledKernelPhasedLUTAxisOffsets(phaseCount,real_window_size); + const auto axisOffsets = blit_utils_t::getScaledKernelPhasedLUTAxisOffsets(phaseCount,real_window_size); constexpr auto MaxAxisCount = 3; lut_value_t* scaledKernelPhasedLUTPixel[MaxAxisCount]; for (auto i = 0; i < MaxAxisCount; ++i) diff --git a/include/nbl/asset/filters/kernels/WeightFunctions.h b/include/nbl/asset/filters/kernels/WeightFunctions.h index 
bb0b8fb9b4..af2782dfac 100644 --- a/include/nbl/asset/filters/kernels/WeightFunctions.h +++ b/include/nbl/asset/filters/kernels/WeightFunctions.h @@ -337,12 +337,12 @@ class CWeightFunction1D final : public impl::IWeightFunction1Dscale(base_t::value_t(1)/stretchFactor); + this->scale(typename base_t::value_t(1)/stretchFactor); } inline base_t::value_t weight(const float x) const { - return static_cast(this->getTotalScale()*function_t::weight(x*this->getInvStretch())); + return static_cast(this->getTotalScale()*function_t::template weight(x*this->getInvStretch())); } // Integral of `weight(x) dx` from -INF to +INF diff --git a/include/nbl/asset/utils/CSPIRVIntrospector.h b/include/nbl/asset/utils/CSPIRVIntrospector.h index 3d6455e020..0d7d678549 100644 --- a/include/nbl/asset/utils/CSPIRVIntrospector.h +++ b/include/nbl/asset/utils/CSPIRVIntrospector.h @@ -208,7 +208,13 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable // `memberStrides[i]` only relevant if `memberTypes[i]->isArray()` inline ptr_t memberStrides() const {return memberOffsets()+memberCount;} using member_matrix_info_t = MatrixInfo; - inline ptr_t memberMatrixInfos() const {return reinterpret_cast&>(memberStrides()+memberCount); } + inline ptr_t memberMatrixInfos() const + { + auto t = memberStrides() + memberCount; + + return reinterpret_cast&>(t); + + } constexpr static inline size_t StoragePerMember = sizeof(member_type_t)+sizeof(member_name_t)+sizeof(member_size_t)+sizeof(member_offset_t)+sizeof(member_stride_t)+sizeof(member_matrix_info_t); @@ -326,8 +332,8 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable template inline std::enable_if_t isLastMemberRuntimeSized() const { - if (type->memberCount) - return type->memberTypes()[type->memberCount-1].count.front().isRuntimeSized(); + if (this->type->memberCount) + return this->type->memberTypes()[this->type->memberCount-1].count.front().isRuntimeSized(); return false; } template @@ -335,9 +341,9 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable { if (isLastMemberRuntimeSized()) { - const auto& lastMember = type->memberTypes()[type->memberCount-1]; + const auto& lastMember = this->type->memberTypes()[this->type->memberCount-1]; assert(!lastMember.count.front().isSpecConstantID); - return sizeWithoutLastMember+lastMemberElementCount*type->memberStrides()[type->memberCount-1]; + return sizeWithoutLastMember+lastMemberElementCount* this->type->memberStrides()[this->type->memberCount-1]; } return sizeWithoutLastMember; } @@ -582,7 +588,7 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable } // returns true if successfully added all the info to self, false if incompatible with what's already in our pipeline or incomplete (e.g. missing spec constants) - bool merge(const CStageIntrospectionData* stageData, const IPipelineBase::SShaderSpecInfo::spec_constant_map_t* specConstants=nullptr); + bool merge(const CStageIntrospectionData* stageData, const ICPUPipelineBase::SShaderSpecInfo::spec_constant_map_t* specConstants=nullptr); // core::smart_refctd_dynamic_array createPushConstantRangesFromIntrospection(core::smart_refctd_ptr& introspection); @@ -643,7 +649,7 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable } //! 
creates pipeline for a single IShader - core::smart_refctd_ptr createApproximateComputePipelineFromIntrospection(const IPipelineBase::SShaderSpecInfo& info, core::smart_refctd_ptr&& layout=nullptr); + core::smart_refctd_ptr createApproximateComputePipelineFromIntrospection(const ICPUPipelineBase::SShaderSpecInfo& info, core::smart_refctd_ptr&& layout=nullptr); #if 0 // wait until Renderpass Indep completely gone and Graphics Pipeline is used in a new way && Graphics Pipeline Libraries struct CShaderStages diff --git a/include/nbl/asset/utils/ISPIRVDebloater.h b/include/nbl/asset/utils/ISPIRVEntryPointTrimmer.h similarity index 72% rename from include/nbl/asset/utils/ISPIRVDebloater.h rename to include/nbl/asset/utils/ISPIRVEntryPointTrimmer.h index f5f87956be..a2e24dabab 100644 --- a/include/nbl/asset/utils/ISPIRVDebloater.h +++ b/include/nbl/asset/utils/ISPIRVEntryPointTrimmer.h @@ -1,5 +1,5 @@ -#ifndef _NBL_ASSET_I_SPIRV_DEBLOATER_H_INCLUDED_ -#define _NBL_ASSET_I_SPIRV_DEBLOATER_H_INCLUDED_ +#ifndef _NBL_ASSET_I_SPIRV_ENTRY_POINT_TRIMMER_H_INCLUDED_ +#define _NBL_ASSET_I_SPIRV_ENTRY_POINT_TRIMMER_H_INCLUDED_ #include "nbl/core/declarations.h" @@ -10,14 +10,14 @@ namespace nbl::asset { -class ISPIRVDebloater final : public core::IReferenceCounted +class ISPIRVEntryPointTrimmer final : public core::IReferenceCounted { public: - ISPIRVDebloater(); + ISPIRVEntryPointTrimmer(); struct Result { - core::smart_refctd_ptr spirv; // nullptr if there is some entry point not found or spirv does not need to be debloated + core::smart_refctd_ptr spirv; // nullptr if there is some entry point not found or spirv does not need to be trimmed bool isSuccess; inline operator bool() const @@ -45,9 +45,9 @@ class ISPIRVDebloater final : public core::IReferenceCounted } }; - Result debloat(const ICPUBuffer* spirvBuffer, const core::set& entryPoints, system::logger_opt_ptr logger = nullptr) const; + Result trim(const ICPUBuffer* spirvBuffer, const core::set& entryPoints, system::logger_opt_ptr logger = nullptr) const; - inline core::smart_refctd_ptr debloat(const IShader* shader, const core::set& entryPoints, system::logger_opt_ptr logger = nullptr) const + inline core::smart_refctd_ptr trim(const IShader* shader, const core::set& entryPoints, system::logger_opt_ptr logger = nullptr) const { if (shader->getContentType() != IShader::E_CONTENT_TYPE::ECT_SPIRV) { @@ -55,10 +55,10 @@ class ISPIRVDebloater final : public core::IReferenceCounted return nullptr; } const auto buffer = shader->getContent(); - const auto result = debloat(buffer, entryPoints, logger); + const auto result = trim(buffer, entryPoints, logger); if (result && result.spirv.get() == nullptr) { - // when debloat does not happen return original shader + // when trim does not happen return original shader return core::smart_refctd_ptr(shader); } diff --git a/include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl index 262cb3c0c7..9088b0c7b4 100644 --- a/include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl @@ -1,7 +1,7 @@ #ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_FFT_INCLUDED_ #define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_FFT_INCLUDED_ -#include "nbl/builtin/hlsl/concepts.hlsl" +#include "nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl" #include "nbl/builtin/hlsl/fft/common.hlsl" namespace nbl @@ -17,49 +17,15 @@ namespace fft // * void set(uint32_t index, in uint32_t value); // * void workgroupExecutionAndMemoryBarrier(); 
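Note: the bespoke FFT accessor concepts below are replaced by the shared `GenericSharedMemoryAccessor` / `GenericDataAccessor` concepts, which only require templated `get`/`set` by index plus a barrier method. A CPU-side illustration of a type that would satisfy that shape (hypothetical, plain C++ rather than the HLSL concept macro machinery, and assuming word-sized values):

#include <cstdint>
#include <cstring>

struct ScratchAccessor
{
    uint32_t words[256] = {};

    // read `value` out of shared memory at `index`
    template<typename T, typename I>
    void get(const I index, T& value) { std::memcpy(&value, words + index, sizeof(T)); }

    // write `value` into shared memory at `index`
    template<typename T, typename I>
    void set(const I index, const T& value) { std::memcpy(words + index, &value, sizeof(T)); }

    // on the GPU this would be a workgroup barrier; a no-op suffices for the sketch
    void workgroupExecutionAndMemoryBarrier() {}
};

int main()
{
    ScratchAccessor accessor;
    uint32_t in = 42u, out = 0u;
    accessor.set(uint32_t(3), in);
    accessor.workgroupExecutionAndMemoryBarrier();
    accessor.get(uint32_t(3), out);
    return out == in ? 0 : 1;
}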
-#define NBL_CONCEPT_NAME FFTSharedMemoryAccessor -#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) -#define NBL_CONCEPT_TPLT_PRM_NAMES (T) -#define NBL_CONCEPT_PARAM_0 (accessor, T) -#define NBL_CONCEPT_PARAM_1 (index, uint32_t) -#define NBL_CONCEPT_PARAM_2 (val, uint32_t) -NBL_CONCEPT_BEGIN(3) -#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 -NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) -); -#undef val -#undef index -#undef accessor -#include - +template +NBL_BOOL_CONCEPT FFTSharedMemoryAccessor = concepts::accessors::GenericSharedMemoryAccessor; // The Accessor (for a small FFT) MUST provide the following methods: // * void get(uint32_t index, NBL_REF_ARG(complex_t) value); // * void set(uint32_t index, in complex_t value); -#define NBL_CONCEPT_NAME FFTAccessor -#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename) -#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(Scalar) -#define NBL_CONCEPT_PARAM_0 (accessor, T) -#define NBL_CONCEPT_PARAM_1 (index, uint32_t) -#define NBL_CONCEPT_PARAM_2 (val, complex_t) -NBL_CONCEPT_BEGIN(3) -#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 -NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set >(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get >(index, val)), is_same_v, void)) -); -#undef val -#undef index -#undef accessor -#include +template +NBL_BOOL_CONCEPT FFTAccessor = concepts::accessors::GenericDataAccessor,I>; } } diff --git a/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl new file mode 100644 index 0000000000..cc22595444 --- /dev/null +++ b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl @@ -0,0 +1,79 @@ +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_GENERIC_SHARED_DATA_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_GENERIC_SHARED_DATA_INCLUDED_ + +#include "nbl/builtin/hlsl/concepts.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace concepts +{ +namespace accessors +{ + +#define NBL_CONCEPT_NAME GenericSharedMemoryAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V)(I) +#define NBL_CONCEPT_PARAM_0 (accessor, T) +#define NBL_CONCEPT_PARAM_1 (val, V) +#define NBL_CONCEPT_PARAM_2 (index, I) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +#define NBL_CONCEPT_NAME GenericReadAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V)(I) +#define NBL_CONCEPT_PARAM_0 (accessor, T) 
+#define NBL_CONCEPT_PARAM_1 (val, V) +#define NBL_CONCEPT_PARAM_2 (index, I) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +#define NBL_CONCEPT_NAME GenericWriteAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V)(I) +#define NBL_CONCEPT_PARAM_0 (accessor, T) +#define NBL_CONCEPT_PARAM_1 (val, V) +#define NBL_CONCEPT_PARAM_2 (index, I) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +template +NBL_BOOL_CONCEPT GenericDataAccessor = GenericWriteAccessor && GenericWriteAccessor; + +} +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl new file mode 100644 index 0000000000..267342634f --- /dev/null +++ b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl @@ -0,0 +1,26 @@ +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ + +#include "nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +template +NBL_BOOL_CONCEPT ArithmeticSharedMemoryAccessor = concepts::accessors::GenericSharedMemoryAccessor; + +template +NBL_BOOL_CONCEPT ArithmeticReadOnlyDataAccessor = concepts::accessors::GenericReadAccessor; + +template +NBL_BOOL_CONCEPT ArithmeticDataAccessor = concepts::accessors::GenericDataAccessor; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl index 679fecb697..431ea625bf 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl +++ b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl @@ -602,7 +602,7 @@ struct nClamp_helper using return_t = T; static inline return_t __call(const T x, const T _min, const T _max) { - return nMin_helper::_call(nMax_helper::_call(x, _min), _max); + return nMin_helper::_call(nMax_helper::_call(x, _min), _max); } }; diff --git a/include/nbl/builtin/hlsl/memory_accessor.hlsl b/include/nbl/builtin/hlsl/memory_accessor.hlsl index 99ec0736a4..2194b1e917 100644 --- a/include/nbl/builtin/hlsl/memory_accessor.hlsl +++ b/include/nbl/builtin/hlsl/memory_accessor.hlsl @@ -112,8 +112,8 @@ struct StructureOfArrays : impl::StructureOfArraysBase - enable_if_t get(const index_t ix, NBL_REF_ARG(T) value) + template + enable_if_t get(const I ix, NBL_REF_ARG(T) value) { NBL_CONSTEXPR uint64_t SubElementCount = sizeof(T)/sizeof(access_t); // `vector` for now, we'll use `array` later when `bit_cast` gets fixed @@ -123,8 +123,8 @@ struct StructureOfArrays : impl::StructureOfArraysBase >(aux); } - template - enable_if_t set(const index_t ix, NBL_CONST_REF_ARG(T) value) + template + enable_if_t set(const I ix, NBL_CONST_REF_ARG(T) value) { NBL_CONSTEXPR uint64_t 
SubElementCount = sizeof(T)/sizeof(access_t); // `vector` for now, we'll use `array` later when `bit_cast` gets fixed @@ -209,11 +209,11 @@ struct Offset : impl::OffsetBase BaseAccessor accessor; - template - void set(index_t idx, T value) {accessor.set(idx+base_t::offset,value); } + template + void set(I idx, T value) {accessor.set(idx+base_t::offset,value); } - template - void get(index_t idx, NBL_REF_ARG(T) value) {accessor.get(idx+base_t::offset,value);} + template + void get(I idx, NBL_REF_ARG(T) value) {accessor.get(idx+base_t::offset,value);} template enable_if_t< diff --git a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl index 724887b995..3b511126b4 100644 --- a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl +++ b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl @@ -4,6 +4,8 @@ #ifndef _NBL_BUILTIN_HLSL_SUBGROUP2_BALLOT_INCLUDED_ #define _NBL_BUILTIN_HLSL_SUBGROUP2_BALLOT_INCLUDED_ +#include "nbl/builtin/hlsl/glsl_compat/subgroup_ballot.hlsl" + namespace nbl { namespace hlsl @@ -11,6 +13,20 @@ namespace hlsl namespace subgroup2 { +template +uint32_t LastSubgroupInvocation() +{ + if (AssumeAllActive) + return glsl::gl_SubgroupSize()-1; + else + return glsl::subgroupBallotFindMSB(glsl::subgroupBallot(true)); +} + +bool ElectLast() +{ + return glsl::gl_SubgroupInvocationID()==LastSubgroupInvocation(); +} + template struct Configuration { diff --git a/include/nbl/builtin/hlsl/tuple.hlsl b/include/nbl/builtin/hlsl/tuple.hlsl new file mode 100644 index 0000000000..a9c26090ea --- /dev/null +++ b/include/nbl/builtin/hlsl/tuple.hlsl @@ -0,0 +1,61 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_TUPLE_INCLUDED_ +#define _NBL_BUILTIN_HLSL_TUPLE_INCLUDED_ + +#include "nbl/builtin/hlsl/type_traits.hlsl" + +namespace nbl +{ +namespace hlsl +{ + +template // TODO: in the future use BOOST_PP to make this +struct tuple +{ + T0 t0; + T1 t1; + T2 t2; +}; + +template +struct tuple_element; + +template +struct tuple +{ + T0 t0; +}; + +template +struct tuple +{ + T0 t0; + T1 t1; +}; +// specializations for less and less void elements + +// base case +template +struct tuple_element<0,tuple > +{ + using type = Head; +}; + +template +struct tuple_element<1,tuple > +{ + using type = Head; +}; + +template +struct tuple_element<2,tuple > +{ + using type = Head; +}; + +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl b/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl index 9aefc3b3d8..652cabd7c7 100644 --- a/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl +++ b/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl @@ -28,6 +28,7 @@ struct vector_traits >\ NBL_CONSTEXPR_STATIC_INLINE bool IsVector = true;\ };\ +DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(1) DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(2) DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(3) DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(4) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl new file mode 100644 index 0000000000..62a9fb7bef --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -0,0 +1,63 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_ + + +#include "nbl/builtin/hlsl/functional.hlsl" +#include "nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl" +#include "nbl/builtin/hlsl/workgroup2/shared_scan.hlsl" + + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +template) +struct reduction +{ + using scalar_t = typename BinOp::type_t; + + template && ArithmeticSharedMemoryAccessor) + static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + impl::reduce fn; + return fn.template __call(dataAccessor, scratchAccessor); + } +}; + +template) +struct inclusive_scan +{ + using scalar_t = typename BinOp::type_t; + + template && ArithmeticSharedMemoryAccessor) + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + impl::scan fn; + fn.template __call(dataAccessor, scratchAccessor); + } +}; + +template) +struct exclusive_scan +{ + using scalar_t = typename BinOp::type_t; + + template && ArithmeticSharedMemoryAccessor) + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + impl::scan fn; + fn.template __call(dataAccessor, scratchAccessor); + } +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl new file mode 100644 index 0000000000..9a211899cb --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -0,0 +1,225 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_CONFIG_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_CONFIG_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/tuple.hlsl" +#include "nbl/builtin/hlsl/mpl.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +namespace impl +{ +template +struct virtual_wg_size_log2 +{ + #define DEFINE_ASSIGN(TYPE,ID,...) NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; + #define MAX(TYPE,ARG1,ARG2) mpl::max_v + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #include "impl/virtual_wg_size_def.hlsl" + #undef SELECT + #undef MAX + #undef DEFINE_ASSIGN + + // must have at least enough level 0 outputs to feed a single subgroup + static_assert(WorkgroupSizeLog2>=SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); + static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2*3+4, "WorkgroupSize cannot be larger than (SubgroupSize^3)*16"); +}; + +template +struct items_per_invocation +{ + #define DEFINE_ASSIGN(TYPE,ID,...) 
NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; + #define VIRTUAL_WG_SIZE VirtualWorkgroup:: + #define MIN(TYPE,ARG1,ARG2) mpl::min_v + #define MAX(TYPE,ARG1,ARG2) mpl::max_v + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #include "impl/items_per_invoc_def.hlsl" + #undef SELECT + #undef MAX + #undef MIN + #undef VIRTUAL_WG_SIZE + #undef DEFINE_ASSIGN + + using ItemsPerInvocation = tuple,integral_constant,integral_constant >; +}; +} + +template +struct ArithmeticConfiguration +{ + using virtual_wg_t = impl::virtual_wg_size_log2<_WorkgroupSizeLog2, _SubgroupSizeLog2>; + using items_per_invoc_t = impl::items_per_invocation; + using ItemsPerInvocation = typename items_per_invoc_t::ItemsPerInvocation; + + #define DEFINE_ASSIGN(TYPE,ID,...) NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; + #define VIRTUAL_WG_SIZE virtual_wg_t:: + #define ITEMS_PER_INVOC items_per_invoc_t:: + #define MAX(TYPE,ARG1,ARG2) mpl::max_v + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #include "impl/arithmetic_config_def.hlsl" + #undef SELECT + #undef MAX + #undef ITEMS_PER_INVOC + #undef VIRTUAL_WG_SIZE + #undef DEFINE_ASSIGN + + using ChannelStride = tuple,integral_constant,integral_constant >; // we don't use stride 0 + + static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize); + static_assert(ItemsPerInvocation_2<=4, "4 level scan would have been needed with this config!"); + +#ifdef __HLSL_VERSION + static bool electLast() + { + return glsl::gl_SubgroupInvocationID()==SubgroupSize-1; + } +#endif + + // gets a subgroupID as if each workgroup has (VirtualWorkgroupSize/SubgroupSize) subgroups + // each subgroup does work (VirtualWorkgroupSize/WorkgroupSize) times, the index denoted by workgroupInVirtualIndex + static uint16_t virtualSubgroupID(const uint16_t subgroupID, const uint16_t workgroupInVirtualIndex) + { + return workgroupInVirtualIndex * (WorkgroupSize >> SubgroupSizeLog2) + subgroupID; + } + + // get a coalesced index to store for the next level in shared mem, e.g. level 0 -> level 1 + // specify the next level to store values for in template param + // at level==LevelCount-1, it is guaranteed to have SubgroupSize elements + template0 && level::type::value; + const uint16_t outChannel = virtualSubgroupID & (ItemsPerNextInvocation-uint16_t(1u)); + const uint16_t outInvocation = virtualSubgroupID / ItemsPerNextInvocation; + const uint16_t localOffset = outChannel * tuple_element::type::value + outInvocation; + + if (level==2) + { + const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize - uint16_t(1u)) * ItemsPerInvocation_1; + return baseOffset + localOffset; + } + else + { + const uint16_t paddingOffset = virtualSubgroupID / (SubgroupSize * ItemsPerInvocation_1); + return localOffset + paddingOffset; + } + } + + template0 && level(virtualID); + } + + // get the coalesced index in shared mem at the current level + template0 && level::type::value + invocationIndex; + const uint16_t paddingOffset = invocationIndex / SubgroupSize; + + if (level==2) + { + const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize - uint16_t(1u)) * ItemsPerInvocation_1; + return baseOffset + localOffset + paddingOffset; + } + else + return localOffset + paddingOffset; + } +}; + +#ifndef __HLSL_VERSION +namespace impl +{ +struct SVirtualWGSizeLog2 +{ + void init(const uint16_t _WorkgroupSizeLog2, const uint16_t _SubgroupSizeLog2) + { + #define DEFINE_ASSIGN(TYPE,ID,...) 
ID = __VA_ARGS__; + #define MAX(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + #include "impl/virtual_wg_size_def.hlsl" + #undef SELECT + #undef MAX + #undef DEFINE_ASSIGN + } + + #define DEFINE_ASSIGN(TYPE,ID,...) TYPE ID; + #include "impl/virtual_wg_size_def.hlsl" + #undef DEFINE_ASSIGN +}; + +struct SItemsPerInvoc +{ + void init(const SVirtualWGSizeLog2 virtualWgSizeLog2, const uint16_t BaseItemsPerInvocation) + { + #define DEFINE_ASSIGN(TYPE,ID,...) ID = __VA_ARGS__; + #define VIRTUAL_WG_SIZE virtualWgSizeLog2. + #define MIN(TYPE,ARG1,ARG2) hlsl::min(ARG1, ARG2) + #define MAX(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + #include "impl/items_per_invoc_def.hlsl" + #undef SELECT + #undef MAX + #undef MIN + #undef VIRTUAL_WG_SIZE + #undef DEFINE_ASSIGN + } + + #define DEFINE_ASSIGN(TYPE,ID,...) TYPE ID; + #include "impl/items_per_invoc_def.hlsl" + #undef DEFINE_ASSIGN +}; +} + +struct SArithmeticConfiguration +{ + void init(const uint16_t _WorkgroupSizeLog2, const uint16_t _SubgroupSizeLog2, const uint16_t _ItemsPerInvocation) + { + impl::SVirtualWGSizeLog2 virtualWgSizeLog2; + virtualWgSizeLog2.init(_WorkgroupSizeLog2, _SubgroupSizeLog2); + impl::SItemsPerInvoc itemsPerInvoc; + itemsPerInvoc.init(virtualWgSizeLog2, _ItemsPerInvocation); + + #define DEFINE_ASSIGN(TYPE,ID,...) ID = __VA_ARGS__; + #define VIRTUAL_WG_SIZE virtualWgSizeLog2. + #define ITEMS_PER_INVOC itemsPerInvoc. + #define MAX(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + #include "impl/arithmetic_config_def.hlsl" + #undef SELECT + #undef MAX + #undef ITEMS_PER_INVOC + #undef VIRTUAL_WG_SIZE + #undef DEFINE_ASSIGN + } + + #define DEFINE_ASSIGN(TYPE,ID,...) TYPE ID; + #include "impl/arithmetic_config_def.hlsl" + #undef DEFINE_ASSIGN +}; +#endif + +template +struct is_configuration : bool_constant {}; + +template +struct is_configuration > : bool_constant {}; + +template +NBL_CONSTEXPR bool is_configuration_v = is_configuration::value; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl new file mode 100644 index 0000000000..94f54409db --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl @@ -0,0 +1,34 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +DEFINE_ASSIGN(uint16_t, WorkgroupSizeLog2, _WorkgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, WorkgroupSize, uint16_t(0x1u) << WorkgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, SubgroupSizeLog2, _SubgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, SubgroupSize, uint16_t(0x1u) << SubgroupSizeLog2) + +DEFINE_ASSIGN(uint16_t, LevelCount, VIRTUAL_WG_SIZE levels) +DEFINE_ASSIGN(uint16_t, VirtualWorkgroupSize, uint16_t(0x1u) << VIRTUAL_WG_SIZE value) + +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_0, ITEMS_PER_INVOC value0) +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_1, ITEMS_PER_INVOC value1) +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_2, ITEMS_PER_INVOC value2) + +DEFINE_ASSIGN(uint16_t, LevelInputCount_1, SELECT(uint16_t,(LevelCount==3), + MAX(uint16_t, (VirtualWorkgroupSize>>SubgroupSizeLog2), SubgroupSize), + SubgroupSize*ItemsPerInvocation_1)) +DEFINE_ASSIGN(uint16_t, LevelInputCount_2, SELECT(uint16_t,(LevelCount==3),SubgroupSize*ItemsPerInvocation_2,0)) +DEFINE_ASSIGN(uint16_t, VirtualInvocationsAtLevel1, LevelInputCount_1 / ItemsPerInvocation_1) + +DEFINE_ASSIGN(uint16_t, __padding, SELECT(uint16_t,(LevelCount==3),SubgroupSize-1,0)) +DEFINE_ASSIGN(uint16_t, __channelStride_1, SELECT(uint16_t,(LevelCount==3),VirtualInvocationsAtLevel1,SubgroupSize) + __padding) +DEFINE_ASSIGN(uint16_t, __channelStride_2, SELECT(uint16_t,(LevelCount==3),SubgroupSize,0)) + +// user specified the shared mem size of Scalars +DEFINE_ASSIGN(uint32_t, SharedScratchElementCount, SELECT(uint16_t,(LevelCount==1), + 0, + SELECT(uint16_t,(LevelCount==3), + LevelInputCount_2+(SubgroupSize*ItemsPerInvocation_1)-1, + 0 + ) + LevelInputCount_1 + )) diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl new file mode 100644 index 0000000000..c32d7ef8bd --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl @@ -0,0 +1,8 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +DEFINE_ASSIGN(uint16_t, ItemsPerInvocationProductLog2, MAX(int16_t,VIRTUAL_WG_SIZE WorkgroupSizeLog2-VIRTUAL_WG_SIZE SubgroupSizeLog2*VIRTUAL_WG_SIZE levels,0)) +DEFINE_ASSIGN(uint16_t, value0, BaseItemsPerInvocation) +DEFINE_ASSIGN(uint16_t, value1, uint16_t(0x1u) << SELECT(uint16_t,(VIRTUAL_WG_SIZE levels==3),MIN(uint16_t,ItemsPerInvocationProductLog2,2),ItemsPerInvocationProductLog2)) +DEFINE_ASSIGN(uint16_t, value2, uint16_t(0x1u) << MAX(int16_t,ItemsPerInvocationProductLog2-2,0)) \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl new file mode 100644 index 0000000000..e4c4047f1d --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl @@ -0,0 +1,8 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +DEFINE_ASSIGN(uint16_t, WorkgroupSizeLog2, _WorkgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, SubgroupSizeLog2, _SubgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, levels, SELECT(uint16_t,(_WorkgroupSizeLog2>_SubgroupSizeLog2),SELECT(uint16_t,(_WorkgroupSizeLog2>_SubgroupSizeLog2*2+2),3,2),1)) +DEFINE_ASSIGN(uint16_t, value, MAX(uint16_t, _SubgroupSizeLog2*levels, _WorkgroupSizeLog2)) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl new file mode 100644 index 0000000000..5b19c55fbd --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -0,0 +1,411 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_SHARED_SCAN_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_SHARED_SCAN_INCLUDED_ + +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/subgroup2/ballot.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/mpl.hlsl" +#include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +namespace impl +{ + +template +struct reduce; + +template +struct scan; + +// 1-level scans +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_t = vector; // data accessor needs to be this type + // doesn't use scratch smem, should be NOOP accessor + + template + scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + + subgroup2::reduction reduction; + vector_t value; + dataAccessor.template get(uint16_t(glsl::gl_SubgroupInvocationID()), value); + return reduction(value); + } +}; + +template +struct scan +{ + using scalar_t = typename BinOp::type_t; + using vector_t = vector; // data accessor needs to be this type + // doesn't use scratch smem, should be NOOP accessor + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + + vector_t value; + dataAccessor.template get(uint16_t(glsl::gl_SubgroupInvocationID()), value); + if (Exclusive) + { + subgroup2::exclusive_scan excl_scan; + value = excl_scan(value); + } + else + { + subgroup2::inclusive_scan incl_scan; + value = incl_scan(value); + } + dataAccessor.template set(uint16_t(glsl::gl_SubgroupInvocationID()), value); + } +}; + +// do level 0 scans for 2- and 3-level scans (same code) +template +struct reduce_level0 +{ + using scalar_t = typename BinOp::type_t; + using vector_t = vector; // data accessor needs to be this type + + template + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + // level 0 scan + subgroup2::reduction reduction0; + [unroll] + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + 
vector_t scan_local; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + scan_local = reduction0(scan_local); + if (Config::electLast()) + { + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()), idx); + scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + }; +}; + +template +struct scan_level0 +{ + using scalar_t = typename BinOp::type_t; + using vector_t = vector; // data accessor needs to be this type + + template + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + subgroup2::inclusive_scan inclusiveScan0; + // level 0 scan + [unroll] + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + vector_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + value = inclusiveScan0(value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + if (Config::electLast()) + { + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()), idx); + scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + } +}; + +// 2-level scans +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; + + template + scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + + reduce_level0::template __call(dataAccessor, scratchAccessor); + + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + // level 1 scan + subgroup2::reduction reduction1; + if (glsl::gl_SubgroupID() == 0) + { + vector_lv1_t lv1_val; + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + lv1_val = reduction1(lv1_val); + + if (Config::electLast()) + scratchAccessor.template set(0, lv1_val[Config::ItemsPerInvocation_1-1]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + scalar_t reduce_val; + scratchAccessor.template get(0,reduce_val); + return reduce_val; + } +}; + +template +struct scan +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + + scan_level0::template __call(dataAccessor, scratchAccessor); + + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + // level 1 scan + subgroup2::inclusive_scan inclusiveScan1; + if 
(glsl::gl_SubgroupID() == 0) + { + vector_lv1_t lv1_val; + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + lv1_val = inclusiveScan1(lv1_val); + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // combine with level 0 + [unroll] + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + vector_lv0_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()-1u), idx); + scalar_t left = BinOp::identity; + if (idx != 0 || glsl::gl_SubgroupID() != 0) + scratchAccessor.template get(bankedIndex,left); + if (Exclusive) + { + scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); + [unroll] + for (uint16_t i = Config::ItemsPerInvocation_0-1; i > 0; i--) + value[i] = binop(left, value[i-1]); + value[0] = binop(left, left_last_elem); + } + else + { + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++) + value[i] = binop(left, value[i]); + } + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + } + } +}; + +// 3-level scans +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; + using vector_lv2_t = vector; + + template + scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv1_t = subgroup2::ArithmeticParams; + using params_lv2_t = subgroup2::ArithmeticParams; + BinOp binop; + + reduce_level0::template __call(dataAccessor, scratchAccessor); + + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + // level 1 scan + subgroup2::reduction reduction1; + if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) + { + vector_lv1_t lv1_val; + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + lv1_val = reduction1(lv1_val); + if (Config::electLast()) + { + const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID())); + scratchAccessor.template set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 2 scan + subgroup2::reduction reduction2; + if (glsl::gl_SubgroupID() == 0) + { + vector_lv2_t lv2_val; + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + lv2_val = reduction2(lv2_val); + if (Config::electLast()) + scratchAccessor.template set(0, lv2_val[Config::ItemsPerInvocation_2-1]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + scalar_t reduce_val; + scratchAccessor.template get(0,reduce_val); + return reduce_val; + } +}; + +template +struct scan +{ + using scalar_t = typename BinOp::type_t; + 
using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; + using vector_lv2_t = vector; + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv1_t = subgroup2::ArithmeticParams; + using params_lv2_t = subgroup2::ArithmeticParams; + BinOp binop; + + scan_level0::template __call(dataAccessor, scratchAccessor); + + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + // level 1 scan + subgroup2::inclusive_scan inclusiveScan1; + if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) + { + vector_lv1_t lv1_val; + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + lv1_val = inclusiveScan1(lv1_val); + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 2 scan + subgroup2::inclusive_scan inclusiveScan2; + if (glsl::gl_SubgroupID() == 0) + { + const uint16_t lastChannel = Config::ItemsPerInvocation_1 - uint16_t(1u); + vector_lv2_t lv2_val; + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) + { + const uint16_t inputSubgroupID = invocationIndex * Config::ItemsPerInvocation_2 + i; + const uint16_t inputSubgroupLastInvocation = inputSubgroupID * Config::SubgroupSize + (Config::SubgroupSize - uint16_t(1u)); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(inputSubgroupLastInvocation, lastChannel),lv2_val[i]); + } + lv2_val = inclusiveScan2(lv2_val); + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.template set(Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // combine with level 1 + if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) + { + vector_lv1_t lv1_val; + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i), lv1_val[i]); + + scalar_t lv2_scan = BinOp::identity; + const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID()-1u)); + if (glsl::gl_SubgroupID() != 0) + scratchAccessor.template get(bankedIndex, lv2_scan); + + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), binop(lv1_val[i],lv2_scan)); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // combine with level 0 + [unroll] + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + vector_lv0_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()-1u), idx); + scalar_t left = BinOp::identity; + if (idx != 0 || glsl::gl_SubgroupID() != 0) + scratchAccessor.template get(bankedIndex,left); + if (Exclusive) + { + scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); + [unroll] + for 
(uint16_t i = Config::ItemsPerInvocation_0-1; i > 0; i--) + value[i] = binop(left, value[i-1]); + value[0] = binop(left, left_last_elem); + } + else + { + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++) + value[i] = binop(left, value[i]); + } + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + } + } +}; + +} + +} +} +} + +#endif diff --git a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h index 4e7147c904..1abebf23ea 100644 --- a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h +++ b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h @@ -40,7 +40,7 @@ struct ProtoPipeline final inline operator bool() const {return m_vxShader.get();} inline core::smart_refctd_ptr createPipeline( - const asset::IPipelineBase::SShaderSpecInfo& fragShader, + const video::IGPUPipelineBase::SShaderSpecInfo& fragShader, video::IGPUPipelineLayout* layout, video::IGPURenderpass* renderpass, const uint32_t subpassIx=0, @@ -58,17 +58,13 @@ struct ProtoPipeline final { const auto orientationAsUint32 = static_cast(swapchainTransform); - asset::IPipelineBase::SShaderSpecInfo::spec_constant_map_t specConstants; - specConstants[0] = {.data=&orientationAsUint32,.size=sizeof(orientationAsUint32)}; - - const asset::IPipelineBase::SShaderSpecInfo shaders[2] = { - {.shader=m_vxShader.get(), .entryPoint = "main" ,.stage = hlsl::ESS_VERTEX,.entries=&specConstants}, - fragShader - }; + IGPUPipelineBase::SShaderEntryMap specConstants; + specConstants[0] = std::span{ reinterpret_cast(&orientationAsUint32), sizeof(orientationAsUint32)}; IGPUGraphicsPipeline::SCreationParams params[1]; params[0].layout = layout; - params[0].shaders = shaders; + params[0].vertexShader = { .shader = m_vxShader.get(), .entryPoint = "main", .entries = &specConstants }; + params[0].fragmentShader = fragShader; params[0].cached = { .vertexInput = {}, // The Full Screen Triangle doesn't use any HW vertex input state .primitiveAssembly = {}, diff --git a/include/nbl/macros.h b/include/nbl/macros.h index 4927f21899..fe93201a11 100644 --- a/include/nbl/macros.h +++ b/include/nbl/macros.h @@ -81,7 +81,7 @@ //! 
Workarounds for compiler specific bugs // MSVC 2019 is a special snowflake -#if defined(_MSC_VER) && _MSC_VER>=1920 +#if defined(_MSC_VER) && !defined(__clang__) && _MSC_VER>=1920 #define NBL_TYPENAME_4_STTC_MBR typename #else #define NBL_TYPENAME_4_STTC_MBR diff --git a/include/nbl/system/demote_promote_writer_readers_lock.h b/include/nbl/system/demote_promote_writer_readers_lock.h index 6823c26c27..5447e65f3e 100644 --- a/include/nbl/system/demote_promote_writer_readers_lock.h +++ b/include/nbl/system/demote_promote_writer_readers_lock.h @@ -271,7 +271,7 @@ class demote_promote_writer_readers_lock_debug struct DefaultPreemptionCheck { - bool operator()(state_lock_value_t oldState) + bool operator()(const state_lock_value_t oldState) { return false; } @@ -361,13 +361,13 @@ class dpwr_lock_guard_base /** * @brief Checks whether this guard is currently locking the lock `lk` */ - bool hasLocked(dpwr_lock_t& lk) const + bool hasLocked(const dpwr_lock_t& lk) const { return m_lock == &lk; } protected: - dpwr_lock_guard_base(dpwr_lock_t& lk) noexcept : m_lock(&lk) {} + dpwr_lock_guard_base(const dpwr_lock_t& lk) noexcept : m_lock(&lk) {} dpwr_lock_t* m_lock; }; @@ -385,7 +385,7 @@ class dpwr_read_lock_guard_debug : public impl::dpwr_lock_guard_base; using dpwr_write_lock_guard_debug_t = dpwr_write_lock_guard_debug; - dpwr_read_lock_guard_debug(dpwr_lock_t& lk, std::adopt_lock_t) : base_t(lk) {} + dpwr_read_lock_guard_debug(const dpwr_lock_t& lk, std::adopt_lock_t) : base_t(lk) {} explicit dpwr_read_lock_guard_debug(dpwr_lock_t& lk) : dpwr_read_lock_guard_debug(lk, std::adopt_lock_t()) { this->m_lock->read_lock(); @@ -406,7 +406,7 @@ class dpwr_write_lock_guard_debug : public impl::dpwr_lock_guard_base; using dpwr_read_lock_guard_debug_t = dpwr_read_lock_guard_debug; - dpwr_write_lock_guard_debug(dpwr_lock_t& lk, std::adopt_lock_t) : base_t(lk) {} + dpwr_write_lock_guard_debug(const dpwr_lock_t& lk, std::adopt_lock_t) : base_t(lk) {} explicit dpwr_write_lock_guard_debug(dpwr_lock_t& lk) : dpwr_write_lock_guard_debug(lk, std::adopt_lock_t()) { this->m_lock->write_lock(); diff --git a/include/nbl/video/CVulkanDeviceMemoryBacked.h b/include/nbl/video/CVulkanDeviceMemoryBacked.h index c996000e04..e6d17ddf3e 100644 --- a/include/nbl/video/CVulkanDeviceMemoryBacked.h +++ b/include/nbl/video/CVulkanDeviceMemoryBacked.h @@ -47,8 +47,8 @@ class CVulkanDeviceMemoryBacked : public Interface }; #ifndef _NBL_VIDEO_C_VULKAN_DEVICE_MEMORY_BACKED_CPP_ -extern template CVulkanDeviceMemoryBacked; -extern template CVulkanDeviceMemoryBacked; +extern template class CVulkanDeviceMemoryBacked; +extern template class CVulkanDeviceMemoryBacked; #endif } // end namespace nbl::video diff --git a/include/nbl/video/CVulkanRayTracingPipeline.h b/include/nbl/video/CVulkanRayTracingPipeline.h index 82d8c777b6..a9bc476f43 100644 --- a/include/nbl/video/CVulkanRayTracingPipeline.h +++ b/include/nbl/video/CVulkanRayTracingPipeline.h @@ -41,10 +41,13 @@ class CVulkanRayTracingPipeline final : public IGPURayTracingPipeline const VkPipeline m_vkPipeline; ShaderGroupHandleContainer m_shaderGroupHandles; - uint16_t m_raygenStackSize; core::smart_refctd_dynamic_array m_missStackSizes; core::smart_refctd_dynamic_array m_hitGroupStackSizes; core::smart_refctd_dynamic_array m_callableStackSizes; + uint32_t m_missGroupCount; + uint32_t m_hitGroupCount; + uint32_t m_callableGroupCount; + uint16_t m_raygenStackSize; uint32_t getRaygenIndex() const; uint32_t getMissBaseIndex() const; diff --git a/include/nbl/video/IGPUAccelerationStructure.h 
b/include/nbl/video/IGPUAccelerationStructure.h index 5d8f0ca29b..1bb4fb0c66 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -45,7 +45,7 @@ class IGPUAccelerationStructure : public IBackendObject #endif //! builds - template + template requires (!std::is_const_v && std::is_base_of_v) struct BuildInfo { public: @@ -98,39 +98,6 @@ class IGPUAccelerationStructure : public IBackendObject } }; - // copies - enum class COPY_MODE : uint8_t - { - CLONE = 0, - COMPACT = 1, - SERIALIZE = 2, - DESERIALIZE = 3, - }; - struct CopyInfo - { - const IGPUAccelerationStructure* src = nullptr; - IGPUAccelerationStructure* dst = nullptr; - COPY_MODE mode = COPY_MODE::CLONE; - }; - template - struct CopyToMemoryInfo - { - const IGPUAccelerationStructure* src = nullptr; - asset::SBufferBinding dst = nullptr; - COPY_MODE mode = COPY_MODE::SERIALIZE; - }; - using DeviceCopyToMemoryInfo = CopyToMemoryInfo; - using HostCopyToMemoryInfo = CopyToMemoryInfo; - template - struct CopyFromMemoryInfo - { - asset::SBufferBinding src = nullptr; - IGPUAccelerationStructure* dst = nullptr; - COPY_MODE mode = COPY_MODE::DESERIALIZE; - }; - using DeviceCopyFromMemoryInfo = CopyFromMemoryInfo; - using HostCopyFromMemoryInfo = CopyFromMemoryInfo; - // this will return false also if your deferred operation is not ready yet, so please use in combination with `isPending()` virtual bool wasCopySuccessful(const IDeferredOperation* const deferredOp) = 0; @@ -176,12 +143,36 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat inline bool usesMotion() const override {return m_params.flags.hasFlags(SCreationParams::FLAGS::MOTION_BIT);} + // copies + struct CopyInfo + { + const IGPUBottomLevelAccelerationStructure* src = nullptr; + IGPUAccelerationStructure* dst = nullptr; + bool compact = false; + }; + template requires (!std::is_const_v && std::is_base_of_v) + struct CopyToMemoryInfo + { + const IGPUBottomLevelAccelerationStructure* src = nullptr; + asset::SBufferBinding dst = nullptr; + }; + using DeviceCopyToMemoryInfo = CopyToMemoryInfo; + using HostCopyToMemoryInfo = CopyToMemoryInfo; + template requires (!std::is_const_v && std::is_base_of_v) + struct CopyFromMemoryInfo + { + asset::SBufferBinding src = nullptr; + IGPUBottomLevelAccelerationStructure* dst = nullptr; + }; + using DeviceCopyFromMemoryInfo = CopyFromMemoryInfo; + using HostCopyFromMemoryInfo = CopyFromMemoryInfo; + // read the comments in the .hlsl file, AABB builds ignore certain fields - using BuildRangeInfo = hlsl::acceleration_structures::bottom_level::BuildRangeInfo; + using BuildRangeInfo = hlsl::acceleration_structures::bottom_level::BuildRangeInfo; // TODO: rename to GeometryRangeInfo, and make `BuildRangeInfo = const GeometryRangeInfo*` using DirectBuildRangeRangeInfos = const BuildRangeInfo* const*; using MaxInputCounts = const uint32_t* const; - template + template requires (!std::is_const_v && std::is_base_of_v) struct BuildInfo final : IGPUAccelerationStructure::BuildInfo { private: @@ -203,7 +194,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat NBL_API2 uint32_t valid(const T* const buildRangeInfosOrMaxPrimitiveCounts) const; // really expensive to call, `valid` only calls it when `_NBL_DEBUG` is defined - inline bool validGeometry(size_t& totalPrims, const AABBs& geometry, const BuildRangeInfo& buildRangeInfo) const + inline bool validGeometry(size_t& totalPrims, const AABBs& geometry, const BuildRangeInfo& buildRangeInfo) 
const { constexpr size_t AABBalignment = 8ull; // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureBuildRangeInfoKHR-primitiveOffset-03659 @@ -222,7 +213,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat totalPrims += buildRangeInfo.primitiveCount; return true; } - inline bool validGeometry(size_t& totalPrims, const Triangles& geometry, const BuildRangeInfo& buildRangeInfo) const + inline bool validGeometry(size_t& totalPrims, const Triangles& geometry, const BuildRangeInfo& buildRangeInfo) const { // if (!dstAS->validVertexFormat(geometry.vertexFormat)) @@ -306,7 +297,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat *(oit++) = core::smart_refctd_ptr(srcAS); *(oit++) = core::smart_refctd_ptr(dstAS); - if (buildFlags.hasFlags(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + if (buildFlags.hasFlags(asset::IBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) { for (auto i=0u; i* triangles = nullptr; - const AABBs* aabbs; + const Triangles* triangles = nullptr; + const AABBs* aabbs; }; }; using DeviceBuildInfo = BuildInfo; @@ -388,12 +379,43 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // inline uint32_t getMaxInstanceCount() const {return m_maxInstanceCount;} + // + using blas_smart_ptr_t = core::smart_refctd_ptr; + + // copies + struct CopyInfo + { + const IGPUTopLevelAccelerationStructure* src = nullptr; + IGPUTopLevelAccelerationStructure* dst = nullptr; + bool compact = false; + }; + template requires (!std::is_const_v && std::is_base_of_v) + struct CopyToMemoryInfo + { + const IGPUTopLevelAccelerationStructure* src = nullptr; + asset::SBufferBinding dst = nullptr; + // [optional] Query the tracked BLASes + core::smart_refctd_dynamic_array trackedBLASes = nullptr; + }; + using DeviceCopyToMemoryInfo = CopyToMemoryInfo; + using HostCopyToMemoryInfo = CopyToMemoryInfo; + template requires (!std::is_const_v && std::is_base_of_v) + struct CopyFromMemoryInfo + { + asset::SBufferBinding src = nullptr; + IGPUTopLevelAccelerationStructure* dst = nullptr; + // [optional] Provide info about what BLAS references to hold onto after the copy. For performance make sure the list is compact (without repeated elements). 
+ std::span trackedBLASes = {}; + }; + using DeviceCopyFromMemoryInfo = CopyFromMemoryInfo; + using HostCopyFromMemoryInfo = CopyFromMemoryInfo; + // read the comments in the .hlsl file using BuildRangeInfo = hlsl::acceleration_structures::top_level::BuildRangeInfo; using DirectBuildRangeRangeInfos = const BuildRangeInfo*; using MaxInputCounts = const uint32_t; - template + template requires (!std::is_const_v && std::is_base_of_v) struct BuildInfo final : IGPUAccelerationStructure::BuildInfo { private: @@ -638,6 +660,9 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // I don't do an actual union because the preceeding members don't play nicely with alignment of `core::matrix3x4SIMD` and Vulkan requires this struct to be packed SRTMotionInstance largestUnionMember = {}; static_assert(alignof(SRTMotionInstance)==8ull); + + public: + constexpr static inline size_t LargestUnionMemberSize = sizeof(largestUnionMember); }; using DevicePolymorphicInstance = PolymorphicInstance; using HostPolymorphicInstance = PolymorphicInstance; @@ -664,69 +689,108 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // using build_ver_t = uint32_t; + // + inline build_ver_t getPendingBuildVer() const {return m_pendingBuildVer;} // this gets called when execution is sure to happen 100%, e.g. not during command recording but during submission inline build_ver_t registerNextBuildVer() { - return m_pendingBuildVer++; + return ++m_pendingBuildVer; } - // - using blas_smart_ptr_t = core::smart_refctd_ptr; // returns number of tracked BLASes if `tracked==nullptr` otherwise writes `*count` tracked BLASes from `first` into `*tracked` - inline build_ver_t getTrackedBLASes(uint32_t* count, blas_smart_ptr_t* tracked, const uint32_t first=0) const + inline void getPendingBuildTrackedBLASes(uint32_t* count, blas_smart_ptr_t* tracked, const build_ver_t buildVer) const { if (!count) - return 0; + return; // stop multiple threads messing with us std::lock_guard lk(m_trackingLock); - const uint32_t toWrite = std::min(std::max(m_trackedBLASes.size(),first)-first,tracked ? 
(*count):0xffFFffFFu); - *count = toWrite; - if (tracked && toWrite) - { - auto it = m_trackedBLASes.begin(); - // cmon its an unordered map, iterator should have operator += - for (auto i=0; isize():0; + if (!tracked || !pBLASes) + return; + auto it = pBLASes->begin(); + for (auto i = 0; i - inline bool setTrackedBLASes(const Iterator begin, const Iterator end, const build_ver_t buildVer) + inline void insertTrackedBLASes(const Iterator begin, const Iterator end, const build_ver_t buildVer) { + if (buildVer==0) + return; // stop multiple threads messing with us std::lock_guard lk(m_trackingLock); - // stop out of order callbacks - if (buildVer<=m_completedBuildVer) - return false; - m_completedBuildVer = buildVer; - // release already tracked BLASes - m_trackedBLASes.clear(); - // sanity check, TODO: this should be an atomic_max on the `m_pendingBuildVer` - if (m_completedBuildVer>m_pendingBuildVer) - m_pendingBuildVer = m_completedBuildVer; + // insert in the right order + auto prev = m_pendingBuilds.before_begin(); + for (auto it=std::next(prev); it!=m_pendingBuilds.end()&&it->ordinal>buildVer; prev=it++) {} + auto inserted = m_pendingBuilds.emplace_after(prev); // now fill the contents - m_trackedBLASes.insert(begin,end); - return true; + inserted->BLASes.insert(begin,end); + inserted->ordinal = buildVer; } - // a little utility to make sure nothing from this build version and before gets tracked - inline bool clearTrackedBLASes(const build_ver_t buildVer) + template + inline build_ver_t pushTrackedBLASes(const Iterator begin, const Iterator end) + { + const auto buildVer = registerNextBuildVer(); + insertTrackedBLASes(begin,end,buildVer); + return buildVer; + } + // a little utility to make sure nothing from before this build version gets tracked + inline void clearTrackedBLASes(const build_ver_t buildVer) { - return setTrackedBLASes(nullptr,nullptr,buildVer); + // stop multiple threads messing with us + std::lock_guard lk(m_trackingLock); + clearTrackedBLASes_impl(buildVer); } protected: inline IGPUTopLevelAccelerationStructure(core::smart_refctd_ptr&& dev, SCreationParams&& params) : Base(), IGPUAccelerationStructure(std::move(dev),std::move(params)), - m_maxInstanceCount(params.maxInstanceCount),m_trackedBLASes() {} - + m_maxInstanceCount(params.maxInstanceCount) {} const uint32_t m_maxInstanceCount; + + private: + struct DynamicUpCastingSpanIterator + { + inline bool operator!=(const DynamicUpCastingSpanIterator& other) const {return ptr!=other.ptr;} + + inline DynamicUpCastingSpanIterator operator++() {return {ptr++};} + + inline const IGPUBottomLevelAccelerationStructure* operator*() const {return dynamic_cast(ptr->get());} + + std::span>::iterator ptr; + }; + friend class ILogicalDevice; + friend class IQueue; + inline const core::unordered_set* getPendingBuildTrackedBLASes(const build_ver_t buildVer) const + { + const auto found = std::find_if(m_pendingBuilds.begin(),m_pendingBuilds.end(),[buildVer](const auto& item)->bool{return item.ordinal==buildVer;}); + if (found==m_pendingBuilds.end()) + return nullptr; + return &found->BLASes; + } + inline void clearTrackedBLASes_impl(const build_ver_t buildVer) + { + // find first element less or equal to `buildVer` + auto prev = m_pendingBuilds.before_begin(); + for (auto it=std::next(prev); it!=m_pendingBuilds.end()&&it->ordinal>=buildVer; prev=it++) {} + m_pendingBuilds.erase_after(prev,m_pendingBuilds.end()); + } + + std::atomic m_pendingBuildVer = 0; // TODO: maybe replace with new readers/writers lock mutable std::mutex 
m_trackingLock; - std::atomic m_pendingBuildVer = 0; - build_ver_t m_completedBuildVer = 0; - core::unordered_set m_trackedBLASes; + // TODO: this definitely needs improving with MultiEventTimelines (which also can track deferred Host ops) but then one needs to track semaphore signal-wait deps so we know what "state copy" a compaction wants + // Deferred Op must complete AFTER a submit, otherwise race condition. + // If we make a linked list of pending builds, then we just need to pop completed builds (traverse until current found) + struct STrackingInfo + { + core::unordered_set BLASes; + // when the build got + build_ver_t ordinal; + }; + // a little misleading, the element is the most recently completed one + core::forward_list m_pendingBuilds; }; } diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index f79ed17a50..bb6460754a 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ b/include/nbl/video/IGPUCommandBuffer.h @@ -92,7 +92,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject case STATE::EXECUTABLE: [[fallthrough]]; case STATE::PENDING: - if (m_noCommands) + if (!m_noCommands) return false; [[fallthrough]]; default: @@ -260,13 +260,21 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject inline bool buildAccelerationStructures(const std::span infos, const IGPUBottomLevelAccelerationStructure::DirectBuildRangeRangeInfos buildRangeInfos) { if (const auto totalGeometryCount=buildAccelerationStructures_common(infos,buildRangeInfos); totalGeometryCount) - return buildAccelerationStructures_impl(infos,buildRangeInfos,totalGeometryCount); + if (buildAccelerationStructures_impl(infos,buildRangeInfos,totalGeometryCount)) + { + m_noCommands = false; + return true; + } return false; } inline bool buildAccelerationStructures(const std::span infos, const IGPUTopLevelAccelerationStructure::DirectBuildRangeRangeInfos buildRangeInfos) { if (buildAccelerationStructures_common(infos,buildRangeInfos)) - return buildAccelerationStructures_impl(infos,buildRangeInfos); + if (buildAccelerationStructures_impl(infos,buildRangeInfos)) + { + m_noCommands = false; + return true; + } return false; } // We don't allow different indirect command addresses due to https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdBuildAccelerationStructuresIndirectKHR-pIndirectDeviceAddresses-03646 @@ -299,18 +307,25 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject if (const auto totalGeometryCount=buildAccelerationStructures_common(infos,maxPrimitiveOrInstanceCounts,indirectRangeBuffer); totalGeometryCount) { + bool success; if constexpr(std::is_same_v) - return buildAccelerationStructuresIndirect_impl(indirectRangeBuffer,infos,pIndirectOffsets,pIndirectStrides,maxPrimitiveOrInstanceCounts,totalGeometryCount); + success = buildAccelerationStructuresIndirect_impl(indirectRangeBuffer,infos,pIndirectOffsets,pIndirectStrides,maxPrimitiveOrInstanceCounts,totalGeometryCount); else - return buildAccelerationStructuresIndirect_impl(indirectRangeBuffer,infos,pIndirectOffsets,pIndirectStrides,maxPrimitiveOrInstanceCounts); + success = buildAccelerationStructuresIndirect_impl(indirectRangeBuffer,infos,pIndirectOffsets,pIndirectStrides,maxPrimitiveOrInstanceCounts); + if (success) + m_noCommands = false; + return success; } return false; } //! 
acceleration structure transfers - bool copyAccelerationStructure(const IGPUAccelerationStructure::CopyInfo& copyInfo); - bool copyAccelerationStructureToMemory(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo); - bool copyAccelerationStructureFromMemory(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo); + template requires std::is_base_of_v + bool copyAccelerationStructure(const AccelerationStructure::CopyInfo& copyInfo); + template requires std::is_base_of_v + bool copyAccelerationStructureToMemory(const AccelerationStructure::DeviceCopyToMemoryInfo& copyInfo); + template requires std::is_base_of_v + bool copyAccelerationStructureFromMemory(const AccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo); //! state setup bool bindComputePipeline(const IGPUComputePipeline* const pipeline); @@ -536,7 +551,31 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject bool executeCommands(const uint32_t count, IGPUCommandBuffer* const* const cmdbufs); // in case you want the commandbuffer to hold onto things as long as its not RESET - bool recordReferences(const std::span refs); + template + inline bool recordReferences(Iterator begin, const Iterator end) + { + auto oit = reserveReferences(std::distance(begin,end)); + if (oit) + while (begin!=end) + *(oit++) = core::smart_refctd_ptr(*(begin++)); + return oit; + } + inline bool recordReferences(const std::span refs) {return recordReferences(refs.begin(),refs.end());} + + // in case you want the commandbuffer to overwrite the BLAS tracking, e.g. you recorded TLAS building commands directly using `getNativeHandle()` to get the commandbuffer + template + inline bool recordBLASReferenceOverwrite(IGPUTopLevelAccelerationStructure* tlas, Iterator beginBLASes, const Iterator endBLASes) + { + const auto size = std::distance(beginBLASes,endBLASes); + auto oit = reserveReferences(size); + if (oit) + { + m_TLASTrackingOps.emplace_back(TLASTrackingWrite{.src={oit,size},.dst=tlas}); + while (beginBLASes!=endBLASes) + *(oit++) = core::smart_refctd_ptr(*(beginBLASes++)); + } + return oit; + } virtual bool insertDebugMarker(const char* name, const core::vector4df_SIMD& color = core::vector4df_SIMD(1.0, 1.0, 1.0, 1.0)) = 0; virtual bool beginDebugMarker(const char* name, const core::vector4df_SIMD& color = core::vector4df_SIMD(1.0, 1.0, 1.0, 1.0)) = 0; @@ -627,9 +666,9 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject const uint64_t* const pIndirectOffsets, const uint32_t* const pIndirectStrides, const uint32_t* const pMaxInstanceCounts ) = 0; - virtual bool copyAccelerationStructure_impl(const IGPUAccelerationStructure::CopyInfo& copyInfo) = 0; - virtual bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) = 0; - virtual bool copyAccelerationStructureFromMemory_impl(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) = 0; + virtual bool copyAccelerationStructure_impl(const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) = 0; + virtual bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) = 0; + virtual bool copyAccelerationStructureFromMemory_impl(const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) = 0; virtual bool bindComputePipeline_impl(const IGPUComputePipeline* const pipeline) = 0; virtual bool bindGraphicsPipeline_impl(const IGPUGraphicsPipeline* const pipeline) = 0; @@ -710,7 +749,7 @@ class NBL_API2 IGPUCommandBuffer : 
public IBackendObject m_state = STATE::INITIAL; m_boundDescriptorSetsRecord.clear(); - m_TLASToBLASReferenceSets.clear(); + m_TLASTrackingOps.clear(); m_boundGraphicsPipeline= nullptr; m_boundComputePipeline= nullptr; m_boundRayTracingPipeline= nullptr; @@ -728,7 +767,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject { deleteCommandList(); m_boundDescriptorSetsRecord.clear(); - m_TLASToBLASReferenceSets.clear(); + m_TLASTrackingOps.clear(); m_boundGraphicsPipeline= nullptr; m_boundComputePipeline= nullptr; m_boundRayTracingPipeline= nullptr; @@ -862,16 +901,33 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject template requires nbl::is_any_of_v bool invalidDrawIndirectCount(const asset::SBufferBinding& indirectBinding, const asset::SBufferBinding& countBinding, const uint32_t maxDrawCount, const uint32_t stride); + core::smart_refctd_ptr* reserveReferences(const uint32_t size); // This bound descriptor set record doesn't include the descriptor sets whose layout has _any_ one of its bindings // created with IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT // or IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT. core::unordered_map m_boundDescriptorSetsRecord; - - // If the user wants the builds to be tracking, and make the TLAS remember the BLASes that have been built into it. - // NOTE: We know that a TLAS may be rebuilt multiple times per frame on purpose and not only the final BLASes need to be kept alive till submission finishes. - // However, the Command Pool already tracks resources referenced in the Build Infos, so we only need pointers into those records. - core::unordered_map> m_TLASToBLASReferenceSets; + + // If the user wants the builds and copies to be tracking, and make the TLAS remember the BLASes that have been built into it. + // The Command Pool already tracks resources referenced in the Build Infos or Copies From Memory (Deserializations), so we only need pointers into those records. 
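The recorded operations come in the three shapes declared just below: a write (a BLAS set captured at record time), a TLAS-to-TLAS copy, and a read that dumps the current set into a dynamic array when serializing. The write path is also what `recordBLASReferenceOverwrite` above feeds when the user recorded the TLAS build commands through the native handle; a rough usage sketch, assuming a hypothetical `cmdbuf`, `tlas`, `logger` and BLAS pointers:

    // not part of the patch, just an illustration of the override path
    const IGPUBottomLevelAccelerationStructure* blases[] = {blasA,blasB};
    if (!cmdbuf->recordBLASReferenceOverwrite(tlas.get(),std::begin(blases),std::end(blases)))
        logger->log("Failed to reserve space for the BLAS references!",system::ILogger::ELL_ERROR);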
+ struct TLASTrackingWrite + { + std::span> src; + IGPUTopLevelAccelerationStructure* dst; + }; + struct TLASTrackingCopy + { + const IGPUTopLevelAccelerationStructure* src; + IGPUTopLevelAccelerationStructure* dst; + }; + struct TLASTrackingRead + { + const IGPUTopLevelAccelerationStructure* src; + // For a copy to memory (Serialization), we need to dump the BLASes references + core::smart_refctd_dynamic_array dst; + }; + // operations as they'll be performed in order + core::vector> m_TLASTrackingOps; const IGPUGraphicsPipeline* m_boundGraphicsPipeline; const IGPUComputePipeline* m_boundComputePipeline; @@ -892,6 +948,13 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject NBL_ENUM_ADD_BITWISE_OPERATORS(IGPUCommandBuffer::USAGE); #ifndef _NBL_VIDEO_I_GPU_COMMAND_BUFFER_CPP_ +extern template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUBottomLevelAccelerationStructure::CopyInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUTopLevelAccelerationStructure::CopyInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyToMemoryInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyToMemoryInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); + extern template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUBottomLevelAccelerationStructure::DirectBuildRangeRangeInfos, const IGPUBuffer* const ); diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 49e44dfcc1..1b6cbd69f2 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -6,20 +6,21 @@ #include "nbl/asset/IPipeline.h" +#include "nbl/asset/IComputePipeline.h" -#include "nbl/video/SPipelineCreationParams.h" +#include "nbl/video/IGPUPipeline.h" #include "nbl/video/SPipelineCreationParams.h" namespace nbl::video { -class IGPUComputePipeline : public IBackendObject, public asset::IPipeline +class IGPUComputePipeline : public IGPUPipeline> { - using pipeline_t = asset::IPipeline; + using pipeline_t = asset::IComputePipeline; public: - struct SCreationParams final : pipeline_t::SCreationParams, SPipelineCreationParams + struct SCreationParams final : SPipelineCreationParams { // By construction we satisfy from: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkComputePipelineCreateInfo.html#VUID-VkComputePipelineCreateInfo-flags-03365 @@ -28,7 +29,7 @@ class IGPUComputePipeline : public IBackendObject, public asset::IPipeline(pipeline_t::SCreationParams::FLAGS::F) + #define base_flag(F) static_cast(pipeline_t::FLAGS::F) enum class FLAGS : uint64_t { NONE = base_flag(NONE), @@ -46,28 +47,35 @@ class IGPUComputePipeline : public IBackendObject, public asset::IPipeline getRequiredSubgroupStages() const + { + if (shader.shader && shader.requiredSubgroupSize >= asset::IPipelineBase::SUBGROUP_SIZE::REQUIRE_4) { - if (shader.entries->size()>0x7fffffff) - return {}; - count = static_cast(shader.entries->size()); + return hlsl::ESS_COMPUTE; } - return {.count=dataSize ? 
count:0,.dataSize=static_cast(dataSize)}; + return {}; } - inline std::span getShaders() const {return {&shader,1}; } - + IGPUPipelineLayout* layout = nullptr; // TODO: Could guess the required flags from SPIR-V introspection of declared caps core::bitflag flags = FLAGS::NONE; - IPipelineBase::SShaderSpecInfo shader = {}; + SCachedCreationParams cached = {}; + SShaderSpecInfo shader = {}; }; inline core::bitflag getCreationFlags() const {return m_flags;} @@ -76,10 +84,9 @@ class IGPUComputePipeline : public IBackendObject, public asset::IPipeline&& _layout, const core::bitflag _flags) : - IBackendObject(core::smart_refctd_ptr(_layout->getOriginDevice())), - pipeline_t(std::move(_layout)), - m_flags(_flags) {} + inline IGPUComputePipeline(const SCreationParams& params) : + IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), params.layout, params.cached), m_flags(params.flags) + {} virtual ~IGPUComputePipeline() = default; const core::bitflag m_flags; diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index 8240bcea94..7027252b0f 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -6,20 +6,21 @@ #include "nbl/video/IGPUPipelineLayout.h" #include "nbl/video/IGPURenderpass.h" +#include "nbl/video/IGPUPipeline.h" namespace nbl::video { -class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipeline +class IGPUGraphicsPipeline : public IGPUPipeline> { using pipeline_t = asset::IGraphicsPipeline; public: - struct SCreationParams final : pipeline_t::SCreationParams, SPipelineCreationParams - { + struct SCreationParams final : public SPipelineCreationParams + { public: - #define base_flag(F) static_cast(pipeline_t::SCreationParams::FLAGS::F) + #define base_flag(F) static_cast(pipeline_t::FLAGS::F) enum class FLAGS : uint64_t { NONE = base_flag(NONE), @@ -36,30 +37,79 @@ class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipel if (!layout) return {}; SSpecializationValidationResult retval = {.count=0,.dataSize=0}; - const bool valid = pipeline_t::SCreationParams::impl_valid([&retval](const IPipelineBase::SShaderSpecInfo& info)->bool + if (!layout) + return {}; + + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-dynamicRendering-06576 + if (!renderpass || cached.subpassIx>=renderpass->getSubpassCount()) + return {}; + + // TODO: check rasterization samples, etc. + //rp->getCreationParameters().subpasses[i] + + core::bitflag stagePresence = {}; + + auto processSpecInfo = [&](const SShaderSpecInfo& specInfo, hlsl::ShaderStage stage) { - const auto dataSize = info.valid(); - if (dataSize<0) - return false; - else if (dataSize==0) - return true; - - const size_t count = info.entries ? info.entries->size():0x80000000ull; - if (count>0x7fffffff) - return {}; - retval += {.count=dataSize ? 
static_cast(count):0,.dataSize=static_cast(dataSize)}; - return retval; - }); - if (!valid) + if (!specInfo.shader) return true; + if (!specInfo.accumulateSpecializationValidationResult(&retval)) return false; + stagePresence |= stage; + return true; + }; + if (!processSpecInfo(vertexShader, hlsl::ShaderStage::ESS_VERTEX)) return {}; + if (!processSpecInfo(tesselationControlShader, hlsl::ShaderStage::ESS_TESSELLATION_CONTROL)) return {}; + if (!processSpecInfo(tesselationEvaluationShader, hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)) return {}; + if (!processSpecInfo(geometryShader, hlsl::ShaderStage::ESS_GEOMETRY)) return {}; + if (!processSpecInfo(fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT)) return {}; + + if (!hasRequiredStages(stagePresence, cached.primitiveAssembly.primitiveType)) return {}; + + if (!vertexShader.shader) return {}; + return retval; } - inline std::span getShaders() const {return shaders;} + inline core::bitflag getRequiredSubgroupStages() const + { + core::bitflag stages = {}; + auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) + { + if (spec.shader && spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { + stages |= stage; + } + }; + processSpecInfo(vertexShader, hlsl::ESS_VERTEX); + processSpecInfo(tesselationControlShader, hlsl::ESS_TESSELLATION_CONTROL); + processSpecInfo(tesselationEvaluationShader, hlsl::ESS_TESSELLATION_EVALUATION); + processSpecInfo(geometryShader, hlsl::ESS_GEOMETRY); + processSpecInfo(fragmentShader, hlsl::ESS_FRAGMENT); + return stages; + } + + IGPUPipelineLayout* layout = nullptr; + SShaderSpecInfo vertexShader; + SShaderSpecInfo tesselationControlShader; + SShaderSpecInfo tesselationEvaluationShader; + SShaderSpecInfo geometryShader; + SShaderSpecInfo fragmentShader; + SCachedCreationParams cached = {}; + renderpass_t* renderpass = nullptr; // TODO: Could guess the required flags from SPIR-V introspection of declared caps core::bitflag flags = FLAGS::NONE; - }; + + inline uint32_t getShaderCount() const + { + uint32_t count = 0; + count += (vertexShader.shader != nullptr); + count += (tesselationControlShader.shader != nullptr); + count += (tesselationEvaluationShader.shader != nullptr); + count += (geometryShader.shader != nullptr); + count += (fragmentShader.shader != nullptr); + return count; + } + }; inline core::bitflag getCreationFlags() const {return m_flags;} @@ -67,9 +117,10 @@ class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipel virtual const void* getNativeHandle() const = 0; protected: - IGPUGraphicsPipeline(const SCreationParams& params) : IBackendObject(core::smart_refctd_ptr(params.layout->getOriginDevice())), - pipeline_t(params), m_flags(params.flags) {} - virtual ~IGPUGraphicsPipeline() = default; + IGPUGraphicsPipeline(const SCreationParams& params) : + IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), params.layout, params.cached, params.renderpass), m_flags(params.flags) + {} + virtual ~IGPUGraphicsPipeline() override = default; const core::bitflag m_flags; }; diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h new file mode 100644 index 0000000000..c22ad998db --- /dev/null +++ b/include/nbl/video/IGPUPipeline.h @@ -0,0 +1,149 @@ + + +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
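With the shader stages now explicit members of the graphics pipeline creation parameters above (instead of a single span), a caller fills in only the stages it uses and lets `valid()` enforce stage presence against the primitive topology. A minimal sketch, with `layout`, `renderpass`, `vsSpec`, `fsSpec` and `logger` as hypothetical caller-side objects:

    // not part of the patch, just an illustration of the intended call pattern
    IGPUGraphicsPipeline::SCreationParams params = {};
    params.layout = layout.get();
    params.renderpass = renderpass.get();
    params.cached.subpassIx = 0; // must be below renderpass->getSubpassCount() or valid() fails
    params.vertexShader = vsSpec; // a SShaderSpecInfo with at least `shader` and `entryPoint` set
    params.fragmentShader = fsSpec;
    if (!params.valid())
        logger->log("Graphics pipeline creation parameters are invalid!",system::ILogger::ELL_ERROR);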
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_I_GPU_PIPELINE_H_INCLUDED_ +#define _NBL_VIDEO_I_GPU_PIPELINE_H_INCLUDED_ + +#include "nbl/video/IGPUPipelineLayout.h" +#include "nbl/video/SPipelineCreationParams.h" +#include "nbl/asset/ICPUPipeline.h" +#include "nbl/asset/IPipeline.h" + +namespace nbl::video +{ + +class IGPUPipelineBase { + public: + struct SShaderSpecInfo + { + + //! Structure specifying a specialization map entry + /* + Note that if specialization constant ID is used + in a shader, \bsize\b and \boffset'b must match + to \isuch an ID\i accordingly. + + By design the API satisfies: + https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-offset-00773 + https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-pMapEntries-00774 + */ + //!< The ID of the specialization constant in SPIR-V. If it isn't used in the shader, the map entry does not affect the behavior of the pipeline. + using spec_constant_id_t = uint32_t; + + using SSpecConstantValue = std::span; + + inline SSpecConstantValue getSpecializationByteValue(const spec_constant_id_t _specConstID) const + { + if (!entries) return {}; + + const auto found = entries->find(_specConstID); + if (found != entries->end() && found->second.size()) return found->second; + else return {}; + } + + static constexpr int32_t INVALID_SPEC_INFO = -1; + inline int32_t valid() const + { + if (!shader) return INVALID_SPEC_INFO; + + // Impossible to check: https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pName-00707 + if (entryPoint.empty()) return INVALID_SPEC_INFO; + + // Impossible to efficiently check anything from: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-maxClipDistances-00708 + // to: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-06686 + // and from: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02756 + // to: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-module-08987 + + int64_t specData = 0; + if (entries) + { + for (const auto& entry : *entries) + { + if (!entry.second.size()) + return INVALID_SPEC_INFO; + specData += entry.second.size(); + } + } + if (specData>0x7fffffff) + return INVALID_SPEC_INFO; + return static_cast(specData); + } + + inline bool accumulateSpecializationValidationResult(SSpecializationValidationResult* retval) const + { + const auto dataSize = valid(); + if (dataSize < 0) + return false; + if (dataSize == 0) + return true; + + const size_t count = entries ? entries->size() : 0x80000000ull; + if (count > 0x7fffffff) + return false; + *retval += { + .count = dataSize ? 
static_cast(count) : 0, + .dataSize = static_cast(dataSize), + }; + return *retval; + } + + const asset::IShader* shader = nullptr; + std::string_view entryPoint = ""; + + asset::IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize = asset::IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement + + // Container choice implicitly satisfies: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 + using entry_map_t = core::unordered_map; + const entry_map_t* entries; + // By requiring Nabla Core Profile features we implicitly satisfy: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 + // Also because our API is sane, it satisfies the following by construction: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 + + + static inline SShaderSpecInfo create(const asset::ICPUPipelineBase::SShaderSpecInfo& cpuSpecInfo, entry_map_t* outEntries) + { + SShaderSpecInfo specInfo; + specInfo.shader = cpuSpecInfo.shader.get(); + specInfo.entryPoint = cpuSpecInfo.entryPoint; + specInfo.requiredSubgroupSize = cpuSpecInfo.requiredSubgroupSize; + outEntries->clear(); + for (const auto&[key, value] : cpuSpecInfo.entries) + { + outEntries->insert({ key, { value.data(), value.size() } }); + } + specInfo.entries = outEntries; + return specInfo; + }; + }; + + using SShaderEntryMap = SShaderSpecInfo::entry_map_t; + +}; + +// Common Base class for pipelines +template + requires (std::is_base_of_v, PipelineNonBackendObjectBase> && !std::is_base_of_v) +class IGPUPipeline : public IBackendObject, public PipelineNonBackendObjectBase, public IGPUPipelineBase +{ + protected: + + template + explicit IGPUPipeline(core::smart_refctd_ptr&& device, Args&&... 
args) : + PipelineNonBackendObjectBase(std::forward(args)...), IBackendObject(std::move(device)) + {} + virtual ~IGPUPipeline() = default; + +}; + +} + +#endif diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index fb8c371193..690e6685d3 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -10,28 +10,43 @@ namespace nbl::video { -class IGPURayTracingPipeline : public IBackendObject, public asset::IRayTracingPipeline +class IGPURayTracingPipeline : public IGPUPipeline> { using pipeline_t = asset::IRayTracingPipeline; public: - - struct SShaderGroupHandle + struct SHitGroup { - private: - uint8_t data[video::SPhysicalDeviceLimits::ShaderGroupHandleSize]; + SShaderSpecInfo closestHit; + SShaderSpecInfo anyHit; + SShaderSpecInfo intersection; }; - static_assert(sizeof(SShaderGroupHandle) == video::SPhysicalDeviceLimits::ShaderGroupHandleSize); - struct SHitGroupStackSize + struct SCreationParams : public SPipelineCreationParams { - uint16_t closestHit; - uint16_t anyHit; - uint16_t intersection; - }; + using FLAGS = pipeline_t::FLAGS; - struct SCreationParams final : pipeline_t::SCreationParams, SPipelineCreationParams - { + struct SShaderGroupsParams + { + + SShaderSpecInfo raygen; + std::span misses; + std::span hits; + std::span callables; + + inline uint32_t getShaderGroupCount() const + { + return 1 + hits.size() + misses.size() + callables.size(); + } + + }; + + IGPUPipelineLayout* layout = nullptr; + SShaderGroupsParams shaderGroups; + + SCachedCreationParams cached = {}; + // TODO: Could guess the required flags from SPIR-V introspection of declared caps + core::bitflag flags = FLAGS::NONE; inline SSpecializationValidationResult valid() const { @@ -39,32 +54,104 @@ class IGPURayTracingPipeline : public IBackendObject, public asset::IRayTracingP return {}; SSpecializationValidationResult retval = { - .count=0, - .dataSize=0, + .count = 0, + .dataSize = 0, }; - const bool valid = pipeline_t::SCreationParams::impl_valid([&retval](const asset::IPipelineBase::SShaderSpecInfo& info)->bool + + if (!shaderGroups.raygen.accumulateSpecializationValidationResult(&retval)) + return {}; + + for (const auto& shaderGroup : shaderGroups.hits) { - const auto dataSize = info.valid(); - if (dataSize<0) - return false; - else if (dataSize==0) - return true; - - const size_t count = info.entries ? info.entries->size():0x80000000ull; - if (count>0x7fffffff) + if (shaderGroup.intersection.shader) + { + if (!shaderGroup.intersection.accumulateSpecializationValidationResult(&retval)) return {}; - retval += {.count=dataSize ? 
static_cast(count):0,.dataSize=static_cast(dataSize)}; - return retval; - }); - if (!valid) - return {}; + } + + if (shaderGroup.closestHit.shader) + { + if (!shaderGroup.closestHit.accumulateSpecializationValidationResult(&retval)) + return {}; + } + + // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03470 + if (flags.hasFlags(FLAGS::NO_NULL_ANY_HIT_SHADERS) && !shaderGroup.anyHit.shader) + return {}; + + if (shaderGroup.anyHit.shader) + { + if (!shaderGroup.anyHit.accumulateSpecializationValidationResult(&retval)) + return {}; + } + + // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03471 + if (flags.hasFlags(FLAGS::NO_NULL_CLOSEST_HIT_SHADERS) && !shaderGroup.intersection.shader) + return {}; + } + + for (const auto& miss : shaderGroups.misses) + { + if (miss.shader) + { + if (!miss.accumulateSpecializationValidationResult(&retval)) + return {}; + } + } + + for (const auto& callable : shaderGroups.callables) + { + if (callable.shader) + { + if (!callable.accumulateSpecializationValidationResult(&retval)) + return {}; + } + } + + if (!shaderGroups.raygen.shader) return {}; + return retval; } - inline std::span getShaders() const { return shaders; } + inline core::bitflag getRequiredSubgroupStages() const + { + core::bitflag stages = {}; + auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) + { + if (spec.shader && spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { + stages |= stage; + } + }; + processSpecInfo(shaderGroups.raygen, hlsl::ESS_RAYGEN); + for (const auto& miss : shaderGroups.misses) + processSpecInfo(miss, hlsl::ESS_MISS); + for (const auto& hit : shaderGroups.hits) + { + processSpecInfo(hit.closestHit, hlsl::ESS_CLOSEST_HIT); + processSpecInfo(hit.anyHit, hlsl::ESS_ANY_HIT); + processSpecInfo(hit.intersection, hlsl::ESS_INTERSECTION); + } + for (const auto& callable : shaderGroups.callables) + processSpecInfo(callable, hlsl::ESS_CALLABLE); + return stages; + } }; + struct SShaderGroupHandle + { + private: + uint8_t data[video::SPhysicalDeviceLimits::ShaderGroupHandleSize]; + }; + static_assert(sizeof(SShaderGroupHandle) == video::SPhysicalDeviceLimits::ShaderGroupHandleSize); + + struct SHitGroupStackSize + { + uint16_t closestHit; + uint16_t anyHit; + uint16_t intersection; + }; + inline core::bitflag getCreationFlags() const { return m_flags; } // Vulkan: const VkPipeline* @@ -82,8 +169,7 @@ class IGPURayTracingPipeline : public IBackendObject, public asset::IRayTracingP virtual uint16_t getDefaultStackSize() const = 0; protected: - IGPURayTracingPipeline(const SCreationParams& params) : IBackendObject(core::smart_refctd_ptr(params.layout->getOriginDevice())), - pipeline_t(params), + IGPURayTracingPipeline(const SCreationParams& params) : IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), params.layout, params.cached), m_flags(params.flags) {} diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 49364f3a54..def3ee0979 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -3,7 +3,7 @@ #include "nbl/asset/asset.h" #include "nbl/asset/utils/ISPIRVOptimizer.h" -#include "nbl/asset/utils/ISPIRVDebloater.h" +#include "nbl/asset/utils/ISPIRVEntryPointTrimmer.h" #include "nbl/asset/utils/CCompilerSet.h" #include "nbl/video/SPhysicalDeviceFeatures.h" @@ -413,19 +413,20 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, 
public IDeviceMe }; // fun fact: you can use garbage/invalid pointers/offset for the Device/Host addresses of the per-geometry data, just make sure what was supposed to be null is null template requires nbl::is_any_of_v, - IGPUBottomLevelAccelerationStructure::Triangles, - IGPUBottomLevelAccelerationStructure::AABBs, - IGPUBottomLevelAccelerationStructure::AABBs + asset::IBottomLevelAccelerationStructure::Triangles, + asset::IBottomLevelAccelerationStructure::Triangles, + asset::IBottomLevelAccelerationStructure::AABBs, + asset::IBottomLevelAccelerationStructure::AABBs > inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes( - const core::bitflag flags, + const bool hostBuild, + const core::bitflag flags, const bool motionBlur, const std::span geometries, const uint32_t* const pMaxPrimitiveCounts ) const { - if (invalidFeaturesForASBuild(motionBlur)) + if (invalidFeaturesForASBuild(hostBuild,motionBlur)) { NBL_LOG_ERROR("Required features are not enabled"); return {}; @@ -456,6 +457,30 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe uint32_t primsFree = limits.maxAccelerationStructurePrimitiveCount; for (auto i=0u; igetBufferFormatUsages()[geom.vertexFormat].accelerationStructureVertex) + { + NBL_LOG_ERROR("Vertex Format %d not supported as Acceleration Structure Vertex Position Input on this Device",geom.vertexFormat); + return {}; + } + // TODO: do we check `maxVertex`, `vertexStride` and `indexType` for validity + } + if constexpr (Geometry::Type==asset::IBottomLevelAccelerationStructure::GeometryType::AABBs) + { + if (!flags.hasFlags(asset::IBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + { + NBL_LOG_ERROR("Primitive type is AABB but build flag says BLAS build is not AABBs"); + return {}; + } + // TODO: check stride and geometry flags for validity + } if (pMaxPrimitiveCounts[i] > primsFree) { NBL_LOG_ERROR("Primitive count exceeds device limit"); @@ -464,16 +489,16 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe primsFree -= pMaxPrimitiveCounts[i]; } - return getAccelerationStructureBuildSizes_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes( const bool hostBuild, - const core::bitflag flags, + const core::bitflag flags, const bool motionBlur, const uint32_t maxInstanceCount ) const { - if (invalidFeaturesForASBuild(motionBlur)) + if (invalidFeaturesForASBuild(hostBuild,motionBlur)) { NBL_LOG_ERROR("Required features are not enabled"); return {}; @@ -497,7 +522,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } // little utility template - inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes(const core::bitflag flags, const bool motionBlur, const uint32_t maxInstanceCount) const + inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes(const core::bitflag flags, const bool motionBlur, const uint32_t maxInstanceCount) const { return getAccelerationStructureBuildSizes(std::is_same_v,asset::ICPUBuffer>,flags,motionBlur,maxInstanceCount); } @@ -568,12 +593,14 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe { auto tlas = set.first; // we know the build is completed immediately after performing it, so we get our pending stamp then - 
tlas->setTrackedBLASes(set.second.begin(),set.second.end(),tlas->registerNextBuildVer()); + // ideally we should get our build version when the work of the deferred op gets executed for the first time + const auto buildVer = tlas->pushTrackedBLASes({set.second.begin()},{set.second.end()}); + tlas->clearTrackedBLASes(buildVer); } } // the rawpointers are already smartpointers in whatever else the `fillTracking` declared above writes - core::unordered_map> m_TLASToBLASReferenceSets; + core::unordered_map>> m_TLASToBLASReferenceSets; } callback = {}; auto& tracking = deferredOperation->m_resourceTracking; @@ -585,10 +612,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe if constexpr (IsTLAS) { const auto blasCount = info.trackedBLASes.size(); - if (blasCount) - callback.m_TLASToBLASReferenceSets[info.dstAS] = {reinterpret_cast(oit-blasCount),blasCount}; - else - callback.m_TLASToBLASReferenceSets[info.dstAS] = {}; + callback.m_TLASToBLASReferenceSets[info.dstAS] = {oit-blasCount,blasCount}; } } if constexpr (IsTLAS) @@ -633,7 +657,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return writeAccelerationStructuresProperties_impl(accelerationStructures,type,data,stride); } // Host-side copy, DEFERRAL IS NOT OPTIONAL - inline bool copyAccelerationStructure(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::CopyInfo& copyInfo) + template requires std::is_base_of_v + inline bool copyAccelerationStructure(IDeferredOperation* const deferredOperation, const AccelerationStructure::CopyInfo& copyInfo) { if (!acquireDeferredOperation(deferredOperation)) { @@ -647,15 +672,48 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } auto result = copyAccelerationStructure_impl(deferredOperation,copyInfo); if (result==DEFERRABLE_RESULT::DEFERRED) + { deferredOperation->m_resourceTracking.insert(deferredOperation->m_resourceTracking.begin(),{ core::smart_refctd_ptr(copyInfo.src), core::smart_refctd_ptr(copyInfo.dst) }); + constexpr bool IsTLAS = std::is_same_v; + if constexpr (IsTLAS) + { + struct TLASCallback + { + // upon completion set the BLASes tracked + inline void operator()(IDeferredOperation*) const + { + // not sure if even legal, but it would deadlock us + if (src==dst) + return; + uint32_t buildVer; + { + // stop multiple threads messing with us + std::lock_guard lk(src->m_trackingLock); + // we know the build is completed immediately after performing it, so we get our pending stamp then + // ideally we should get the BLAS set from the Source TLAS when the work of the deferred op gets executed for the first time + const auto* pSrcBLASes = src->getPendingBuildTrackedBLASes(src->getPendingBuildVer()); + const std::span emptySpan = {}; + buildVer = pSrcBLASes ? 
dst->pushTrackedBLASes(pSrcBLASes->begin(),pSrcBLASes->end()):dst->pushTrackedBLASes(emptySpan.begin(),emptySpan.end()); + } + dst->clearTrackedBLASes(buildVer); + } + + // the rawpointers are already smartpointers in whatever else the `fillTracking` declared above writes + const IGPUTopLevelAccelerationStructure* src; + IGPUTopLevelAccelerationStructure* dst; + } callback = {.src=copyInfo.src,.dst=copyInfo.dst}; + deferredOperation->m_callback = std::move(callback); + } + } return result!=DEFERRABLE_RESULT::SOME_ERROR; } - inline bool copyAccelerationStructureToMemory(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyToMemoryInfo& copyInfo) + template requires std::is_base_of_v + inline bool copyAccelerationStructureToMemory(IDeferredOperation* const deferredOperation, const AccelerationStructure::HostCopyToMemoryInfo& copyInfo) { if (!acquireDeferredOperation(deferredOperation)) { @@ -674,13 +732,43 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } auto result = copyAccelerationStructureToMemory_impl(deferredOperation,copyInfo); if (result==DEFERRABLE_RESULT::DEFERRED) + { deferredOperation->m_resourceTracking.insert(deferredOperation->m_resourceTracking.begin(),{ core::smart_refctd_ptr(copyInfo.src), core::smart_refctd_ptr(copyInfo.dst.buffer) }); + constexpr bool IsTLAS = std::is_same_v; + if constexpr (IsTLAS) + { + struct TLASCallback + { + // upon completion set the BLASes tracked + inline void operator()(IDeferredOperation*) const + { + // stop multiple threads messing with us + std::lock_guard lk(src->m_trackingLock); + // we know the build is completed immediately after performing it, so we get our pending stamp then + // ideally we should get the BLAS set from the Source TLAS when the work of the deferred op gets executed for the first time + const auto ver = src->getPendingBuildVer(); + uint32_t count = dst->size(); + src->getPendingBuildTrackedBLASes(&count,dst->data(),ver); + if (count>dst->size()) + logger->log("BLAS output array too small, should be %d, only wrote out %d BLAS references to destination",system::ILogger::ELL_ERROR,count,dst->size()); + } + + // device keeps it alive for entire lifetime of the callback + system::ILogger* logger; + // the rawpointers are already smartpointers in whatever else the `fillTracking` declared above writes + const IGPUTopLevelAccelerationStructure* src; + core::smart_refctd_dynamic_array dst; + } callback = {.logger=m_logger.get(),.src=copyInfo.src,.dst=copyInfo.trackedBLASes}; + deferredOperation->m_callback = std::move(callback); + } + } return result!=DEFERRABLE_RESULT::SOME_ERROR; } - inline bool copyAccelerationStructureFromMemory(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyFromMemoryInfo& copyInfo) + template requires std::is_base_of_v + inline bool copyAccelerationStructureFromMemory(IDeferredOperation* const deferredOperation, const AccelerationStructure::HostCopyFromMemoryInfo& copyInfo) { if (!acquireDeferredOperation(deferredOperation)) { @@ -699,10 +787,32 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } auto result = copyAccelerationStructureFromMemory_impl(deferredOperation,copyInfo); if (result==DEFERRABLE_RESULT::DEFERRED) + { deferredOperation->m_resourceTracking.insert(deferredOperation->m_resourceTracking.begin(),{ core::smart_refctd_ptr(copyInfo.src.buffer), core::smart_refctd_ptr(copyInfo.dst) }); + constexpr bool IsTLAS = std::is_same_v; + if constexpr (IsTLAS) + { 
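// The block below appends the caller-supplied `trackedBLASes` to the deferred operation's
// resource-tracking array so those BLAS references stay alive until the host copy completes,
// then installs a completion callback that publishes the BLASes referenced by exactly that span
// as a fresh tracked set on the destination TLAS (pushTrackedBLASes) and erases the sets recorded
// for all older build versions (clearTrackedBLASes).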
+ const size_t offset = deferredOperation->m_resourceTracking.size(); + deferredOperation->m_resourceTracking.insert(deferredOperation->m_resourceTracking.end(),copyInfo.trackedBLASes.begin(),copyInfo.trackedBLASes.end()); + struct TLASCallback + { + // upon completion set the BLASes tracked + inline void operator()(IDeferredOperation*) const + { + const auto buildVer = dst->pushTrackedBLASes({src->begin()},{src->end()}); + dst->clearTrackedBLASes(buildVer); + } + + // the rawpointers are already smartpointers in whatever else the `fillTracking` declared above writes + std::span> src; + IGPUTopLevelAccelerationStructure* dst; + } callback = {.src={deferredOperation->m_resourceTracking.data()+offset,copyInfo.trackedBLASes.size()},.dst=copyInfo.dst}; + deferredOperation->m_callback = std::move(callback); + } + } return result!=DEFERRABLE_RESULT::SOME_ERROR; } @@ -725,8 +835,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe // Create a pipeline layout (@see ICPUPipelineLayout) core::smart_refctd_ptr createPipelineLayout( const std::span pcRanges={}, - core::smart_refctd_ptr&& _layout0=nullptr, core::smart_refctd_ptr&& _layout1=nullptr, - core::smart_refctd_ptr&& _layout2=nullptr, core::smart_refctd_ptr&& _layout3=nullptr + core::smart_refctd_ptr&& _layout0=nullptr, core::smart_refctd_ptr&& _layout1=nullptr, + core::smart_refctd_ptr&& _layout2=nullptr, core::smart_refctd_ptr&& _layout3=nullptr ) { if ((_layout0 && !_layout0->wasCreatedBy(this))) @@ -1020,20 +1130,20 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual core::smart_refctd_ptr createTopLevelAccelerationStructure_impl(IGPUTopLevelAccelerationStructure::SCreationParams&& params) = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( const bool hostBuild, const core::bitflag flags, @@ -1055,16 +1165,16 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe const IGPUTopLevelAccelerationStructure::BuildRangeInfo* const pBuildRangeInfos, const uint32_t 
totalGeometryCount ) = 0; virtual bool writeAccelerationStructuresProperties_impl(const std::span accelerationStructures, const IQueryPool::TYPE type, size_t* data, const size_t stride) = 0; - virtual DEFERRABLE_RESULT copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::CopyInfo& copyInfo) = 0; - virtual DEFERRABLE_RESULT copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyToMemoryInfo& copyInfo) = 0; - virtual DEFERRABLE_RESULT copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyFromMemoryInfo& copyInfo) = 0; + virtual DEFERRABLE_RESULT copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) = 0; + virtual DEFERRABLE_RESULT copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) = 0; + virtual DEFERRABLE_RESULT copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) = 0; constexpr static inline auto MaxStagesPerPipeline = 6u; virtual core::smart_refctd_ptr createDescriptorSetLayout_impl(const std::span bindings, const uint32_t maxSamplersCount) = 0; virtual core::smart_refctd_ptr createPipelineLayout_impl( const std::span pcRanges, - core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, - core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3 + core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, + core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3 ) = 0; virtual core::smart_refctd_ptr createDescriptorPool_impl(const IDescriptorPool::SCreateInfo& createInfo) = 0; @@ -1096,8 +1206,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual core::smart_refctd_ptr createRenderpass_impl(const IGPURenderpass::SCreationParams& params, IGPURenderpass::SCreationParamValidationResult&& validation) = 0; virtual core::smart_refctd_ptr createFramebuffer_impl(IGPUFramebuffer::SCreationParams&& params) = 0; - template - inline CreationParams::SSpecializationValidationResult commonCreatePipelines(IGPUPipelineCache* const pipelineCache, const std::span params, ExtraLambda&& extra) + template + inline SSpecializationValidationResult commonCreatePipelines(IGPUPipelineCache* const pipelineCache, const std::span params) { if (pipelineCache && !pipelineCache->wasCreatedBy(this)) { @@ -1110,7 +1220,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return {}; } - typename CreationParams::SSpecializationValidationResult retval = {.count=0,.dataSize=0}; + SSpecializationValidationResult retval = {.count=0,.dataSize=0}; for (auto i=0; i createInfos, core::smart_refctd_ptr* const output, - const IGPUComputePipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) = 0; virtual void createGraphicsPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output, - const IGPUGraphicsPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) = 0; virtual void createRayTracingPipelines_impl( 
IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPURayTracingPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) = 0; virtual core::smart_refctd_ptr createQueryPool_impl(const IQueryPool::SCreationParams& params) = 0; @@ -1262,7 +1315,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe uint16_t firstQueueIndex = 0u; }; const std::array m_queueFamilyInfos; - core::smart_refctd_ptr m_spirvDebloater; + core::smart_refctd_ptr m_spirvTrimmer; private: const SPhysicalDeviceLimits& getPhysicalDeviceLimits() const; @@ -1340,8 +1393,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } return false; } - template - bool invalidFeaturesForASBuild(const bool motionBlur) const + bool invalidFeaturesForASBuild(const bool hostBuild, const bool motionBlur) const { // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkGetAccelerationStructureBuildSizesKHR-accelerationStructure-08933 if (!m_enabledFeatures.accelerationStructure) @@ -1350,7 +1402,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return true; } // not sure of VUID - if (std::is_same_v && !m_enabledFeatures.accelerationStructureHostCommands) + if (hostBuild && !m_enabledFeatures.accelerationStructureHostCommands) { NBL_LOG_ERROR("Feature `acceleration structure` host commands is not enabled"); return true; @@ -1535,7 +1587,7 @@ inline bool ILogicalDevice::validateMemoryBarrier(const uint32_t queueFamilyInde return false; }; // CANNOT CHECK: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkImageMemoryBarrier2-oldLayout-01197 - if (mismatchedLayout.operator()(barrier.oldLayout) || mismatchedLayout.operator()(barrier.newLayout)) + if (mismatchedLayout.template operator()(barrier.oldLayout) || mismatchedLayout.template operator()(barrier.newLayout)) return false; } diff --git a/include/nbl/video/IQueue.h b/include/nbl/video/IQueue.h index 28336b15cc..63073beb33 100644 --- a/include/nbl/video/IQueue.h +++ b/include/nbl/video/IQueue.h @@ -125,12 +125,7 @@ class IQueue : public core::Interface, public core::Unmovable class DeferredSubmitCallback final { // - struct STLASBuildMetadata - { - core::unordered_set m_BLASes; - uint32_t m_buildVer; - }; - core::unordered_map m_TLASToBLASReferenceSets; + core::unordered_map m_TLASOverwrites; // using smart_ptr = core::smart_refctd_ptr; core::smart_refctd_dynamic_array m_resources; diff --git a/include/nbl/video/ISwapchain.h b/include/nbl/video/ISwapchain.h index d052a819bd..882ac16648 100644 --- a/include/nbl/video/ISwapchain.h +++ b/include/nbl/video/ISwapchain.h @@ -454,21 +454,22 @@ class ISwapchain : public IBackendObject { return params.deduce(getOriginDevice()->getPhysicalDevice(),m_params.surface.get(),{&m_params.sharedParams.presentMode.value,1},{&m_params.sharedParams.compositeAlpha.value,1},{&m_params.sharedParams.preTransform.value,1}); } - inline core::smart_refctd_ptr recreate(SSharedCreationParams params={}) + inline core::smart_refctd_ptr recreate(SSharedCreationParams params) { if (!deduceRecreationParams(params)) return nullptr; return recreate_impl(std::move(params)); } + inline core::smart_refctd_ptr recreate() { return recreate({}); } // Vulkan: const VkSwapchainKHR* virtual const void* getNativeHandle() const = 0; // returns the maximum number of time acquires with infinite timeout 
which can be called before releasing the image index through present. - virtual uint8_t getMaxBlockingAcquiresBeforePresent() const = 0u; + virtual uint8_t getMaxBlockingAcquiresBeforePresent() const = 0; // returns the maximum number of acquires you can request without waiting for previous acquire semaphores to signal. - virtual uint8_t getMaxAcquiresInFlight() const = 0u; + virtual uint8_t getMaxAcquiresInFlight() const = 0; // only public because MultiTimelineEventHandlerST needs to know about it class DeferredFrameSemaphoreDrop final diff --git a/include/nbl/video/SPipelineCreationParams.h b/include/nbl/video/SPipelineCreationParams.h index 489bff4343..3a25560ae4 100644 --- a/include/nbl/video/SPipelineCreationParams.h +++ b/include/nbl/video/SPipelineCreationParams.h @@ -11,6 +11,31 @@ namespace nbl::video { +struct SSpecializationValidationResult +{ + constexpr static inline uint32_t Invalid = ~0u; + inline operator bool() const + { + return count!=Invalid && dataSize!=Invalid; + } + + inline SSpecializationValidationResult& operator+=(const SSpecializationValidationResult& other) + { + // TODO: check for overflow before adding + if (*this && other) + { + count += other.count; + dataSize += other.dataSize; + } + else + *this = {}; + return *this; + } + + uint32_t count = Invalid; + uint32_t dataSize = Invalid; +}; + // For now, due to API design we implicitly satisfy: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-08771 // to: @@ -18,30 +43,6 @@ namespace nbl::video template struct SPipelineCreationParams { - struct SSpecializationValidationResult - { - constexpr static inline uint32_t Invalid = ~0u; - inline operator bool() const - { - return count!=Invalid && dataSize!=Invalid; - } - - inline SSpecializationValidationResult& operator+=(const SSpecializationValidationResult& other) - { - // TODO: check for overflow before adding - if (*this && other) - { - count += other.count; - dataSize += other.dataSize; - } - else - *this = {}; - return *this; - } - - uint32_t count = Invalid; - uint32_t dataSize = Invalid; - }; constexpr static inline int32_t NotDerivingFromPreviousPipeline = -1; inline bool isDerivative() const diff --git a/include/nbl/video/TimelineEventHandlers.h b/include/nbl/video/TimelineEventHandlers.h index 9405accf78..a3d6aa4c8b 100644 --- a/include/nbl/video/TimelineEventHandlers.h +++ b/include/nbl/video/TimelineEventHandlers.h @@ -410,7 +410,7 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable sum += handler->count(); else { - const auto local = handler->poll_impl(std::forward(args)...); + const auto local = handler->template poll_impl(std::forward(args)...); bailed = local.bailed; // if don't have any events left, remove the timeline if (local.eventsLeft) diff --git a/include/nbl/video/asset_traits.h b/include/nbl/video/asset_traits.h index ee7d068ef3..5b085b2d3b 100644 --- a/include/nbl/video/asset_traits.h +++ b/include/nbl/video/asset_traits.h @@ -193,7 +193,7 @@ struct asset_traits // the asset type using asset_t = asset::ICPUBottomLevelAccelerationStructure; // we don't need to descend during DFS into other assets - constexpr static inline bool HasChildren = true; + constexpr static inline bool HasChildren = false; // the video type using video_t = IGPUBottomLevelAccelerationStructure; // lookup type diff --git a/include/nbl/video/declarations.h b/include/nbl/video/declarations.h index ecec442366..2fdfe28e3c 100644 --- 
a/include/nbl/video/declarations.h +++ b/include/nbl/video/declarations.h @@ -34,7 +34,6 @@ #include "nbl/video/utilities/CDrawIndirectAllocator.h" #include "nbl/video/utilities/CSubpassKiln.h" #include "nbl/video/utilities/IUtilities.h" -#include "nbl/video/utilities/IGPUObjectFromAssetConverter.h" #include "nbl/video/utilities/SPhysicalDeviceFilter.h" #include "nbl/video/utilities/CSimpleResizeSurface.h" #include "nbl/video/utilities/CSmoothResizeSurface.h" diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index db61ee7857..935b79b1e5 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -159,7 +159,7 @@ class CAssetConverter : public core::IReferenceCounted //! select build flags uint8_t allowUpdate : 1 = false; uint8_t allowCompaction : 1 = false; - BuildPreference preference : 2 = BuildPreference::Invalid; + BuildPreference preference : 2 = BuildPreference::None; uint8_t lowMemory : 1 = false; //! things that control the build uint8_t hostBuild : 1 = false; // DO NOT USE, will get overriden to false anyway @@ -171,7 +171,7 @@ class CAssetConverter : public core::IReferenceCounted template std::pair combine_impl(const CRTP& _this, const CRTP& other) const { - if (_this.preference!=other.preference || _this.preference==BuildPreference::Invalid) + if (_this.preference!=other.preference && _this.preference!=BuildPreference::None && other.preference!=BuildPreference::None) return {false,_this}; CRTP retval = _this; retval.isMotion |= other.isMotion; @@ -887,6 +887,9 @@ class CAssetConverter : public core::IReferenceCounted IGPUPipelineCache* pipelineCache = nullptr; // optional, defaults to the device IDeviceMemoryAllocator* allocator = nullptr; + // optional, defaults to worst case (Apple Silicon page size) + uint32_t scratchForDeviceASBuildMinAllocSize = 1<<14; + uint32_t scratchForHostASBuildMinAllocSize = 1<<14; }; // Split off from inputs because only assets that build on IPreHashed need uploading struct SConvertParams @@ -943,7 +946,8 @@ class CAssetConverter : public core::IReferenceCounted uint32_t sampledImageBindingCount = 1<<10; uint32_t storageImageBindingCount = 11<<10; // specific to Acceleration Structure Build, they need to be at least as large as the largest amount of scratch required for an AS build - CAsyncSingleBufferSubAllocatorST>* scratchForDeviceASBuild = nullptr; + using scratch_for_device_AS_build_t = CAsyncSingleBufferSubAllocatorST>; + scratch_for_device_AS_build_t* scratchForDeviceASBuild = nullptr; std::pmr::memory_resource* scratchForHostASBuild = nullptr; // needs to service allocations without limit, unlike the above where failure will just force a flush and performance of already queued up builds IDeviceMemoryAllocator* compactedASAllocator = nullptr; @@ -957,7 +961,14 @@ class CAssetConverter : public core::IReferenceCounted public: template - using staging_cache_t = core::unordered_map::video_t*,typename CCache::key_t>; + struct staging_cache_key + { + core::smart_refctd_ptr::video_t> gpuRef; + typename CCache::key_t cacheKey; + }; + // it may seem weird storing both a smart pointer and a raw pointer, but the reason is to be able to drop a refcount while not loosing the key for lookup + template + using staging_cache_t = core::unordered_map::video_t*,staging_cache_key>; inline SReserveResult(SReserveResult&&) = default; inline SReserveResult(const SReserveResult&) = delete; @@ -987,7 +998,12 @@ class CAssetConverter : public 
core::IReferenceCounted assert(m_minASBuildScratchSize[forHostOps]<=m_maxASBuildScratchSize[forHostOps]); return m_maxASBuildScratchSize[forHostOps]; } -// TODO: `getMinCompactedASAllocatorSpace` + // We do all compactions on the Device for simplicity + inline uint64_t getMinCompactedASAllocatorSpace() const + { + assert(m_compactedASMaxMemory == 0 || willDeviceASBuild() || willHostASBuild()); + return m_compactedASMaxMemory; + } // tells you if you need to provide a valid `SConvertParams::scratchForDeviceASBuild` inline bool willDeviceASBuild() const {return getMinASBuildScratchSize(false)>0;} // tells you if you need to provide a valid `SConvertParams::scratchForHostASBuild` @@ -1000,8 +1016,7 @@ class CAssetConverter : public core::IReferenceCounted // tells you if you need to provide a valid `SConvertParams::compactedASAllocator` inline bool willCompactAS() const { - assert(!m_willCompactSomeAS || willDeviceASBuild() || willHostASBuild()); - return m_willCompactSomeAS; + return getMinCompactedASAllocatorSpace()!=0; } // @@ -1044,21 +1059,10 @@ class CAssetConverter : public core::IReferenceCounted return enqueueSuccess; } - // public only because `GetDependantVisit` needs it - struct SDeferredTLASWrite - { - inline bool operator==(const SDeferredTLASWrite& other) const - { - return dstSet == other.dstSet && binding == other.binding && arrayElement == other.arrayElement; - } - - IGPUDescriptorSet* dstSet; - uint32_t binding; - uint32_t arrayElement; - core::smart_refctd_ptr tlas; - }; private: friend class CAssetConverter; + // internal classes + template friend class GetDependantVisit; inline SReserveResult() = default; @@ -1078,69 +1082,68 @@ class CAssetConverter : public core::IReferenceCounted core::vector> m_shaders; // need a more explicit list of GPU objects that need device-assisted conversion - template - struct SConversionRequestBase - { - // canonical asset (the one that provides content) - core::smart_refctd_ptr canonical; - // gpu object to transfer canonical's data to or build it from - asset_traits::video_t* gpuObj; - }; - using SConvReqBuffer = SConversionRequestBase; - core::vector m_bufferConversions; - struct SConvReqImage : SConversionRequestBase + core::unordered_map> m_bufferConversions; + struct SConvReqImage { + core::smart_refctd_ptr canonical = nullptr; uint16_t recomputeMips = 0; }; - core::vector m_imageConversions; + core::unordered_map m_imageConversions; template - struct SConvReqAccelerationStructure : SConversionRequestBase + struct SConvReqAccelerationStructure { - constexpr static inline uint64_t WontCompact = (0x1ull<<48)-1; - inline bool compact() const {return compactedASWriteOffset!=WontCompact;} - using build_f = typename asset_traits::video_t::BUILD_FLAGS; inline void setBuildFlags(const build_f _flags) {buildFlags = static_cast(_flags);} inline build_f getBuildFlags() const {return static_cast(buildFlags);} - - uint64_t scratchSize; - uint64_t compactedASWriteOffset : 48 = WontCompact; - uint64_t buildFlags : 16 = static_cast(build_f::NONE); + core::smart_refctd_ptr canonical = nullptr; + uint64_t scratchSize : 47 = 0; + uint64_t buildFlags : 16 = 0; + uint64_t compact : 1; + // scratch + input size also accounting for worst case padding due to alignment + uint64_t buildSize; }; - using SConvReqBLAS = SConvReqAccelerationStructure; - core::vector m_blasConversions[2]; - using SConvReqTLAS = SConvReqAccelerationStructure; - core::vector m_tlasConversions[2]; + using SConvReqBLASMap = core::unordered_map>; + SConvReqBLASMap m_blasConversions[2]; 
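	// A minimal usage sketch for the scratch/compaction queries declared above, assuming `reservation` is the
	// SReserveResult returned by CAssetConverter::reserve and `deviceScratchAllocator` is an already created
	// SConvertParams::scratch_for_device_AS_build_t* (both names are illustrative, not part of this header):
	//   CAssetConverter::SConvertParams params = {};
	//   if (reservation.willDeviceASBuild())
	//       params.scratchForDeviceASBuild = deviceScratchAllocator; // its buffer must cover at least getMinASBuildScratchSize(false)
	//   if (reservation.willCompactAS())
	//   {
	//       // all compactions run on the device, so budget at least getMinCompactedASAllocatorSpace() bytes
	//       // of device memory for `params.compactedASAllocator`
	//   }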
+ struct SConvReqTLAS : SConvReqAccelerationStructure + { + // This tracks non-root BLASes which are needed for a subsequent TLAS build. + // Because the copy group ID of the BLAS can only depend on the copy group and pointer of the TLAS and BLAS, + // we can be sure that all instances of the same BLAS within a TLAS will have the same copy group ID and use a map instead of a vector for storage + // Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes prematurely. + using cpu_to_gpu_blas_map_t = core::unordered_map>; + cpu_to_gpu_blas_map_t instanceMap; + }; + using SConvReqTLASMap = core::unordered_map; + SConvReqTLASMap m_tlasConversions[2]; - // 0 for device builds, 1 for host builds + // array index 0 for device builds, 1 for host builds uint64_t m_minASBuildScratchSize[2] = {0,0}; uint64_t m_maxASBuildScratchSize[2] = {0,0}; -// TODO: make the compaction count the size - // We do all compactions on the Device for simplicity - uint8_t m_willCompactSomeAS : 1 = false; - // This tracks non-root BLASes which are needed for a subsequent TLAS build. Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes early. - struct BLASUsedInTLASBuild + uint64_t m_compactedASMaxMemory = 0; + // + struct SDeferredTLASWrite { - // This is the BLAS meant to be used for the instance, note that compaction of a BLAS overwrites the initial values at the end of `reserve` - core::smart_refctd_ptr gpuBLAS; - uint64_t buildDuringConvertCall : 1 = false; - // internal micro-refcount which lets us know when we should remove the entry from the map below - uint64_t remainingUsages : 63 = 0; + inline bool operator==(const SDeferredTLASWrite& other) const + { + return dstSet==other.dstSet && storageOffset.data==other.storageOffset.data; + } + + IGPUDescriptorSet* dstSet; + // binding and array element rolled up into one + IGPUDescriptorSetLayout::CBindingRedirect::storage_offset_t storageOffset; }; - using cpu_to_gpu_blas_map_t = core::unordered_map; - cpu_to_gpu_blas_map_t m_blasBuildMap; struct SDeferredTLASWriteHasher { inline size_t operator()(const SDeferredTLASWrite& write) const { - size_t retval = std::bit_cast(write.dstSet); - core::hash_combine(retval,write.binding); - core::hash_combine(retval,write.arrayElement); + size_t retval = write.storageOffset.data; + core::hash_combine(retval,write.dstSet); return retval; } }; - core::unordered_set m_deferredTLASDescriptorWrites; + using compacted_tlas_rewrite_set_t = core::unordered_set; + compacted_tlas_rewrite_set_t m_potentialTLASRewrites; // core::bitflag m_queueFlags = IQueue::FAMILY_FLAGS::NONE; diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h index 9a02915187..66f6871dc6 100644 --- a/include/nbl/video/utilities/CComputeBlit.h +++ b/include/nbl/video/utilities/CComputeBlit.h @@ -67,7 +67,7 @@ class CComputeBlit : public core::IReferenceCounted // required CAssetConverter* converter; // in theory we _could_ accept either pipeline layout type (or just the base) and make the CPU one back from the GPU - const asset::ICPUPipelineLayout* layout; + asset::ICPUPipelineLayout* layout; // must be Uniform Texel Buffer descriptor type hlsl::SBindingInfo kernelWeights; // must be Sampled Image descriptor type diff --git a/include/nbl/video/utilities/CSubpassKiln.h b/include/nbl/video/utilities/CSubpassKiln.h index 7df6cc0caa..c41ec3dd7e 100644 --- a/include/nbl/video/utilities/CSubpassKiln.h 
+++ b/include/nbl/video/utilities/CSubpassKiln.h @@ -198,7 +198,7 @@ class CSubpassKiln if (begin==end) return; - bake_impl(cmdbuf->getOriginDevice()->getPhysicalDevice()->getLimits().indirectDrawCount, drawIndirectBuffer, drawCountBuffer)(cmdbuf, begin, end); + bake_impl(cmdbuf->getOriginDevice()->getPhysicalDevice()->getLimits().drawIndirectCount, drawIndirectBuffer, drawCountBuffer)(cmdbuf, begin, end); } protected: diff --git a/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h b/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h deleted file mode 100644 index 600197611b..0000000000 --- a/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h +++ /dev/null @@ -1,331 +0,0 @@ -// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_VIDEO_I_GPU_OBJECT_FROM_ASSET_CONVERTER_H_INCLUDED_ -#define _NBL_VIDEO_I_GPU_OBJECT_FROM_ASSET_CONVERTER_H_INCLUDED_ - -#include "nbl/core/declarations.h" -#include "nbl/core/alloc/LinearAddressAllocator.h" - -#include "nbl/video/ISemaphore.h" -#include "nbl/video/ILogicalDevice.h" - -#if 0 -auto IGPUObjectFromAssetConverter::create(const asset::ICPUAccelerationStructure** _begin, const asset::ICPUAccelerationStructure** _end, SParams& _params) -> created_gpu_object_array -{ - const size_t assetCount = std::distance(_begin, _end); - auto res = core::make_refctd_dynamic_array >(assetCount); - auto toCreateAndBuild = std::vector(); - auto buildRangeInfos = std::vector(); - toCreateAndBuild.reserve(assetCount); - buildRangeInfos.reserve(assetCount); - // Lambda function: creates the acceleration structure and It's buffer - auto allocateBufferAndCreateAccelerationStructure = [&](size_t asSize, const asset::ICPUAccelerationStructure* cpuas) - { - // Create buffer with cpuas->getAccelerationStructureSize - IGPUBuffer::SCreationParams gpuBufParams = {}; - gpuBufParams.size = asSize; - gpuBufParams.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - auto gpubuf = _params.device->createBuffer(std::move(gpuBufParams)); - auto mreqs = gpubuf->getMemoryReqs(); - mreqs.memoryTypeBits &= _params.device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto gpubufMem = _params.device->allocate(mreqs, gpubuf.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - assert(gpubufMem.isValid()); - - // Create GPUAccelerationStructure with that buffer - IGPUAccelerationStructure::SCreationParams creatationParams = {}; - creatationParams.bufferRange.buffer = gpubuf; - creatationParams.bufferRange.offset = 0; - creatationParams.bufferRange.size = asSize; - creatationParams.flags = cpuas->getCreationParameters().flags; - creatationParams.type = cpuas->getCreationParameters().type; - return _params.device->createAccelerationStructure(std::move(creatationParams)); - }; - - for (ptrdiff_t i = 0u; i < assetCount; ++i) - { - const asset::ICPUAccelerationStructure* cpuas = _begin[i]; - - if(cpuas->hasBuildInfo()) - { - // Add to toBuild vector of ICPUAccelerationStructure - toCreateAndBuild.push_back(cpuas); - buildRangeInfos.push_back(const_cast(cpuas->getBuildRanges().begin())); - } - else if(cpuas->getAccelerationStructureSize() > 0) - { - res->operator[](i) = allocateBufferAndCreateAccelerationStructure(cpuas->getAccelerationStructureSize(), cpuas); - } - } - - if(toCreateAndBuild.empty() == false) - { - bool hostBuildCommands = false; 
// get from SFeatures - if(hostBuildCommands) - { - _NBL_TODO(); - } - else - { - core::vector cpuBufferDeps; - constexpr uint32_t MaxGeometryPerBuildInfo = 16; - constexpr uint32_t MaxBuffersPerGeometry = 3; // TrianglesData -> vertex+index+transformation - cpuBufferDeps.reserve(assetCount * MaxGeometryPerBuildInfo * MaxBuffersPerGeometry); - - // Get CPUBuffer Dependencies - for (ptrdiff_t i = 0u; i < toCreateAndBuild.size(); ++i) - { - const asset::ICPUAccelerationStructure* cpuas = toCreateAndBuild[i]; - - auto buildInfo = cpuas->getBuildInfo(); - assert(buildInfo != nullptr); - - auto geoms = buildInfo->getGeometries().begin(); - auto geomsCount = buildInfo->getGeometries().size(); - if(geomsCount == 0) - { - assert(false); - continue; - } - - for(uint32_t g = 0; g < geomsCount; ++g) - { - const auto& geom = geoms[g]; - if(geom.type == asset::IAccelerationStructure::EGT_TRIANGLES) - { - if(geom.data.triangles.indexData.isValid()) - { - auto cpuBuf = geom.data.triangles.indexData.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - if(geom.data.triangles.vertexData.isValid()) - { - auto cpuBuf = geom.data.triangles.vertexData.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - if(geom.data.triangles.transformData.isValid()) - { - auto cpuBuf = geom.data.triangles.transformData.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - } - else if(geom.type == asset::IAccelerationStructure::EGT_AABBS) - { - if(geom.data.aabbs.data.isValid()) - { - auto cpuBuf = geom.data.aabbs.data.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - } - else if(geom.type == asset::IAccelerationStructure::EGT_INSTANCES) - { - if(geom.data.instances.data.isValid()) - { - auto cpuBuf = geom.data.instances.data.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - } - } - } - - // Convert CPUBuffer Deps to GPUBuffers - core::vector redirs = eliminateDuplicatesAndGenRedirs(cpuBufferDeps); - auto gpuBufs = getGPUObjectsFromAssets(cpuBufferDeps.data(), cpuBufferDeps.data()+cpuBufferDeps.size(), _params); - _params.waitForCreationToComplete(); - _params.beginCommandBuffers(); - size_t bufIter = 0ull; - - // Fill buildGeomInfos partially (to later ge Get AS Size before build command) - std::vector buildGeomInfos(toCreateAndBuild.size()); - - using GPUGeometry = IGPUAccelerationStructure::Geometry; - std::vector gpuGeoms; - gpuGeoms.reserve(assetCount * MaxGeometryPerBuildInfo); - - for (ptrdiff_t i = 0u; i < toCreateAndBuild.size(); ++i) - { - const asset::ICPUAccelerationStructure* cpuas = toCreateAndBuild[i]; - - auto cpuBuildInfo = cpuas->getBuildInfo(); - auto & gpuBuildInfo = buildGeomInfos[i]; - - gpuBuildInfo.type = cpuBuildInfo->type; - gpuBuildInfo.buildFlags = cpuBuildInfo->buildFlags; - gpuBuildInfo.buildMode = 
cpuBuildInfo->buildMode; - assert(cpuBuildInfo->buildMode == asset::IAccelerationStructure::EBM_BUILD); - - // Fill Later: - gpuBuildInfo.srcAS = nullptr; - gpuBuildInfo.dstAS = nullptr; - gpuBuildInfo.scratchAddr = {}; - - auto cpu_geoms = cpuBuildInfo->getGeometries().begin(); - auto geomsCount = cpuBuildInfo->getGeometries().size(); - if(geomsCount == 0) - { - assert(false); - continue; - } - - size_t startGeom = gpuGeoms.size(); - size_t endGeom = gpuGeoms.size() + geomsCount; - - for(uint32_t g = 0; g < geomsCount; ++g) - { - const auto& cpu_geom = cpu_geoms[g]; - - GPUGeometry gpu_geom = {}; - gpu_geom.type = cpu_geom.type; - gpu_geom.flags = cpu_geom.flags; - - if(cpu_geom.type == asset::IAccelerationStructure::EGT_TRIANGLES) - { - gpu_geom.data.triangles.vertexFormat = cpu_geom.data.triangles.vertexFormat; - gpu_geom.data.triangles.vertexStride = cpu_geom.data.triangles.vertexStride; - gpu_geom.data.triangles.maxVertex = cpu_geom.data.triangles.maxVertex; - gpu_geom.data.triangles.indexType = cpu_geom.data.triangles.indexType; - - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.triangles.indexData.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.triangles.indexData.offset = gpubuf->getOffset() + cpu_geom.data.triangles.indexData.offset; - } - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.triangles.vertexData.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.triangles.vertexData.offset = gpubuf->getOffset() + cpu_geom.data.triangles.vertexData.offset; - } - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.triangles.transformData.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.triangles.transformData.offset = gpubuf->getOffset() + cpu_geom.data.triangles.transformData.offset; - } - } - else if(cpu_geom.type == asset::IAccelerationStructure::EGT_AABBS) - { - gpu_geom.data.aabbs.stride = cpu_geom.data.aabbs.stride; - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.aabbs.data.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.aabbs.data.offset = gpubuf->getOffset() + cpu_geom.data.aabbs.data.offset; - } - } - else if(cpu_geom.type == asset::IAccelerationStructure::EGT_INSTANCES) - { - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.instances.data.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.instances.data.offset = gpubuf->getOffset() + cpu_geom.data.instances.data.offset; - } - } - - gpuGeoms.push_back(gpu_geom); - } - - gpuBuildInfo.geometries = core::SRange(gpuGeoms.data() + startGeom, gpuGeoms.data() + endGeom); - } - - // Get SizeInfo for each CPUAS -> Create the AS -> Get Total Scratch Buffer Size - std::vector buildSizes(toCreateAndBuild.size()); - uint64_t totalScratchBufferSize = 0ull; - uint64_t maxScratchBufferSize = 0ull; - for (ptrdiff_t i = 0u, toBuildIndex = 0u; i < assetCount; ++i) - { - const asset::ICPUAccelerationStructure* cpuas = _begin[i]; - if(cpuas->hasBuildInfo() == false) - { - // Only those with buildInfo (index in toCreateAndBuild vector) will get passed - continue; - } - - assert(cpuas == toCreateAndBuild[toBuildIndex]); - assert(toBuildIndex < toCreateAndBuild.size()); - - auto buildRanges = cpuas->getBuildRanges().begin(); - auto buildRangesCount = cpuas->getBuildRanges().size(); - - auto & gpuBuildInfo = buildGeomInfos[toBuildIndex]; - - std::vector 
maxPrimCount(buildRangesCount); - for(auto b = 0; b < buildRangesCount; b++) - maxPrimCount[b] = buildRanges[b].primitiveCount; - - auto buildSize = _params.device->getAccelerationStructureBuildSizes(gpuBuildInfo, maxPrimCount.data()); - buildSizes[i] = buildSize; - - auto gpuAS = allocateBufferAndCreateAccelerationStructure(buildSize.accelerationStructureSize, cpuas); - res->operator[](i) = gpuAS; - - // complete the buildGeomInfos (now only thing left is to allocate and set scratchAddr.buffer) - buildGeomInfos[toBuildIndex].dstAS = gpuAS.get(); - buildGeomInfos[toBuildIndex].scratchAddr.offset = totalScratchBufferSize; - - totalScratchBufferSize += buildSize.buildScratchSize; - core::max(maxScratchBufferSize, buildSize.buildScratchSize); // maxScratchBufferSize has no use now (unless we changed this function to build 1 by 1 instead of batch builds or have some kind of memory limit?) - ++toBuildIndex; - } - - // Allocate Scratch Buffer - IGPUBuffer::SCreationParams gpuScratchBufParams = {}; - gpuScratchBufParams.size = totalScratchBufferSize; - gpuScratchBufParams.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_STORAGE_BUFFER_BIT; - auto gpuScratchBuf = _params.device->createBuffer(std::move(gpuScratchBufParams)); - auto mreqs = gpuScratchBuf->getMemoryReqs(); - mreqs.memoryTypeBits &= _params.device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto gpuScratchBufMem = _params.device->allocate(mreqs, gpuScratchBuf.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - - - for (ptrdiff_t i = 0u; i < toCreateAndBuild.size(); ++i) - { - auto & gpuBuildInfo = buildGeomInfos[i]; - gpuBuildInfo.scratchAddr.buffer = gpuScratchBuf; - } - - // Record CommandBuffer for Building (We have Completed buildInfos + buildRanges for each CPUAS) - auto & fence = _params.fences[EQU_COMPUTE]; - fence = _params.device->createFence(static_cast(0)); - core::smart_refctd_ptr cmdbuf = _params.perQueue[EQU_COMPUTE].cmdbuf; - - IQueue::SSubmitInfo submit; - { - submit.commandBufferCount = 1u; - submit.commandBuffers = &cmdbuf.get(); - submit.waitSemaphoreCount = 0u; - submit.pWaitDstStageMask = nullptr; - submit.pWaitSemaphores = nullptr; - uint32_t waitSemaphoreCount = 0u; - } - - assert(cmdbuf->getState() == IGPUCommandBuffer::STATE::RECORDING); - cmdbuf->buildAccelerationStructures({buildGeomInfos.data(),buildGeomInfos.data()+buildGeomInfos.size()},buildRangeInfos.data()); - cmdbuf->end(); - - // TODO for future to make this function more sophisticated: Compaction, MemoryLimit for Build - - core::smart_refctd_ptr sem; - - if (_params.perQueue[EQU_COMPUTE].semaphore) - sem = _params.device->createSemaphore(); - - auto* sem_ptr = sem.get(); - auto* fence_ptr = fence.get(); - - submit.signalSemaphoreCount = sem_ptr?1u:0u; - submit.pSignalSemaphores = sem_ptr?&sem_ptr:nullptr; - - _params.perQueue[EQU_COMPUTE].queue->submit(1u, &submit, fence_ptr); - if (_params.perQueue[EQU_COMPUTE].semaphore) - _params.perQueue[EQU_COMPUTE].semaphore[0] = std::move(sem); - } - } - - return res; -} -#endif - -#endif diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 09877b0d8f..00776ba01d 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -436,6 +436,18 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return updateBufferRangeViaStagingBuffer(nextSubmit,bufferRange,callback); } + // + class CMemcpyUpstreamingDataProducer final : public IUpstreamingDataProducer 
+ { + public: + inline uint32_t operator()(void* dst, const size_t offsetInRange, const uint32_t blockSize) override + { + memcpy(dst,reinterpret_cast(data)+offsetInRange,blockSize); + return blockSize; + } + + const void* data; + }; //! Copies `data` to stagingBuffer and Records the commands needed to copy the data from stagingBuffer to `bufferRange.buffer`. //! Returns same as `updateBufferRangeViaStagingBuffer` with a callback instead of a pointer, make sure to submit with `nextSubmit.popSubmit()` after this function returns. //! Parameters: @@ -448,25 +460,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted template requires std::is_same_v,SIntendedSubmitInfo> inline bool updateBufferRangeViaStagingBuffer(IntendedSubmitInfo&& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) { - // We check the guarantees of our documentation with the asserts while we're at it -#ifdef _NBL_DEBUG - size_t prevRangeEnd = 0; -#endif - - auto retval = updateBufferRangeViaStagingBuffer(nextSubmit,bufferRange,wrapUpstreamingDataProducerLambda( - [&](void* dst, const size_t offsetInRange, const uint32_t blockSize) -> uint32_t - { -#ifdef _NBL_DEBUG - assert(offsetInRange==prevRangeEnd); - prevRangeEnd = offsetInRange+blockSize; -#endif - memcpy(dst,reinterpret_cast(data)+offsetInRange,blockSize); - return blockSize; - } - )); -#ifdef _NBL_DEBUG - assert(prevRangeEnd==bufferRange.size); -#endif + CMemcpyUpstreamingDataProducer memcpyCb; + memcpyCb.data = data; + bool retval = updateBufferRangeViaStagingBuffer(nextSubmit,bufferRange,memcpyCb); return retval; } diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 8f0edb1056..2dddc74f77 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -162,7 +162,7 @@ set(NBL_ASSET_SOURCES # Shaders ${NBL_ROOT_PATH}/src/nbl/asset/utils/ISPIRVOptimizer.cpp - ${NBL_ROOT_PATH}/src/nbl/asset/utils/ISPIRVDebloater.cpp + ${NBL_ROOT_PATH}/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp ${NBL_ROOT_PATH}/src/nbl/asset/utils/IShaderCompiler.cpp ${NBL_ROOT_PATH}/src/nbl/asset/utils/CGLSLCompiler.cpp ${NBL_ROOT_PATH}/src/nbl/asset/utils/CHLSLCompiler.cpp @@ -308,7 +308,7 @@ endif() set(COMMON_INCLUDE_DIRS ${THIRD_PARTY_SOURCE_DIR}/glm - ${THIRD_PARTY_SOURCE_DIR}/renderdoc # for renderdoc api header + ${THIRD_PARTY_SOURCE_DIR}/renderdoc # for renderdoc api header ${CMAKE_BINARY_DIR}/3rdparty/zlib #for dynamically generated zconf.h $ #for dynamically generated pnglibconf.h $ #for dynamically generated jconfig.h @@ -324,7 +324,6 @@ set(NBL_LIBRARY_CREATION_SOURCES ${NABLA_SRCS_COMMON} ${NABLA_HEADERS} $ - $ $ $ $ @@ -351,7 +350,7 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() -if(NBL_DYNAMIC_MSVC_RUNTIME) +if(NBL_COMPILER_DYNAMIC_RUNTIME) set_property(TARGET Nabla PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") else() set_property(TARGET Nabla PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") @@ -359,6 +358,13 @@ endif() target_compile_definitions(Nabla PRIVATE __NBL_BUILDING_NABLA__) +target_link_options(Nabla INTERFACE # proxy to downstream targets + $<$: + $<$:/DELAYLOAD:$> + /DELAYLOAD:dxcompiler.dll + > +) + if (ANDROID) add_library(android_native_app_glue STATIC ${ANDROID_NDK_ROOT_PATH}/sources/android/native_app_glue/android_native_app_glue.c @@ -391,6 +397,14 @@ if(_NBL_BUILD_DPL_) target_link_libraries(Nabla INTERFACE tbb tbbmalloc tbbmalloc_proxy) endif() +# bzip2 +if(NBL_STATIC_BUILD) + target_link_libraries(Nabla 
INTERFACE bz2_static) +else() + target_link_libraries(Nabla PRIVATE bz2_static) +endif() +add_dependencies(Nabla bz2_static) + # boost target_include_directories(Nabla PUBLIC "${BOOST_PREPROCESSOR_INCLUDE}") diff --git a/src/nbl/asset/ICPUDescriptorSet.cpp b/src/nbl/asset/ICPUDescriptorSet.cpp index 03724be1a2..7137edcba5 100644 --- a/src/nbl/asset/ICPUDescriptorSet.cpp +++ b/src/nbl/asset/ICPUDescriptorSet.cpp @@ -108,36 +108,4 @@ core::smart_refctd_ptr ICPUDescriptorSet::clone(uint32_t _depth) const return cp; } -IAsset* ICPUDescriptorSet::getDependant_impl(size_t ix) -{ - for (auto i=0u; i(IDescriptor::E_TYPE::ET_COUNT); i++) - if (m_descriptorInfos[i]) - { - const auto size = m_descriptorInfos[i]->size(); - if (ixoperator[](ix).desc.get(); - if (desc) - switch (IDescriptor::GetTypeCategory(static_cast(i))) - { - case IDescriptor::EC_BUFFER: - return static_cast(desc); - case IDescriptor::EC_SAMPLER: - return static_cast(desc); - case IDescriptor::EC_IMAGE: - return static_cast(desc); - case IDescriptor::EC_BUFFER_VIEW: - return static_cast(desc); - case IDescriptor::EC_ACCELERATION_STRUCTURE: - return static_cast(desc); - default: - break; - } - return nullptr; - } - else - ix -= size; - } - return nullptr; -} } \ No newline at end of file diff --git a/src/nbl/asset/utils/CSPIRVIntrospector.cpp b/src/nbl/asset/utils/CSPIRVIntrospector.cpp index 8b43c676b7..4ac78066a7 100644 --- a/src/nbl/asset/utils/CSPIRVIntrospector.cpp +++ b/src/nbl/asset/utils/CSPIRVIntrospector.cpp @@ -3,6 +3,8 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/asset/utils/CSPIRVIntrospector.h" + +#include "nbl/asset/ICPUPipeline.h" #include "nbl/asset/utils/spvUtils.h" #include "nbl_spirv_cross/spirv_parser.hpp" @@ -106,15 +108,15 @@ static CSPIRVIntrospector::CStageIntrospectionData::VAR_TYPE spvcrossType2E_TYPE } } -core::smart_refctd_ptr CSPIRVIntrospector::createApproximateComputePipelineFromIntrospection(const IPipelineBase::SShaderSpecInfo& info, core::smart_refctd_ptr&& layout/* = nullptr*/) +core::smart_refctd_ptr CSPIRVIntrospector::createApproximateComputePipelineFromIntrospection(const ICPUPipelineBase::SShaderSpecInfo& info, core::smart_refctd_ptr&& layout/* = nullptr*/) { - if (info.stage!=IShader::E_SHADER_STAGE::ESS_COMPUTE || info.valid()==IPipelineBase::SShaderSpecInfo::INVALID_SPEC_INFO) + if (info.valid()==ICPUPipelineBase::SShaderSpecInfo::INVALID_SPEC_INFO) return nullptr; CStageIntrospectionData::SParams params; params.entryPoint = info.entryPoint; params.shader = core::smart_refctd_ptr(info.shader); - params.stage = info.stage; + params.stage = hlsl::ShaderStage::ESS_COMPUTE; auto introspection = introspect(params); @@ -174,15 +176,13 @@ core::smart_refctd_ptr CSPIRVIntrospector::createApproximat layout = pplnIntrospectData->createApproximatePipelineLayoutFromIntrospection(introspection); } - ICPUComputePipeline::SCreationParams pplnCreationParams; - pplnCreationParams.layout = layout.get(); - pplnCreationParams.shader = info; - pplnCreationParams.layout = layout.get(); - return ICPUComputePipeline::create(pplnCreationParams); + auto pipeline = ICPUComputePipeline::create(layout.get()); + pipeline->getSpecInfos(hlsl::ShaderStage::ESS_COMPUTE)[0] = info; + return pipeline; } // returns true if successfully added all the info to self, false if incompatible with what's already in our pipeline or incomplete (e.g. 
missing spec constants) -NBL_API2 bool CSPIRVIntrospector::CPipelineIntrospectionData::merge(const CSPIRVIntrospector::CStageIntrospectionData* stageData, const IPipelineBase::SShaderSpecInfo::spec_constant_map_t* specConstants) +NBL_API2 bool CSPIRVIntrospector::CPipelineIntrospectionData::merge(const CSPIRVIntrospector::CStageIntrospectionData* stageData, const ICPUPipelineBase::SShaderSpecInfo::spec_constant_map_t* specConstants) { if (!stageData) return false; @@ -218,7 +218,7 @@ NBL_API2 bool CSPIRVIntrospector::CPipelineIntrospectionData::merge(const CSPIRV if (specConstantFound == specConstants->end()) return false; - descInfo.count = specConstantFound->second; + descInfo.count = (specConstantFound->second.size() != 0); } else { diff --git a/src/nbl/asset/utils/ISPIRVDebloater.cpp b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp similarity index 90% rename from src/nbl/asset/utils/ISPIRVDebloater.cpp rename to src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp index f05e9d70f5..de78d2b162 100644 --- a/src/nbl/asset/utils/ISPIRVDebloater.cpp +++ b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp @@ -1,4 +1,4 @@ -#include "nbl/asset/utils/ISPIRVDebloater.h" +#include "nbl/asset/utils/ISPIRVEntryPointTrimmer.h" #include "nbl/asset/utils/ISPIRVOptimizer.h" #include "nbl_spirv_cross/spirv.hpp" @@ -10,15 +10,14 @@ using namespace nbl::asset; static constexpr spv_target_env SPIRV_VERSION = spv_target_env::SPV_ENV_UNIVERSAL_1_6; -ISPIRVDebloater::ISPIRVDebloater() +ISPIRVEntryPointTrimmer::ISPIRVEntryPointTrimmer() { constexpr auto optimizationPasses = std::array{ - ISPIRVOptimizer::EOP_DEAD_BRANCH_ELIM, - ISPIRVOptimizer::EOP_ELIM_DEAD_FUNCTIONS, ISPIRVOptimizer::EOP_DEAD_BRANCH_ELIM, ISPIRVOptimizer::EOP_ELIM_DEAD_FUNCTIONS, ISPIRVOptimizer::EOP_ELIM_DEAD_VARIABLES, ISPIRVOptimizer::EOP_ELIM_DEAD_CONSTANTS, + ISPIRVOptimizer::EOP_AGGRESSIVE_DCE, ISPIRVOptimizer::EOP_ELIM_DEAD_MEMBERS, ISPIRVOptimizer::EOP_TRIM_CAPABILITIES, }; @@ -78,7 +77,7 @@ static bool validate(const uint32_t* binary, uint32_t binarySize, nbl::system::l return core.Validate(binary, binarySize, validatorOptions); } -ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, const core::set& entryPoints, system::logger_opt_ptr logger) const +ISPIRVEntryPointTrimmer::Result ISPIRVEntryPointTrimmer::trim(const ICPUBuffer* spirvBuffer, const core::set& entryPoints, system::logger_opt_ptr logger) const { const auto* spirv = static_cast(spirvBuffer->getPointer()); const auto spirvDwordCount = spirvBuffer->getSize() / 4; @@ -134,7 +133,7 @@ ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, std::vector minimizedSpirv; core::unordered_set removedEntryPointIds; - bool needDebloat = false; + bool needtrim = false; auto offset = HEADER_SIZE; auto parse_instruction = [](uint32_t instruction) -> std::tuple { @@ -185,16 +184,16 @@ ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, foundEntryPoint += 1; // a valid spirv will have unique entry points, so this should works } else { - if (needDebloat == false) + if (needtrim == false) { minimizedSpirv.reserve(spirvDwordCount); minimizedSpirv.insert(minimizedSpirv.end(), spirv, spirv + curOffset); - needDebloat = true; + needtrim = true; } removedEntryPointIds.insert(curEntryPointId); continue; } - if (!needDebloat) continue; + if (!needtrim) continue; minimizedSpirv.insert(minimizedSpirv.end(), spirv + curOffset, spirv + offset); } @@ -208,7 +207,7 @@ ISPIRVDebloater::Result ISPIRVDebloater::debloat(const 
ICPUBuffer* spirvBuffer, }; } - if (!needDebloat) + if (!needtrim) { return { .spirv = nullptr, @@ -236,22 +235,22 @@ ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, assert(validate(minimizedSpirv.data(), minimizedSpirv.size(), logger)); - auto debloatedSpirv = m_optimizer->optimize(minimizedSpirv.data(), minimizedSpirv.size(), logger); + auto trimmedSpirv = m_optimizer->optimize(minimizedSpirv.data(), minimizedSpirv.size(), logger); #ifdef _NBL_DEBUG logger.log("Before stripping capabilities:", nbl::system::ILogger::ELL_DEBUG); printCapabilities(spirv, spirvDwordCount, logger); logger.log("\n", nbl::system::ILogger::ELL_DEBUG); - const auto* debloatedSpirvBuffer = static_cast(debloatedSpirv->getPointer()); - const auto debloatedSpirvDwordCount = debloatedSpirv->getSize() / 4; + const auto* trimmedSpirvBuffer = static_cast(trimmedSpirv->getPointer()); + const auto trimmedSpirvDwordCount = trimmedSpirv->getSize() / 4; logger.log("After stripping capabilities:", nbl::system::ILogger::ELL_DEBUG); - printCapabilities(debloatedSpirvBuffer, debloatedSpirvDwordCount, logger); + printCapabilities(trimmedSpirvBuffer, trimmedSpirvDwordCount, logger); logger.log("\n", nbl::system::ILogger::ELL_DEBUG); #endif return { - .spirv = std::move(debloatedSpirv), + .spirv = std::move(trimmedSpirv), .isSuccess = true, }; diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 9333a0d3b4..a3d15744a7 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -330,6 +330,10 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/basic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/arithmetic_portability.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/arithmetic_portability_impl.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/fft.hlsl") +#subgroup2 +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/ballot.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/arithmetic_portability.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/arithmetic_portability_impl.hlsl") #shared header between C++ and HLSL LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/surface_transform.h") #workgroup @@ -341,6 +345,13 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/fft.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scratch_size.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shuffle.hlsl") +#workgroup2 +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/arithmetic_config.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/impl/virtual_wg_size_def.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/impl/items_per_invoc_def.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/impl/arithmetic_config_def.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/arithmetic.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/shared_scan.hlsl") #Extensions LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/default.vert.hlsl") @@ -361,7 +372,9 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/anisotropi LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED 
"hlsl/concepts/accessors/loadable_image.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/mip_mapped.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/storable_image.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/generic_shared_data.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/fft.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/workgroup_arithmetic.hlsl") #tgmath LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath/impl.hlsl") diff --git a/src/nbl/builtin/utils.cmake b/src/nbl/builtin/utils.cmake index 05f9618203..0653ff97a2 100644 --- a/src/nbl/builtin/utils.cmake +++ b/src/nbl/builtin/utils.cmake @@ -39,7 +39,7 @@ endmacro() # _NAMESPACE_ is a C++ namespace builtin resources will be wrapped into # _OUTPUT_INCLUDE_SEARCH_DIRECTORY_ is an absolute path to output directory for builtin resources header files which will be a search directory for generated headers outputed to ${_OUTPUT_HEADER_DIRECTORY_}/${_NAMESPACE_PREFIX_} where namespace prefix is the namespace turned into a path # _OUTPUT_SOURCE_DIRECTORY_ is an absolute path to output directory for builtin resources source files -# _STATIC_ optional last argument is a bool, if true then add_library will use STATIC, SHARED otherwise. Pay attention that MSVC runtime is controlled by NBL_DYNAMIC_MSVC_RUNTIME which is not an argument of this function +# _STATIC_ optional last argument is a bool, if true then add_library will use STATIC, SHARED otherwise. Pay attention that MSVC runtime is controlled by NBL_COMPILER_DYNAMIC_RUNTIME which is not an argument of this function # # As an example one could list a resource as following # LIST_BUILTIN_RESOURCE(SOME_RESOURCES_TO_EMBED "glsl/blit/default_compute_normalization.comp") @@ -208,12 +208,8 @@ function(ADD_CUSTOM_BUILTIN_RESOURCES _TARGET_NAME_ _BUNDLE_NAME_ _BUNDLE_SEARCH "${_OUTPUT_HEADER_DIRECTORY_}" ) set_target_properties(${_TARGET_NAME_} PROPERTIES CXX_STANDARD 20) - - if(NBL_DYNAMIC_MSVC_RUNTIME) - set_property(TARGET ${_TARGET_NAME_} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") - else() - set_property(TARGET ${_TARGET_NAME_} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") - endif() + + set_property(TARGET ${_TARGET_NAME_} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>$<$:DLL>") set(NBL_BUILTIN_RESOURCES ${NBL_BUILTIN_RESOURCES}) # turn builtin resources paths list into variable diff --git a/src/nbl/device/DeviceGen.py b/src/nbl/device/DeviceGen.py index 288732de9b..9ad485fc84 100644 --- a/src/nbl/device/DeviceGen.py +++ b/src/nbl/device/DeviceGen.py @@ -562,7 +562,7 @@ def buildTraitsHeader(**params): res.append(emptyline) if 'enable_jit' in params and params['enable_jit']: - res.append("std::string jit_traits = R\"===(") + res.append("std::ostringstream oss;") buildTraitsHeaderHelper( res, @@ -582,7 +582,7 @@ def buildTraitsHeader(**params): ) if 'enable_jit' in params and params['enable_jit']: - res.append(")===\";") + res.append("std::string jit_traits = oss.str();") return res diff --git a/src/nbl/device/gen.py b/src/nbl/device/gen.py index b910d1aa8f..88174cb3c2 100644 --- a/src/nbl/device/gen.py +++ b/src/nbl/device/gen.py @@ -120,7 +120,7 @@ args.jit_traits_output_path, buildTraitsHeader, type="JIT Members", - template="NBL_CONSTEXPR_STATIC_INLINE {} {} = )===\" + std::string(\"({})\") + CJITIncludeLoader::to_string({}.{}) + 
R\"===(;", + template="oss << \"NBL_CONSTEXPR_STATIC_INLINE {} {} = ({})\" + CJITIncludeLoader::to_string({}.{}) << \";\\n\";", limits_json=limits, features_json=features, format_params=["type", "name", "type", "json_type", "cpp_name"], diff --git a/src/nbl/ext/ImGui/ImGui.cpp b/src/nbl/ext/ImGui/ImGui.cpp index b40c7155be..f477e96cdf 100644 --- a/src/nbl/ext/ImGui/ImGui.cpp +++ b/src/nbl/ext/ImGui/ImGui.cpp @@ -342,17 +342,13 @@ core::smart_refctd_ptr UI::createPipeline(SCreation core::smart_refctd_ptr pipeline; { - const IPipelineBase::SShaderSpecInfo specs[] = - { - {.shader = shaders.vertex.get(), .entryPoint = "VSMain", .stage = hlsl::ShaderStage::ESS_VERTEX}, - {.shader = shaders.fragment.get(), .entryPoint = "PSMain", .stage = hlsl::ShaderStage::ESS_FRAGMENT} - }; IGPUGraphicsPipeline::SCreationParams params[1]; { auto& param = params[0u]; + param.vertexShader = { .shader = shaders.vertex.get(), .entryPoint = "VSMain" }; + param.fragmentShader = { .shader = shaders.fragment.get(), .entryPoint = "PSMain" }; param.layout = pipelineLayout.get(); - param.shaders = specs; param.renderpass = creationParams.renderpass.get(); param.cached = { .vertexInput = vertexInputParams, .primitiveAssembly = primitiveAssemblyParams, .rasterization = rasterizationParams, .blend = blendParams, .subpassIx = creationParams.subpassIx }; }; diff --git a/src/nbl/video/CJITIncludeLoader.cpp b/src/nbl/video/CJITIncludeLoader.cpp index edab1c046a..1fcbcb0505 100644 --- a/src/nbl/video/CJITIncludeLoader.cpp +++ b/src/nbl/video/CJITIncludeLoader.cpp @@ -20,7 +20,6 @@ auto CJITIncludeLoader::getInclude(const system::path& searchPath, const std::st std::string CJITIncludeLoader::collectDeviceCaps(const SPhysicalDeviceLimits& limits, const SPhysicalDeviceFeatures& features) { #include "nbl/video/device_capabilities_traits_jit.h" - std::string start = R"===( #ifndef _NBL_BUILTIN_HLSL_JIT_DEVICE_CAPABILITIES_INCLUDED_ #define _NBL_BUILTIN_HLSL_JIT_DEVICE_CAPABILITIES_INCLUDED_ @@ -49,4 +48,4 @@ std::string CJITIncludeLoader::collectDeviceCaps(const SPhysicalDeviceLimits& li return start + jit_traits + end; } -} \ No newline at end of file +} diff --git a/src/nbl/video/CVulkanAccelerationStructure.h b/src/nbl/video/CVulkanAccelerationStructure.h index 6b94f9cad7..4c0d67eee1 100644 --- a/src/nbl/video/CVulkanAccelerationStructure.h +++ b/src/nbl/video/CVulkanAccelerationStructure.h @@ -54,21 +54,6 @@ class CVulkanTopLevelAccelerationStructure final : public CVulkanAccelerationStr using Base::Base; }; - -//! 
all these utilities cannot be nested because of the complex inheritance between `IGPUAccelerationStructure` and the Vulkan classes -inline VkCopyAccelerationStructureModeKHR getVkCopyAccelerationStructureModeFrom(const IGPUAccelerationStructure::COPY_MODE in) -{ - return static_cast(in); -} -inline VkCopyAccelerationStructureInfoKHR getVkCopyAccelerationStructureInfoFrom(const IGPUAccelerationStructure::CopyInfo& copyInfo) -{ - VkCopyAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; - info.src = *reinterpret_cast(copyInfo.src->getNativeHandle()); - info.dst = *reinterpret_cast(copyInfo.dst->getNativeHandle()); - info.mode = getVkCopyAccelerationStructureModeFrom(copyInfo.mode); - return info; -} - template concept Buffer = is_any_of_v,IGPUBuffer,asset::ICPUBuffer>; @@ -91,24 +76,6 @@ inline DeviceOrHostAddress getVkDeviceOrHostAddress(const asset::SBu } return addr; } -template -inline VkCopyAccelerationStructureToMemoryInfoKHR getVkCopyAccelerationStructureToMemoryInfoFrom(const IGPUAccelerationStructure::CopyToMemoryInfo& copyInfo) -{ - VkCopyAccelerationStructureToMemoryInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_TO_MEMORY_INFO_KHR,nullptr }; - info.src = *reinterpret_cast(copyInfo.src->getNativeHandle()); - info.dst = getVkDeviceOrHostAddress(copyInfo.dst); - info.mode = getVkCopyAccelerationStructureModeFrom(copyInfo.mode); - return info; -} -template -inline VkCopyMemoryToAccelerationStructureInfoKHR getVkCopyMemoryToAccelerationStructureInfoFrom(const IGPUAccelerationStructure::CopyFromMemoryInfo& copyInfo) -{ - VkCopyMemoryToAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_MEMORY_TO_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; - info.src = getVkDeviceOrHostAddress(copyInfo.src); - info.dst = *reinterpret_cast(copyInfo.dst->getNativeHandle()); - info.mode = getVkCopyAccelerationStructureModeFrom(copyInfo.mode); - return info; -} inline VkGeometryFlagsKHR getVkGeometryFlagsFrom(const IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS in) { @@ -118,7 +85,7 @@ inline VkGeometryFlagsKHR getVkGeometryFlagsFrom(const IGPUBottomLevelAccelerati // The srcAccelerationStructure, dstAccelerationStructure, and mode members of pBuildInfo are ignored. Any VkDeviceOrHostAddressKHR members of pBuildInfo are ignored by this command static const VkDeviceOrHostAddressConstKHR NullAddress = { 0x0ull }; template -void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase) +void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase) { static const VkDeviceOrHostAddressConstKHR DummyNonNullAddress = { 0xdeadbeefBADC0FFEull }; @@ -129,7 +96,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles(triangles.indexType); - outBase.geometry.triangles.indexData = QueryOnly ? NullAddress:getVkDeviceOrHostAddress(triangles.indexData); + outBase.geometry.triangles.indexData = triangles.indexType==asset::E_INDEX_TYPE::EIT_UNKNOWN || QueryOnly ? NullAddress:getVkDeviceOrHostAddress(triangles.indexData); // except that the hostAddress member of VkAccelerationStructureGeometryTrianglesDataKHR::transformData will be examined to check if it is NULL. 
if (!triangles.hasTransform()) outBase.geometry.triangles.transformData = NullAddress; @@ -137,7 +104,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles(triangles.transform); @@ -145,9 +112,9 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles -void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase, VkAccelerationStructureGeometryMotionTrianglesDataNV* &p_vertexMotion) +void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase, VkAccelerationStructureGeometryMotionTrianglesDataNV* &p_vertexMotion) { - getVkASGeometryFrom(triangles,outBase); + getVkASGeometryFrom(triangles,outBase); if (triangles.vertexData[1].buffer) { p_vertexMotion->sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_MOTION_TRIANGLES_DATA_NV; @@ -158,7 +125,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles -void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::AABBs& aabbs, VkAccelerationStructureGeometryKHR& outBase) +void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::AABBs& aabbs, VkAccelerationStructureGeometryKHR& outBase) { outBase = {VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR,nullptr,VK_GEOMETRY_TYPE_AABBS_KHR}; outBase.geometry.aabbs = {VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_AABBS_DATA_KHR,nullptr}; @@ -221,7 +188,7 @@ inline VkAccelerationStructureBuildGeometryInfoKHR getVkASBuildGeometryInfo(cons for (auto j=0u; j(src->getNativeHandle()); + info.dst = *reinterpret_cast(dst->getNativeHandle()); + info.mode = compact ? VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR:VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR; getFunctionTable().vkCmdCopyAccelerationStructureKHR(m_cmdbuf,&info); return true; } -bool CVulkanCommandBuffer::copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) +bool CVulkanCommandBuffer::copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) { - const auto info = getVkCopyAccelerationStructureToMemoryInfoFrom(copyInfo); + VkCopyAccelerationStructureToMemoryInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_TO_MEMORY_INFO_KHR,nullptr }; + info.src = *reinterpret_cast(src->getNativeHandle()); + info.dst = getVkDeviceOrHostAddress(dst); + info.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_SERIALIZE_KHR; getFunctionTable().vkCmdCopyAccelerationStructureToMemoryKHR(m_cmdbuf,&info); return true; } -bool CVulkanCommandBuffer::copyAccelerationStructureFromMemory_impl(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) +bool CVulkanCommandBuffer::copyAccelerationStructureFromMemory_impl(const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) { - const auto info = getVkCopyMemoryToAccelerationStructureInfoFrom(copyInfo); + VkCopyMemoryToAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_MEMORY_TO_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; + info.src = getVkDeviceOrHostAddress(src); + info.dst = *reinterpret_cast(dst->getNativeHandle()); + info.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_DESERIALIZE_KHR; getFunctionTable().vkCmdCopyMemoryToAccelerationStructureKHR(m_cmdbuf,&info); return true; } @@ -661,7 +670,7 @@ bool CVulkanCommandBuffer::beginRenderPass_impl(const SRenderpassBeginInfo& info .renderArea = info.renderArea, // Implicitly but could 
be optimizedif needed // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkRenderPassBeginInfo.html#VUID-VkRenderPassBeginInfo-clearValueCount-00902 - .clearValueCount = vk_clearValues.size()/sizeof(VkClearValue), + .clearValueCount = static_cast(vk_clearValues.size()/sizeof(VkClearValue)), // Implicit // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkRenderPassBeginInfo.html#VUID-VkRenderPassBeginInfo-clearValueCount-04962 .pClearValues = vk_clearValues.data() diff --git a/src/nbl/video/CVulkanCommandBuffer.h b/src/nbl/video/CVulkanCommandBuffer.h index 99b1c15644..9383585b23 100644 --- a/src/nbl/video/CVulkanCommandBuffer.h +++ b/src/nbl/video/CVulkanCommandBuffer.h @@ -177,9 +177,9 @@ class CVulkanCommandBuffer final : public IGPUCommandBuffer return true; } - bool copyAccelerationStructure_impl(const IGPUAccelerationStructure::CopyInfo& copyInfo) override; - bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) override; - bool copyAccelerationStructureFromMemory_impl(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) override; + bool copyAccelerationStructure_impl(const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact); + bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst); + bool copyAccelerationStructureFromMemory_impl(const asset::SBufferBinding& src, IGPUAccelerationStructure* dst); bool bindComputePipeline_impl(const IGPUComputePipeline* const pipeline) override; bool bindGraphicsPipeline_impl(const IGPUGraphicsPipeline* const pipeline) override; diff --git a/src/nbl/video/CVulkanComputePipeline.h b/src/nbl/video/CVulkanComputePipeline.h index 76fb346e30..89077f9a9a 100644 --- a/src/nbl/video/CVulkanComputePipeline.h +++ b/src/nbl/video/CVulkanComputePipeline.h @@ -15,10 +15,9 @@ class CVulkanComputePipeline final : public IGPUComputePipeline { public: CVulkanComputePipeline( - core::smart_refctd_ptr&& _layout, - const core::bitflag _flags, + const SCreationParams& params, const VkPipeline pipeline - ) : IGPUComputePipeline(std::move(_layout),_flags), m_pipeline(pipeline) {} + ) : IGPUComputePipeline(params), m_pipeline(pipeline) {} inline const void* getNativeHandle() const override { return &m_pipeline; } diff --git a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp index 2bec9e9d06..90b2993cb3 100644 --- a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp +++ b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp @@ -40,7 +40,7 @@ CVulkanDeviceMemoryBacked::CVulkanDeviceMemoryBacked( assert(vkHandle!=VK_NULL_HANDLE); } -template CVulkanDeviceMemoryBacked; -template CVulkanDeviceMemoryBacked; +template class CVulkanDeviceMemoryBacked; +template class CVulkanDeviceMemoryBacked; } \ No newline at end of file diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 792ab719eb..9494efc2f2 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1,6 +1,6 @@ #include "nbl/video/CVulkanLogicalDevice.h" -#include "nbl/asset/utils/ISPIRVDebloater.h" +#include "nbl/asset/utils/ISPIRVEntryPointTrimmer.h" #include "nbl/video/CThreadSafeQueueAdapter.h" #include "nbl/video/surface/CSurfaceVulkan.h" @@ -498,21 +498,30 @@ bool CVulkanLogicalDevice::writeAccelerationStructuresProperties_impl(const std: return 
m_devf.vk.vkWriteAccelerationStructuresPropertiesKHR(m_vkdev,vk_accelerationStructures.size(),vk_accelerationStructures.data(),static_cast(type),stride*accelerationStructures.size(),data,stride); } -auto CVulkanLogicalDevice::copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::CopyInfo& copyInfo) -> DEFERRABLE_RESULT +auto CVulkanLogicalDevice::copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) -> DEFERRABLE_RESULT { - const auto info = getVkCopyAccelerationStructureInfoFrom(copyInfo); + VkCopyAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; + info.src = *reinterpret_cast(src->getNativeHandle()); + info.dst = *reinterpret_cast(dst->getNativeHandle()); + info.mode = compact ? VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR:VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR; return getDeferrableResultFrom(m_devf.vk.vkCopyAccelerationStructureKHR(m_vkdev,static_cast(deferredOperation)->getInternalObject(),&info)); } -auto CVulkanLogicalDevice::copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyToMemoryInfo& copyInfo) -> DEFERRABLE_RESULT +auto CVulkanLogicalDevice::copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) -> DEFERRABLE_RESULT { - const auto info = getVkCopyAccelerationStructureToMemoryInfoFrom(copyInfo); + VkCopyAccelerationStructureToMemoryInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_TO_MEMORY_INFO_KHR,nullptr }; + info.src = *reinterpret_cast(src->getNativeHandle()); + info.dst = getVkDeviceOrHostAddress(dst); + info.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_SERIALIZE_KHR; return getDeferrableResultFrom(m_devf.vk.vkCopyAccelerationStructureToMemoryKHR(m_vkdev,static_cast(deferredOperation)->getInternalObject(),&info)); } -auto CVulkanLogicalDevice::copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyFromMemoryInfo& copyInfo) -> DEFERRABLE_RESULT +auto CVulkanLogicalDevice::copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) -> DEFERRABLE_RESULT { - const auto info = getVkCopyMemoryToAccelerationStructureInfoFrom(copyInfo); + VkCopyMemoryToAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_MEMORY_TO_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; + info.src = getVkDeviceOrHostAddress(src); + info.dst = *reinterpret_cast(dst->getNativeHandle()); + info.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_DESERIALIZE_KHR; return getDeferrableResultFrom(m_devf.vk.vkCopyMemoryToAccelerationStructureKHR(m_vkdev,static_cast(deferredOperation)->getInternalObject(),&info)); } @@ -571,13 +580,13 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createDesc core::smart_refctd_ptr CVulkanLogicalDevice::createPipelineLayout_impl( const std::span pcRanges, - core::smart_refctd_ptr&& layout0, - core::smart_refctd_ptr&& layout1, - core::smart_refctd_ptr&& layout2, - core::smart_refctd_ptr&& layout3 + core::smart_refctd_ptr&& layout0, + core::smart_refctd_ptr&& layout1, + core::smart_refctd_ptr&& layout2, + core::smart_refctd_ptr&& layout3 ) { - const core::smart_refctd_ptr tmp[] = { layout0, layout1, layout2, layout3 }; 
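	// Descriptive note (an assumption about intent): the `tmp` array below gathers the four optional set layouts so the
	// code that follows can pull their Vulkan handles into `vk_dsLayouts` and compute `nonNullSetLayoutCount`;
	// a hedged sketch of that gather, with CVulkanDescriptorSetLayout::getInternalObject() assumed to exist
	// like on the other CVulkan* wrappers used in this file:
	//   for (uint32_t i=0u; i<asset::ICPUPipelineLayout::DESCRIPTOR_SET_COUNT; i++)
	//       vk_dsLayouts[i] = tmp[i] ? static_cast<const CVulkanDescriptorSetLayout*>(tmp[i].get())->getInternalObject():VK_NULL_HANDLE;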
+ const core::smart_refctd_ptr tmp[] = { layout0, layout1, layout2, layout3 }; VkDescriptorSetLayout vk_dsLayouts[asset::ICPUPipelineLayout::DESCRIPTOR_SET_COUNT]; uint32_t nonNullSetLayoutCount = ~0u; @@ -1035,7 +1044,9 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createFramebuffer_ // TODO: Change this to pass SPIR-V directly! VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( - const asset::IPipelineBase::SShaderSpecInfo& specInfo, + const video::IGPUPipelineBase::SShaderSpecInfo& specInfo, + hlsl::ShaderStage stage, + bool requireFullSubgroups, VkShaderModuleCreateInfo* &outShaderModule, std::string* &outEntryPoints, VkPipelineShaderStageRequiredSubgroupSizeCreateInfo* &outRequiredSubgroupSize, @@ -1054,8 +1065,6 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( // TODO: VkShaderModuleValidationCacheCreateInfoEXT from VK_EXT_validation_cache // TODO: VkPipelineRobustnessCreateInfoEXT from VK_EXT_pipeline_robustness (allows per-pipeline control of robustness) - const auto stage = specInfo.stage; - (*outEntryPoints) = specInfo.entryPoint; const auto entryPointName = outEntryPoints->c_str(); outEntryPoints++; @@ -1076,8 +1085,8 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( { outSpecMapEntry->constantID = entry.first; outSpecMapEntry->offset = std::distance(specDataBegin,outSpecData); - outSpecMapEntry->size = entry.second.size; - memcpy(outSpecData,entry.second.data,outSpecMapEntry->size); + outSpecMapEntry->size = entry.second.size(); + memcpy(outSpecData, entry.second.data(), outSpecMapEntry->size); outSpecData += outSpecMapEntry->size; outSpecMapEntry++; } @@ -1098,7 +1107,7 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( outShaderModule++; // Implicit: https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 - using subgroup_size_t = std::remove_reference_t::SUBGROUP_SIZE; + using subgroup_size_t = asset::IPipelineBase::SUBGROUP_SIZE; if (specInfo.requiredSubgroupSize>=subgroup_size_t::REQUIRE_4) { *ppNext = outRequiredSubgroupSize; @@ -1110,7 +1119,7 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( else retval.flags = 0; - if (specInfo.requireFullSubgroups) + if (requireFullSubgroups) { assert(stage==hlsl::ShaderStage::ESS_COMPUTE/*TODO: Or Mesh Or Task*/); retval.flags |= VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT; @@ -1141,7 +1150,7 @@ void CVulkanLogicalDevice::createComputePipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPUComputePipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) { const VkPipelineCache vk_pipelineCache = pipelineCache ? 
static_cast(pipelineCache)->getInternalObject():VK_NULL_HANDLE; @@ -1168,7 +1177,7 @@ void CVulkanLogicalDevice::createComputePipelines_impl( { initPipelineCreateInfo(outCreateInfo,info); const auto& spec = info.shader; - outCreateInfo->stage = getVkShaderStageCreateInfoFrom(spec, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); + outCreateInfo->stage = getVkShaderStageCreateInfoFrom(spec, hlsl::ShaderStage::ESS_COMPUTE, info.cached.requireFullSubgroups, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); outCreateInfo++; } auto vk_pipelines = reinterpret_cast(output); @@ -1182,12 +1191,11 @@ void CVulkanLogicalDevice::createComputePipelines_impl( // break the lifetime cause of the aliasing std::uninitialized_default_construct_n(output+i,1); output[i] = core::make_smart_refctd_ptr( - core::smart_refctd_ptr(info.layout), - info.flags,vk_pipeline + info,vk_pipeline ); debugNameBuilder.str(""); const auto& specInfo = createInfos[i].shader; - debugNameBuilder << specInfo.shader->getFilepathHint() << "(" << specInfo.entryPoint << "," << specInfo.stage << ")\n"; + debugNameBuilder << specInfo.shader->getFilepathHint() << "(" << specInfo.entryPoint << "," << hlsl::ShaderStage::ESS_COMPUTE << ")\n"; } } else @@ -1198,7 +1206,7 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPUGraphicsPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) { auto getVkStencilOpStateFrom = [](const asset::SStencilOpParams& params)->VkStencilOpState @@ -1300,14 +1308,20 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( { initPipelineCreateInfo(outCreateInfo,info); outCreateInfo->pStages = outShaderStage; - for (const auto& spec : info.shaders) + auto processSpecShader = [&](IGPUPipelineBase::SShaderSpecInfo spec, hlsl::ShaderStage shaderStage) { if (spec.shader) { - *(outShaderStage++) = getVkShaderStageCreateInfoFrom(spec, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); - outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages, outShaderStage); + *(outShaderStage++) = getVkShaderStageCreateInfoFrom(spec, shaderStage, false, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); + outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages, outShaderStage); } - } + }; + processSpecShader(info.vertexShader, hlsl::ShaderStage::ESS_VERTEX); + processSpecShader(info.tesselationControlShader, hlsl::ShaderStage::ESS_TESSELLATION_CONTROL); + processSpecShader(info.tesselationEvaluationShader, hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION); + processSpecShader(info.geometryShader, hlsl::ShaderStage::ESS_GEOMETRY); + processSpecShader(info.fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT); + // when dealing with mesh shaders, the vertex input and assembly state will be null { { @@ -1342,17 +1356,13 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( } outCreateInfo->pInputAssemblyState = outInputAssembly++; } - for (const auto& spec : info.shaders) - if (spec.shader) + + if (info.tesselationControlShader.shader || info.tesselationEvaluationShader.shader) { - const auto stage = spec.stage; - if (stage==hlsl::ShaderStage::ESS_TESSELLATION_CONTROL || 
stage==hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION) - { - outTessellation->patchControlPoints = info.cached.primitiveAssembly.tessPatchVertCount; - outCreateInfo->pTessellationState = outTessellation++; - break; - } + outTessellation->patchControlPoints = info.cached.primitiveAssembly.tessPatchVertCount; + outCreateInfo->pTessellationState = outTessellation++; } + const auto& raster = info.cached.rasterization; { outViewport->viewportCount = raster.viewportCount; @@ -1432,16 +1442,22 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( { for (size_t i=0ull; i(createInfos[i],vk_pipeline); debugNameBuilder.str(""); - for (const auto& shader: createInfos[i].shaders) + auto buildDebugName = [&](const IGPUPipelineBase::SShaderSpecInfo& spec, hlsl::ShaderStage stage) { - if (shader.shader != nullptr) - debugNameBuilder <getFilepathHint() << "(" << shader.entryPoint << "," << shader.stage << ")\n"; - } + if (spec.shader != nullptr) + debugNameBuilder <getFilepathHint() << "(" << spec.entryPoint << "," << stage << ")\n"; + }; + buildDebugName(createInfo.vertexShader, hlsl::ESS_VERTEX); + buildDebugName(createInfo.tesselationControlShader, hlsl::ESS_TESSELLATION_CONTROL); + buildDebugName(createInfo.tesselationEvaluationShader, hlsl::ESS_TESSELLATION_EVALUATION); + buildDebugName(createInfo.geometryShader, hlsl::ESS_GEOMETRY); + buildDebugName(createInfo.fragmentShader, hlsl::ESS_FRAGMENT); output[i]->setObjectDebugName(debugNameBuilder.str().c_str()); } } @@ -1453,12 +1469,11 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPURayTracingPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) { - using SShaderGroupParams = asset::IRayTracingPipelineBase::SShaderGroupsParams; - using SGeneralShaderGroup = asset::IRayTracingPipelineBase::SGeneralShaderGroup; - using SHitShaderGroup = asset::IRayTracingPipelineBase::SHitShaderGroup; + using SShaderGroupParams = IGPURayTracingPipeline::SCreationParams::SShaderGroupsParams; + using SHitShaderGroup = IGPURayTracingPipeline::SHitGroup; const auto dynamicStates = std::array{ VK_DYNAMIC_STATE_RAY_TRACING_PIPELINE_STACK_SIZE_KHR }; const VkPipelineDynamicStateCreateInfo vk_dynamicStateCreateInfo = { @@ -1471,9 +1486,44 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( const VkPipelineCache vk_pipelineCache = pipelineCache ? 
static_cast(pipelineCache)->getInternalObject():VK_NULL_HANDLE; + struct ShaderModuleKey + { + const asset::IShader* shader; + std::string_view entryPoint; + bool operator==(const ShaderModuleKey& other) const = default; + + struct HashFunction + { + size_t operator()(const ShaderModuleKey& key) const + { + size_t rowHash = std::hash()(key.shader); + size_t colHash = std::hash()(key.entryPoint) << 1; + return rowHash ^ colHash; + } + }; + }; size_t maxShaderStages = 0; for (const auto& info : createInfos) - maxShaderStages += info.shaders.size(); + { + core::unordered_set shaderModules; + shaderModules.insert({ info.shaderGroups.raygen.shader, info.shaderGroups.raygen.entryPoint }); + for (const auto& miss : info.shaderGroups.misses) + { + shaderModules.insert({ miss.shader, miss.entryPoint }); + } + for (const auto& hit : info.shaderGroups.hits) + { + shaderModules.insert({ hit.closestHit.shader, hit.closestHit.entryPoint }); + shaderModules.insert({ hit.anyHit.shader, hit.anyHit.entryPoint }); + shaderModules.insert({ hit.intersection.shader, hit.intersection.entryPoint }); + } + for (const auto& callable : info.shaderGroups.callables) + { + shaderModules.insert({ callable.shader, callable.entryPoint }); + } + + maxShaderStages += shaderModules.size(); + } size_t maxShaderGroups = 0; for (const auto& info : createInfos) maxShaderGroups += info.shaderGroups.getShaderGroupCount(); @@ -1498,52 +1548,85 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( auto outSpecInfo = vk_specializationInfos.data(); auto outSpecMapEntry = vk_specializationMapEntry.data(); auto outSpecData = specializationData.data(); - auto getVkShaderIndex = [](uint32_t index) { return index == SShaderGroupParams::SIndex::Unused ? VK_SHADER_UNUSED_KHR : index; }; - auto getGeneralVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](SGeneralShaderGroup group) -> VkRayTracingShaderGroupCreateInfoKHR + + for (const auto& info : createInfos) { - return { - .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, - .pNext = nullptr, - .type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR, - .generalShader = getVkShaderIndex(group.index), - .closestHitShader = VK_SHADER_UNUSED_KHR, - .anyHitShader = VK_SHADER_UNUSED_KHR, - .intersectionShader = VK_SHADER_UNUSED_KHR, + + core::unordered_map shaderIndexes; + auto getVkShaderIndex = [&](const IGPUPipelineBase::SShaderSpecInfo& spec) + { + const auto key = ShaderModuleKey{ spec.shader, spec.entryPoint }; + const auto index = key.shader == nullptr ? VK_SHADER_UNUSED_KHR : shaderIndexes[key]; + return index; }; - }; - auto getHitVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](SHitShaderGroup group) -> VkRayTracingShaderGroupCreateInfoKHR - { - return { - .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, - .pNext = nullptr, - .type = group.intersection == SShaderGroupParams::SIndex::Unused ? 
- VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR : VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR, - .generalShader = VK_SHADER_UNUSED_KHR, - .closestHitShader = getVkShaderIndex(group.closestHit), - .anyHitShader = getVkShaderIndex(group.anyHit), - .intersectionShader = getVkShaderIndex(group.intersection), + + auto getGeneralVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](IGPUPipelineBase::SShaderSpecInfo spec) -> VkRayTracingShaderGroupCreateInfoKHR + { + return { + .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, + .pNext = nullptr, + .type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR, + .generalShader = getVkShaderIndex({spec.shader, spec.entryPoint}), + .closestHitShader = VK_SHADER_UNUSED_KHR, + .anyHitShader = VK_SHADER_UNUSED_KHR, + .intersectionShader = VK_SHADER_UNUSED_KHR, + }; }; - }; - for (const auto& info : createInfos) - { + auto getHitVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](SHitShaderGroup group) -> VkRayTracingShaderGroupCreateInfoKHR + { + return { + .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, + .pNext = nullptr, + .type = group.intersection.shader == nullptr ? + VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR : VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR, + .generalShader = VK_SHADER_UNUSED_KHR, + .closestHitShader = getVkShaderIndex(group.closestHit), + .anyHitShader = getVkShaderIndex(group.anyHit), + .intersectionShader = getVkShaderIndex(group.intersection), + }; + }; + initPipelineCreateInfo(outCreateInfo,info); outCreateInfo->pStages = outShaderStage; - for (const auto& specInfo : info.shaders) + auto processSpecInfo = [&](const IGPUPipelineBase::SShaderSpecInfo& spec, hlsl::ShaderStage shaderStage) { - *(outShaderStage++) = getVkShaderStageCreateInfoFrom(specInfo, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo,outSpecMapEntry,outSpecData); - } - outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages,outShaderStage); - assert(outCreateInfo->stageCount != 0); + if (!spec.shader) return; + const auto key = ShaderModuleKey{ spec.shader, spec.entryPoint }; + if (shaderIndexes.find(key) == shaderIndexes.end()) + { + shaderIndexes.insert({ key , std::distancepStages)>(outCreateInfo->pStages, outShaderStage)}); + *(outShaderStage) = getVkShaderStageCreateInfoFrom(spec, shaderStage, false, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo,outSpecMapEntry,outSpecData); + outShaderStage++; + } + }; const auto& shaderGroups = info.shaderGroups; outCreateInfo->pGroups = outShaderGroup; + processSpecInfo(info.shaderGroups.raygen, hlsl::ESS_RAYGEN); *(outShaderGroup++) = getGeneralVkRayTracingShaderGroupCreateInfo(shaderGroups.raygen); + for (const auto& shaderGroup : shaderGroups.misses) + { + processSpecInfo(shaderGroup, hlsl::ESS_MISS); *(outShaderGroup++) = getGeneralVkRayTracingShaderGroupCreateInfo(shaderGroup); + } + for (const auto& shaderGroup : shaderGroups.hits) + { + processSpecInfo(shaderGroup.closestHit, hlsl::ESS_CLOSEST_HIT); + processSpecInfo(shaderGroup.anyHit, hlsl::ESS_ANY_HIT); + processSpecInfo(shaderGroup.intersection, hlsl::ESS_INTERSECTION); *(outShaderGroup++) = getHitVkRayTracingShaderGroupCreateInfo(shaderGroup); + } + for (const auto& shaderGroup : shaderGroups.callables) + { + processSpecInfo(shaderGroup, hlsl::ESS_CALLABLE); *(outShaderGroup++) = getGeneralVkRayTracingShaderGroupCreateInfo(shaderGroup); + } + + outCreateInfo->stageCount = 
std::distancepStages)>(outCreateInfo->pStages,outShaderStage); + assert(outCreateInfo->stageCount != 0); outCreateInfo->groupCount = 1 + shaderGroups.hits.size() + shaderGroups.misses.size() + shaderGroups.callables.size(); outCreateInfo->maxPipelineRayRecursionDepth = info.cached.maxRecursionDepth; if (info.cached.dynamicStackSize) diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index 93d45dcc32..4cc633ec55 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -133,57 +133,53 @@ class CVulkanLogicalDevice final : public ILogicalDevice // acceleration structure modifiers inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const override { - return getAccelerationStructureBuildSizes_impl_impl_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl_impl_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const override { - return getAccelerationStructureBuildSizes_impl_impl_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl_impl_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const override { - return getAccelerationStructureBuildSizes_impl_impl_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl_impl_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const override { - return getAccelerationStructureBuildSizes_impl_impl_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl_impl_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } template inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl_impl_impl( - const core::bitflag flags, const bool motionBlur, + const bool hostBuild, const core::bitflag flags, const bool motionBlur, const std::span geometries, const uint32_t* const pMaxPrimitiveCounts ) const { - constexpr bool IsAABB = std::is_same_v>; + constexpr bool IsTriangle = Geometry::Type==asset::IBottomLevelAccelerationStructure::GeometryType::Triangles; core::vector 
vk_geometries(geometries.size()); - core::vector vk_triangleMotions(IsAABB ? 0u:geometries.size()); + core::vector vk_triangleMotions(IsTriangle ? geometries.size():0u); auto outTriangleMotions = vk_triangleMotions.data(); for (auto i=0u; i(geometries[i],vk_geometries[i]); - else + if constexpr (IsTriangle) getVkASGeometryFrom(geometries[i],vk_geometries[i],outTriangleMotions); + else + getVkASGeometryFrom(geometries[i],vk_geometries[i]); } - return getAccelerationStructureBuildSizes_impl_impl( - std::is_same_v,false, - getVkASBuildFlagsFrom(flags,motionBlur), - vk_geometries,pMaxPrimitiveCounts - ); + return getAccelerationStructureBuildSizes_impl_impl(hostBuild,false,getVkASBuildFlagsFrom(flags,motionBlur),vk_geometries,pMaxPrimitiveCounts); } AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( @@ -263,16 +259,16 @@ class CVulkanLogicalDevice final : public ILogicalDevice return getDeferrableResultFrom(m_devf.vk.vkBuildAccelerationStructuresKHR(m_vkdev,static_cast(deferredOperation)->getInternalObject(),infoCount,vk_buildGeomsInfos.data(),vk_ppBuildRangeInfos)); } bool writeAccelerationStructuresProperties_impl(const std::span accelerationStructures, const IQueryPool::TYPE type, size_t* data, const size_t stride) override; - DEFERRABLE_RESULT copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::CopyInfo& copyInfo) override; - DEFERRABLE_RESULT copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyToMemoryInfo& copyInfo) override; - DEFERRABLE_RESULT copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyFromMemoryInfo& copyInfo) override; + DEFERRABLE_RESULT copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) override; + DEFERRABLE_RESULT copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) override; + DEFERRABLE_RESULT copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) override; // layouts core::smart_refctd_ptr createDescriptorSetLayout_impl(const std::span bindings, const uint32_t maxSamplersCount) override; core::smart_refctd_ptr createPipelineLayout_impl( const std::span pcRanges, - core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, - core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3 + core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, + core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3 ) override; // descriptor sets @@ -289,20 +285,20 @@ class CVulkanLogicalDevice final : public ILogicalDevice IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPUComputePipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) override; void createGraphicsPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output, - const IGPUGraphicsPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) override; void createRayTracingPipelines_impl( 
IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output, - const IGPURayTracingPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) override; // queries diff --git a/src/nbl/video/CVulkanPipelineLayout.h b/src/nbl/video/CVulkanPipelineLayout.h index d89d2a493c..ef46226fdb 100644 --- a/src/nbl/video/CVulkanPipelineLayout.h +++ b/src/nbl/video/CVulkanPipelineLayout.h @@ -15,8 +15,8 @@ class CVulkanPipelineLayout : public IGPUPipelineLayout public: CVulkanPipelineLayout( const ILogicalDevice* dev, const std::span _pcRanges, - core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, - core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3, + core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, + core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3, const VkPipelineLayout vk_layout ) : IGPUPipelineLayout( core::smart_refctd_ptr(dev), diff --git a/src/nbl/video/CVulkanRayTracingPipeline.cpp b/src/nbl/video/CVulkanRayTracingPipeline.cpp index a107d3bbed..960d78428a 100644 --- a/src/nbl/video/CVulkanRayTracingPipeline.cpp +++ b/src/nbl/video/CVulkanRayTracingPipeline.cpp @@ -15,17 +15,17 @@ namespace nbl::video ShaderGroupHandleContainer&& shaderGroupHandles) : IGPURayTracingPipeline(params), m_vkPipeline(vk_pipeline), + m_shaderGroupHandles(std::move(shaderGroupHandles)), m_missStackSizes(core::make_refctd_dynamic_array(params.shaderGroups.misses.size())), m_hitGroupStackSizes(core::make_refctd_dynamic_array(params.shaderGroups.hits.size())), - m_callableStackSizes(core::make_refctd_dynamic_array(params.shaderGroups.hits.size())), - m_shaderGroupHandles(std::move(shaderGroupHandles)) + m_callableStackSizes(core::make_refctd_dynamic_array(params.shaderGroups.hits.size())) { const auto* vulkanDevice = static_cast(getOriginDevice()); auto* vk = vulkanDevice->getFunctionTable(); - auto getVkShaderGroupStackSize = [&](uint32_t baseGroupIx, uint32_t shaderGroupIx, uint32_t shaderIx, VkShaderGroupShaderKHR shaderType) -> uint16_t + auto getVkShaderGroupStackSize = [&](uint32_t baseGroupIx, uint32_t shaderGroupIx, const asset::IShader* shader, VkShaderGroupShaderKHR shaderType) -> uint16_t { - if (shaderIx == SShaderGroupsParams::SIndex::Unused) + if (shader == nullptr) return 0; return vk->vk.vkGetRayTracingShaderGroupStackSizeKHR( @@ -36,14 +36,17 @@ namespace nbl::video ); }; - m_raygenStackSize = getVkShaderGroupStackSize(getRaygenIndex(), 0, params.shaderGroups.raygen.index, VK_SHADER_GROUP_SHADER_GENERAL_KHR); + m_callableGroupCount = params.shaderGroups.callables.size(); + m_missGroupCount = params.shaderGroups.misses.size(); + m_hitGroupCount = params.shaderGroups.hits.size(); + m_raygenStackSize = getVkShaderGroupStackSize(getRaygenIndex(), 0, params.shaderGroups.raygen.shader, VK_SHADER_GROUP_SHADER_GENERAL_KHR); for (size_t shaderGroupIx = 0; shaderGroupIx < params.shaderGroups.misses.size(); shaderGroupIx++) { m_missStackSizes->operator[](shaderGroupIx) = getVkShaderGroupStackSize( getMissBaseIndex(), shaderGroupIx, - params.shaderGroups.misses[shaderGroupIx].index, + params.shaderGroups.misses[shaderGroupIx].shader, VK_SHADER_GROUP_SHADER_GENERAL_KHR); } @@ -52,9 +55,9 @@ namespace nbl::video const auto& hitGroup = params.shaderGroups.hits[shaderGroupIx]; const auto baseIndex = getHitBaseIndex(); m_hitGroupStackSizes->operator[](shaderGroupIx) = SHitGroupStackSize{ - .closestHit = 
getVkShaderGroupStackSize(baseIndex,shaderGroupIx, hitGroup.closestHit, VK_SHADER_GROUP_SHADER_CLOSEST_HIT_KHR), - .anyHit = getVkShaderGroupStackSize(baseIndex, shaderGroupIx, hitGroup.anyHit,VK_SHADER_GROUP_SHADER_ANY_HIT_KHR), - .intersection = getVkShaderGroupStackSize(baseIndex, shaderGroupIx, hitGroup.intersection, VK_SHADER_GROUP_SHADER_INTERSECTION_KHR), + .closestHit = getVkShaderGroupStackSize(baseIndex,shaderGroupIx, hitGroup.closestHit.shader, VK_SHADER_GROUP_SHADER_CLOSEST_HIT_KHR), + .anyHit = getVkShaderGroupStackSize(baseIndex, shaderGroupIx, hitGroup.anyHit.shader,VK_SHADER_GROUP_SHADER_ANY_HIT_KHR), + .intersection = getVkShaderGroupStackSize(baseIndex, shaderGroupIx, hitGroup.intersection.shader, VK_SHADER_GROUP_SHADER_INTERSECTION_KHR), }; } @@ -63,7 +66,7 @@ namespace nbl::video m_callableStackSizes->operator[](shaderGroupIx) = getVkShaderGroupStackSize( getCallableBaseIndex(), shaderGroupIx, - params.shaderGroups.callables[shaderGroupIx].index, + params.shaderGroups.callables[shaderGroupIx].shader, VK_SHADER_GROUP_SHADER_GENERAL_KHR); } } @@ -83,19 +86,19 @@ namespace nbl::video std::span CVulkanRayTracingPipeline::getMissHandles() const { const auto baseIndex = getMissBaseIndex(); - return std::span(m_shaderGroupHandles->begin() + baseIndex, m_missShaderGroups->size()); + return std::span(m_shaderGroupHandles->begin() + baseIndex, m_missGroupCount); } std::span CVulkanRayTracingPipeline::getHitHandles() const { const auto baseIndex = getHitBaseIndex(); - return std::span(m_shaderGroupHandles->begin() + baseIndex, m_hitShaderGroups->size()); + return std::span(m_shaderGroupHandles->begin() + baseIndex, m_hitGroupCount); } std::span CVulkanRayTracingPipeline::getCallableHandles() const { const auto baseIndex = getCallableBaseIndex(); - return std::span(m_shaderGroupHandles->begin() + baseIndex, m_callableShaderGroups->size()); + return std::span(m_shaderGroupHandles->begin() + baseIndex, m_callableGroupCount); } uint16_t CVulkanRayTracingPipeline::getRaygenStackSize() const @@ -159,13 +162,13 @@ namespace nbl::video uint32_t CVulkanRayTracingPipeline::getHitBaseIndex() const { // one raygen group + miss groups before this groups - return 1 + m_missShaderGroups->size(); + return 1 + m_missGroupCount; } uint32_t CVulkanRayTracingPipeline::getCallableBaseIndex() const { // one raygen group + miss groups + hit groups before this groups - return 1 + m_missShaderGroups->size() + m_hitShaderGroups->size(); + return 1 + m_missGroupCount + m_hitGroupCount; } } diff --git a/src/nbl/video/IGPUAccelerationStructure.cpp b/src/nbl/video/IGPUAccelerationStructure.cpp index b975742436..828ba309b8 100644 --- a/src/nbl/video/IGPUAccelerationStructure.cpp +++ b/src/nbl/video/IGPUAccelerationStructure.cpp @@ -5,7 +5,7 @@ namespace nbl::video { -template +template requires (!std::is_const_v && std::is_base_of_v) bool IGPUAccelerationStructure::BuildInfo::invalid(const IGPUAccelerationStructure* const src, const IGPUAccelerationStructure* const dst) const { // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkBuildAccelerationStructuresIndirectKHR-dstAccelerationStructure-03800 @@ -61,7 +61,7 @@ bool IGPUAccelerationStructure::BuildInfo::invalid(const IGPUAcceler //extern template class IGPUAccelerationStructure::BuildInfo; -template +template requires (!std::is_const_v && std::is_base_of_v) template// requires nbl::is_any_of_v,uint32_t,IGPUBottomLevelAccelerationStructure::BuildRangeInfo>,IGPUBottomLevelAccelerationStructure::BuildRangeInfo> uint32_t 
IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const T* const buildRangeInfosOrMaxPrimitiveCounts) const { @@ -139,11 +139,11 @@ uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(cons retval += geometryCount*MaxBuffersPerGeometry; return retval; } -template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::template valid(const uint32_t* const) const; -template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::template valid(const uint32_t* const) const; +template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const uint32_t* const) const; +template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const uint32_t* const) const; using BuildRangeInfo = hlsl::acceleration_structures::bottom_level::BuildRangeInfo; -template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::template valid(const BuildRangeInfo* const) const; -template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::template valid(const BuildRangeInfo* const) const; +template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const BuildRangeInfo* const) const; +template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const BuildRangeInfo* const) const; bool IGPUBottomLevelAccelerationStructure::validVertexFormat(const asset::E_FORMAT format) const { diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index 3e776782fc..1f619666ab 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -235,8 +235,8 @@ bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo #endif // _NBL_DEBUG return false; } -template bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo&) const; -template bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo&) const; +template NBL_API2 bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo&) const; +template NBL_API2 bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo&) const; bool IGPUCommandBuffer::setEvent(IEvent* _event, const SEventDependencyInfo& depInfo) { @@ -842,30 +842,27 @@ uint32_t IGPUCommandBuffer::buildAccelerationStructures_common(const std::span) { const auto blasCount = info.trackedBLASes.size(); - if (blasCount) - m_TLASToBLASReferenceSets[info.dstAS] = {reinterpret_cast(oit-blasCount),blasCount}; - else - m_TLASToBLASReferenceSets[info.dstAS] = {}; + m_TLASTrackingOps.emplace_back(TLASTrackingWrite{.src={oit-blasCount,blasCount},.dst=info.dstAS}); } } return totalGeometries; } -template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( +template NBL_API2 uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUBottomLevelAccelerationStructure::DirectBuildRangeRangeInfos, const IGPUBuffer* const ); -template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( +template NBL_API2 uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUBottomLevelAccelerationStructure::MaxInputCounts* const, const IGPUBuffer* const ); -template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( +template NBL_API2 uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUTopLevelAccelerationStructure::DirectBuildRangeRangeInfos, const IGPUBuffer* const ); -template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( +template NBL_API2 uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, 
IGPUTopLevelAccelerationStructure::MaxInputCounts* const, const IGPUBuffer* const ); - -bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUAccelerationStructure::CopyInfo& copyInfo) +template requires std::is_base_of_v +bool IGPUCommandBuffer::copyAccelerationStructure(const AccelerationStructure::CopyInfo& copyInfo) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::TRANSFER_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -888,10 +885,16 @@ bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUAccelerationStructur } m_noCommands = false; - return copyAccelerationStructure_impl(copyInfo); + const bool retval = copyAccelerationStructure_impl(copyInfo.src,copyInfo.dst,copyInfo.compact); + if constexpr (std::is_same_v) + m_TLASTrackingOps.emplace_back(TLASTrackingCopy{.src=copyInfo.src,.dst=copyInfo.dst}); + return retval; } +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUBottomLevelAccelerationStructure::CopyInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUTopLevelAccelerationStructure::CopyInfo&); -bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) +template requires std::is_base_of_v +bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const AccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::TRANSFER_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -911,10 +914,16 @@ bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUAcceleration } m_noCommands = false; - return copyAccelerationStructureToMemory_impl(copyInfo); + const bool retval = copyAccelerationStructureToMemory_impl(copyInfo.src,copyInfo.dst); + if constexpr (std::is_same_v) + m_TLASTrackingOps.emplace_back(TLASTrackingRead{.src=copyInfo.src,.dst=copyInfo.trackedBLASes}); + return retval; } +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyToMemoryInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyToMemoryInfo&); -bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) +template requires std::is_base_of_v +bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const AccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::TRANSFER_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -934,8 +943,24 @@ bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUAccelerati } m_noCommands = false; - return copyAccelerationStructureFromMemory_impl(copyInfo); + const bool retval = copyAccelerationStructureFromMemory_impl(copyInfo.src,copyInfo.dst); + if constexpr (std::is_same_v) + { + const auto size = copyInfo.trackedBLASes.size(); + auto oit = reserveReferences(size); + if (oit) + { + m_TLASTrackingOps.emplace_back(TLASTrackingWrite{.src={oit,size},.dst=copyInfo.dst}); + for (const auto& blas : copyInfo.trackedBLASes) + *(oit++) = core::smart_refctd_ptr(blas); + } + else + NBL_LOG_ERROR("out of host memory for BLAS tracking references, TLAS will be copied from memory without BLAS tracking data!"); + } + return retval; } +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const 
IGPUBottomLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); bool IGPUCommandBuffer::bindComputePipeline(const IGPUComputePipeline* const pipeline) @@ -1661,8 +1686,8 @@ bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBinding(const asset::SBufferBinding&, const uint32_t, uint32_t); -template bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBinding&, const uint32_t, uint32_t); +template NBL_API2 bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBinding&, const uint32_t, uint32_t); +template NBL_API2 bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBinding&, const uint32_t, uint32_t); template requires nbl::is_any_of_v bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding& indirectBinding, const asset::SBufferBinding& countBinding, const uint32_t maxDrawCount, const uint32_t stride) @@ -1680,8 +1705,8 @@ bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding(const asset::SBufferBinding&, const asset::SBufferBinding&, const uint32_t, const uint32_t); -template bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding&, const asset::SBufferBinding&, const uint32_t, const uint32_t); +template NBL_API2 bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding&, const asset::SBufferBinding&, const uint32_t, const uint32_t); +template NBL_API2 bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding&, const asset::SBufferBinding&, const uint32_t, const uint32_t); bool IGPUCommandBuffer::drawIndirect(const asset::SBufferBinding& binding, const uint32_t drawCount, const uint32_t stride) { @@ -2078,22 +2103,18 @@ bool IGPUCommandBuffer::executeCommands(const uint32_t count, IGPUCommandBuffer* return executeCommands_impl(count,cmdbufs); } -bool IGPUCommandBuffer::recordReferences(const std::span refs) +core::smart_refctd_ptr* IGPUCommandBuffer::reserveReferences(const uint32_t size) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::GRAPHICS_BIT|queue_flags_t::TRANSFER_BIT|queue_flags_t::SPARSE_BINDING_BIT)) - return false; + return nullptr; - auto cmd = m_cmdpool->m_commandListPool.emplace(m_commandList,refs.size()); + auto cmd = m_cmdpool->m_commandListPool.emplace(m_commandList,size); if (!cmd) { NBL_LOG_ERROR("out of host memory!"); - return false; + return nullptr; } - auto oit = cmd->getVariableCountResources(); - for (const auto& ref : refs) - *(oit++) = core::smart_refctd_ptr(ref); - - return true; + return cmd->getVariableCountResources(); } } \ No newline at end of file diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 26cfc4c6a8..983daed190 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -7,50 +7,61 @@ using namespace nbl; using namespace nbl::video; -static void debloatShaders(const asset::ISPIRVDebloater& debloater, std::span shaderSpecs, core::vector>& outShaders, asset::IPipelineBase::SShaderSpecInfo* outShaderSpecInfos, system::logger_opt_ptr logger = nullptr) +class SpirvTrimTask { - using EntryPoints = core::set; - core::map entryPointsMap; - - // collect all entry points first before we debloat - for (const auto& shaderSpec : shaderSpecs) { - const auto* shader = shaderSpec.shader; - auto it = entryPointsMap.find(shader); - if (it == entryPointsMap.end() || it->first != 
shader) - it = entryPointsMap.emplace_hint(it, shader, EntryPoints()); - it->second.insert({ .name = shaderSpec.entryPoint, .stage = shaderSpec.stage }); - } + public: + using EntryPoints = core::set; + struct ShaderInfo + { + EntryPoints entryPoints; + const asset::IShader* trimmedShader; + }; - core::map debloatedShaders; - for (const auto& shaderSpec: shaderSpecs) - { - const auto* shader = shaderSpec.shader; - const auto& entryPoints = entryPointsMap[shader]; + SpirvTrimTask(asset::ISPIRVEntryPointTrimmer* trimer, system::logger_opt_ptr logger) : m_trimmer(trimer), m_logger(logger) + { + + } + + void insertEntryPoint(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, const hlsl::ShaderStage stage) + { + const auto* shader = shaderSpec.shader; + auto it = m_shaderInfoMap.find(shader); + if (it == m_shaderInfoMap.end() || it->first != shader) + it = m_shaderInfoMap.emplace_hint(it, shader, ShaderInfo{ EntryPoints(), nullptr } ); + it->second.entryPoints.insert({ .name = shaderSpec.entryPoint, .stage = stage }); + } - auto debloatedShaderSpec = shaderSpec; - if (shader != nullptr) + IGPUPipelineBase::SShaderSpecInfo trim(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, core::vector>& outShaders) { - if (!debloatedShaders.contains(shader)) + const auto* shader = shaderSpec.shader; + auto findResult = m_shaderInfoMap.find(shader); + assert(findResult != m_shaderInfoMap.end()); + const auto& entryPoints = findResult->second.entryPoints; + auto& trimmedShader = findResult->second.trimmedShader; + + auto trimmedShaderSpec = shaderSpec; + if (shader != nullptr) { - const auto outShadersData = outShaders.data(); - outShaders.push_back(debloater.debloat(shader, entryPoints, logger)); - assert(outShadersData == outShaders.data()); - debloatedShaders.emplace(shader, outShaders.back().get()); + if (trimmedShader == nullptr) + { + outShaders.push_back(m_trimmer->trim(shader, entryPoints, m_logger)); + trimmedShader = outShaders.back().get(); + } + trimmedShaderSpec.shader = trimmedShader; } - const auto debloatedShader = debloatedShaders[shader]; - debloatedShaderSpec.shader = debloatedShader; + return trimmedShaderSpec; } - *outShaderSpecInfos = debloatedShaderSpec; - - outShaderSpecInfos++; - } - -} + + private: + core::map m_shaderInfoMap; + asset::ISPIRVEntryPointTrimmer* m_trimmer; + const system::logger_opt_ptr m_logger; +}; ILogicalDevice::ILogicalDevice(core::smart_refctd_ptr&& api, const IPhysicalDevice* const physicalDevice, const SCreationParams& params, const bool runningInRenderdoc) : m_api(api), m_physicalDevice(physicalDevice), m_enabledFeatures(params.featuresToEnable), m_compilerSet(params.compilerSet), m_logger(m_physicalDevice->getDebugCallback() ? 
m_physicalDevice->getDebugCallback()->getLogger() : nullptr), - m_spirvDebloater(core::make_smart_refctd_ptr()) + m_spirvTrimmer(core::make_smart_refctd_ptr()) { { uint32_t qcnt = 0u; @@ -781,16 +792,8 @@ asset::ICPUPipelineCache::SCacheKey ILogicalDevice::getPipelineCacheKey() const bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output) { std::fill_n(output,params.size(),nullptr); - IGPUComputePipeline::SCreationParams::SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params,[this](const asset::IPipelineBase::SShaderSpecInfo& info)->bool - { - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02755 - if (info.requiredSubgroupSize>=asset::IPipelineBase::SShaderSpecInfo::SUBGROUP_SIZE::REQUIRE_4 && !getPhysicalDeviceLimits().requiredSubgroupSizeStages.hasFlags(info.stage)) - { - NBL_LOG_ERROR("Invalid shader stage"); - return false; - } - return true; - }); + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache, params); + if (!specConstantValidation) { NBL_LOG_ERROR("Invalid parameters were given"); @@ -798,17 +801,20 @@ bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCac } core::vector newParams(params.begin(), params.end()); - const auto shaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) - { - return sum + param.getShaders().size(); - }); - core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling - debloatedShaders.reserve(shaderCount); + const auto shaderCount = params.size(); + + core::vector> trimmedShaders; // vector to hold all the trimmed shaders, so the pointer from the new ShaderSpecInfo is not dangling + trimmedShaders.reserve(shaderCount); for (auto ix = 0u; ix < params.size(); ix++) { const auto& ci = params[ix]; - debloatShaders(*m_spirvDebloater.get(), ci.getShaders(), debloatedShaders, &newParams[ix].shader, m_logger); + + const core::set entryPoints = { asset::ISPIRVEntryPointTrimmer::EntryPoint{.name = ci.shader.entryPoint, .stage = hlsl::ShaderStage::ESS_COMPUTE} }; + trimmedShaders.push_back(m_spirvTrimmer->trim(ci.shader.shader, entryPoints, m_logger)); + auto trimmedShaderSpec = ci.shader; + trimmedShaderSpec.shader = trimmedShaders.back().get(); + newParams[ix].shader = trimmedShaderSpec; } createComputePipelines_impl(pipelineCache,newParams,output,specConstantValidation); @@ -834,14 +840,7 @@ bool ILogicalDevice::createGraphicsPipelines( ) { std::fill_n(output, params.size(), nullptr); - IGPUGraphicsPipeline::SCreationParams::SSpecializationValidationResult specConstantValidation = commonCreatePipelines(nullptr, params, - [this](const asset::IPipelineBase::SShaderSpecInfo& info)->bool - { - if (info.stage != hlsl::ShaderStage::ESS_VERTEX) - return true; - return info.shader; - } - ); + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(nullptr, params); if (!specConstantValidation) { NBL_LOG_ERROR("Invalid parameters were given"); @@ -853,17 +852,35 @@ bool ILogicalDevice::createGraphicsPipelines( core::vector newParams(params.begin(), params.end()); const auto shaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) { - return sum + param.getShaders().size(); + return sum + 
param.getShaderCount(); }); - core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling - debloatedShaders.reserve(shaderCount); - - core::vector debloatedShaderSpecs(shaderCount); - auto outShaderSpecs = debloatedShaderSpecs.data(); + core::vector> trimmedShaders; // vector to hold all the trimmed shaders, so the pointer from the new ShaderSpecInfo is not dangling + trimmedShaders.reserve(shaderCount); for (auto ix = 0u; ix < params.size(); ix++) { const auto& ci = params[ix]; + + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-00704 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-00705 + if (ci.tesselationControlShader.shader) + { + NBL_LOG_ERROR("Cannot create IGPUShader for %p, Tessellation Shader feature not enabled!", ci.tesselationControlShader.shader); + return false; + } + + if (ci.tesselationEvaluationShader.shader) + { + NBL_LOG_ERROR("Cannot create IGPUShader for %p, Tessellation Shader feature not enabled!", ci.tesselationEvaluationShader.shader); + return false; + } + + if (ci.geometryShader.shader) + { + NBL_LOG_ERROR("Cannot create IGPUShader for %p, Geometry Shader feature not enabled!", ci.geometryShader.shader); + return false; + } + auto renderpass = ci.renderpass; if (!renderpass->wasCreatedBy(this)) { @@ -953,9 +970,19 @@ bool ILogicalDevice::createGraphicsPipelines( } } } + + SpirvTrimTask trimTask(m_spirvTrimmer.get(), m_logger); + trimTask.insertEntryPoint(ci.vertexShader, hlsl::ShaderStage::ESS_VERTEX); + trimTask.insertEntryPoint(ci.tesselationControlShader, hlsl::ShaderStage::ESS_TESSELLATION_CONTROL); + trimTask.insertEntryPoint(ci.tesselationEvaluationShader, hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION); + trimTask.insertEntryPoint(ci.geometryShader, hlsl::ShaderStage::ESS_GEOMETRY); + trimTask.insertEntryPoint(ci.fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT); - newParams[ix].shaders = std::span(outShaderSpecs, ci.getShaders().size()); - debloatShaders(*m_spirvDebloater.get(), ci.getShaders(), debloatedShaders, outShaderSpecs, m_logger); + newParams[ix].vertexShader = trimTask.trim(ci.vertexShader, trimmedShaders); + newParams[ix].tesselationControlShader = trimTask.trim(ci.tesselationControlShader, trimmedShaders); + newParams[ix].tesselationEvaluationShader = trimTask.trim(ci.tesselationEvaluationShader, trimmedShaders); + newParams[ix].geometryShader = trimTask.trim(ci.geometryShader, trimmedShaders); + newParams[ix].fragmentShader = trimTask.trim(ci.fragmentShader, trimmedShaders); } createGraphicsPipelines_impl(pipelineCache, newParams, output, specConstantValidation); @@ -980,10 +1007,7 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline core::smart_refctd_ptr* const output) { std::fill_n(output,params.size(),nullptr); - IGPURayTracingPipeline::SCreationParams::SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params,[this](const asset::IPipelineBase::SShaderSpecInfo& info)->bool - { - return true; - }); + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params); if (!specConstantValidation) { NBL_LOG_ERROR("Invalid parameters were given"); @@ -1004,6 +1028,12 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline const bool 
skipAABBs = bool(param.flags & IGPURayTracingPipeline::SCreationParams::FLAGS::SKIP_AABBS); const bool skipBuiltin = bool(param.flags & IGPURayTracingPipeline::SCreationParams::FLAGS::SKIP_BUILT_IN_PRIMITIVES); + if (!features.rayTracingPipeline) + { + NBL_LOG_ERROR("Raytracing Pipeline feature not enabled!"); + return {}; + } + // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-rayTraversalPrimitiveCulling-03597 if (skipAABBs && skipBuiltin) { @@ -1028,15 +1058,28 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline } core::vector newParams(params.begin(), params.end()); - const auto shaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + core::vector> trimmedShaders; // vector to hold all the trimmed shaders, so the pointer from the new ShaderSpecInfo is not dangling + + const auto missGroupCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + { + return sum + param.shaderGroups.misses.size(); + }); + const auto hitGroupCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + { + return sum + param.shaderGroups.hits.size(); + }); + const auto callableGroupCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) { - return sum + param.getShaders().size(); + return sum + param.shaderGroups.callables.size(); }); - core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling - debloatedShaders.reserve(shaderCount); - core::vector debloatedShaderSpecs(shaderCount); - auto outShaderSpecs = debloatedShaderSpecs.data(); + + core::vector trimmedMissSpecs(missGroupCount); + auto trimmedMissSpecData = trimmedMissSpecs.data(); + core::vector trimmedHitSpecs(hitGroupCount); + auto trimmedHitSpecData = trimmedHitSpecs.data(); + core::vector trimmedCallableSpecs(callableGroupCount); + auto trimmedCallableSpecData = trimmedCallableSpecs.data(); const auto& limits = getPhysicalDeviceLimits(); for (auto ix = 0u; ix < params.size(); ix++) @@ -1050,14 +1093,47 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline NBL_LOG_ERROR("Invalid maxRecursionDepth. 
maxRecursionDepth(%u) exceed the limits(%u)", param.cached.maxRecursionDepth, limits.maxRayRecursionDepth); return false; } - if (param.getShaders().empty()) + + SpirvTrimTask trimTask(m_spirvTrimmer.get(), m_logger); + trimTask.insertEntryPoint(param.shaderGroups.raygen, hlsl::ShaderStage::ESS_RAYGEN); + for (const auto& miss : param.shaderGroups.misses) + trimTask.insertEntryPoint(miss, hlsl::ShaderStage::ESS_MISS); + for (const auto& hit : param.shaderGroups.hits) { - NBL_LOG_ERROR("Pipeline must have at least one shader."); - return false; + trimTask.insertEntryPoint(hit.closestHit, hlsl::ShaderStage::ESS_CLOSEST_HIT); + trimTask.insertEntryPoint(hit.anyHit, hlsl::ShaderStage::ESS_ANY_HIT); + trimTask.insertEntryPoint(hit.intersection, hlsl::ShaderStage::ESS_INTERSECTION); } + for (const auto& callable : param.shaderGroups.callables) + trimTask.insertEntryPoint(callable, hlsl::ShaderStage::ESS_CALLABLE); + + newParams[ix] = param; + newParams[ix].shaderGroups.raygen = trimTask.trim(param.shaderGroups.raygen, trimmedShaders); - newParams[ix].shaders = std::span(outShaderSpecs, param.getShaders().size()); - debloatShaders(*m_spirvDebloater.get(), param.getShaders(), debloatedShaders, outShaderSpecs, m_logger); + newParams[ix].shaderGroups.misses = trimmedMissSpecs; + for (const auto& miss: param.shaderGroups.misses) + { + *trimmedMissSpecData = trimTask.trim(miss, trimmedShaders); + trimmedMissSpecData++; + } + + newParams[ix].shaderGroups.hits = trimmedHitSpecs; + for (const auto& hit: param.shaderGroups.hits) + { + *trimmedHitSpecData = { + .closestHit = trimTask.trim(hit.closestHit, trimmedShaders), + .anyHit = trimTask.trim(hit.anyHit, trimmedShaders), + .intersection = trimTask.trim(hit.intersection, trimmedShaders), + }; + trimmedHitSpecData++; + } + + newParams[ix].shaderGroups.callables = trimmedCallableSpecs; + for (const auto& callable: param.shaderGroups.callables) + { + *trimmedCallableSpecData = trimTask.trim(callable, trimmedShaders); + trimmedCallableSpecData++; + } } createRayTracingPipelines_impl(pipelineCache, newParams,output,specConstantValidation); diff --git a/src/nbl/video/IQueue.cpp b/src/nbl/video/IQueue.cpp index e761b7a733..108f76183c 100644 --- a/src/nbl/video/IQueue.cpp +++ b/src/nbl/video/IQueue.cpp @@ -149,15 +149,66 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) auto outRes = m_resources->data(); for (const auto& sema : info.waitSemaphores) *(outRes++) = smart_ptr(sema.semaphore); + // track our own versions + core::unordered_map m_readTLASVersions; + // get the TLAS BLAS tracking info and assign a pending build version number + for (const auto& cb : info.commandBuffers) + for (const auto& var : cb.cmdbuf->m_TLASTrackingOps) + { + const IGPUTopLevelAccelerationStructure* src = nullptr; + switch (var.index()) + { + case 1: + src = std::get<1>(var).src; + break; + case 2: + src = std::get<2>(var).src; + break; + } + if (src) + m_readTLASVersions.insert({src,src->getPendingBuildVer()}); + } for (const auto& cb : info.commandBuffers) { *(outRes++) = smart_ptr(cb.cmdbuf); - // get the TLAS BLAS tracking info and assign a pending build version number - for (const auto& refSet : cb.cmdbuf->m_TLASToBLASReferenceSets) + for (const auto& var : cb.cmdbuf->m_TLASTrackingOps) + switch (var.index()) { - const auto tlas = refSet.first; - // in theory could assert no duplicate entries, but thats obvious - m_TLASToBLASReferenceSets[tlas] = { .m_BLASes = {refSet.second.begin(),refSet.second.end()}, .m_buildVer = 
tlas->registerNextBuildVer()}; + case 0: + { + const IGPUCommandBuffer::TLASTrackingWrite& op = std::get<0>(var); + + using iterator = decltype(op.src)::iterator; + m_readTLASVersions[op.dst] = m_TLASOverwrites[op.dst] = op.dst->pushTrackedBLASes({op.src.begin()},{op.src.end()}); + break; + } + case 1: + { + const IGPUCommandBuffer::TLASTrackingCopy& op = std::get<1>(var); + // not sure if even legal, but it would deadlock us + if (op.src==op.dst) + break; + const auto ver = m_readTLASVersions.find(op.src)->second; + // stop multiple threads messing with us + std::lock_guard lk(op.src->m_trackingLock); + const auto* pSrcBLASes = op.src->getPendingBuildTrackedBLASes(ver); + const std::span emptySpan = {}; + m_readTLASVersions[op.dst] = m_TLASOverwrites[op.dst] = pSrcBLASes ? op.dst->pushTrackedBLASes(pSrcBLASes->begin(),pSrcBLASes->end()):op.dst->pushTrackedBLASes(emptySpan.begin(),emptySpan.end()); + break; + } + case 2: + { + const IGPUCommandBuffer::TLASTrackingRead& op = std::get<2>(var); + const auto ver = m_readTLASVersions.find(op.src)->second; + uint32_t count = op.dst->size(); + op.src->getPendingBuildTrackedBLASes(&count,op.dst->data(),ver); + if (count>op.dst->size()) + cb.cmdbuf->getOriginDevice()->getLogger()->log("BLAS output array too small, should be %d, only wrote out %d BLAS references to destination",system::ILogger::ELL_ERROR,count,op.dst->size()); + break; + } + default: + assert(false); + break; } } // We don't hold the last signal semaphore, because the timeline does as an Event trigger. @@ -170,10 +221,10 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) IQueue::DeferredSubmitCallback& IQueue::DeferredSubmitCallback::operator=(DeferredSubmitCallback&& other) { - m_TLASToBLASReferenceSets = std::move(other.m_TLASToBLASReferenceSets); + m_TLASOverwrites = std::move(other.m_TLASOverwrites); m_resources = std::move(other.m_resources); m_callback = std::move(other.m_callback); - other.m_TLASToBLASReferenceSets = {}; + other.m_TLASOverwrites.clear(); other.m_resources = nullptr; other.m_callback = {}; return *this; @@ -182,13 +233,9 @@ IQueue::DeferredSubmitCallback& IQueue::DeferredSubmitCallback::operator=(Deferr // always exhaustive poll, because we need to get rid of resources ASAP void IQueue::DeferredSubmitCallback::operator()() { - // first update tracking info (needs resources alive) - for (const auto& refSet : m_TLASToBLASReferenceSets) - { - const auto tlas = refSet.first; - const auto& blases = refSet.second.m_BLASes; - tlas->setTrackedBLASes(blases.begin(),blases.end(),refSet.second.m_buildVer); - } + // all builds started before ours will now get overwritten (not exactly true, but without a better tracking system, this is the best we can do for now) + for (const auto& build : m_TLASOverwrites) + build.first->clearTrackedBLASes(build.second); // then free all resources m_resources = nullptr; // then execute the callback diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index d1615a4637..ad54409da4 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -410,7 +410,7 @@ class AssetVisitor : public CRTP } private: - // there is no `impl()` overload taking `ICPUTopLevelAccelerationStructure` same as there is no `ICPUmage` + // there is no `impl()` overload taking `ICPUBottomLevelAccelerationStructure` same as there is no `ICPUmage` inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) { const 
auto blasInstances = instance.asset->getInstances(); @@ -519,8 +519,8 @@ class AssetVisitor : public CRTP if (!layout || !descend(layout,{layout})) return false; const auto& specInfo = asset->getSpecInfo(); - const auto* shader = specInfo.shader; - if (!shader || !descend(shader,{shader},specInfo)) + const auto* shader = specInfo.shader.get(); + if (!shader || !descend(shader,{shader},specInfo, hlsl::ESS_COMPUTE)) return false; return true; } @@ -536,8 +536,8 @@ class AssetVisitor : public CRTP using stage_t = hlsl::ShaderStage; for (stage_t stage : {stage_t::ESS_VERTEX,stage_t::ESS_TESSELLATION_CONTROL,stage_t::ESS_TESSELLATION_EVALUATION,stage_t::ESS_GEOMETRY,stage_t::ESS_FRAGMENT}) { - const auto& specInfo = asset->getSpecInfo(stage); - const auto* shader = specInfo.shader; + const auto& specInfo = *asset->getSpecInfo(stage); + const auto* shader = specInfo.shader.get(); if (!shader) { if (stage==stage_t::ESS_VERTEX) // required @@ -545,7 +545,7 @@ class AssetVisitor : public CRTP CRTP::template nullOptional(); continue; } - if (!descend(shader,{shader},specInfo)) + if (!descend(shader,{shader}, specInfo, stage)) return false; } return true; @@ -570,8 +570,9 @@ class AssetVisitor : public CRTP const IDescriptorSetLayoutBase::CBindingRedirect::storage_range_index_t storageRangeIx(j); const auto binding = redirect.getBinding(storageRangeIx); const uint32_t count = redirect.getCount(storageRangeIx); - // this is where the descriptors have their flattened place in a unified array - const auto* infos = allInfos.data()+redirect.getStorageOffset(storageRangeIx).data; + // this is where the descriptors have their flattened place in a unified array + const auto storageBaseOffset = redirect.getStorageOffset(storageRangeIx); + const auto* infos = allInfos.data()+storageBaseOffset.data; for (uint32_t el=0u; el(untypedDesc); - if (!descend(tlas,{tlas},type,binding,el)) + if (!descend(tlas,{tlas},type,binding,el,storageBaseOffset)) return false; break; } @@ -1035,25 +1036,19 @@ class HashVisit : public CAssetConverter::CHashCache::hash_impl_base auto argTuple = std::tuple(extraArgs...); const auto& arg0 = std::get<0>(argTuple); // hash the spec info - if constexpr (std::is_same_v) + if constexpr (std::is_same_v) { + const auto stage = std::get<1>(argTuple); hasher << arg0.entryPoint; - hasher << arg0.stage; + assert(hlsl::bitCount(stage) == 1); + hasher << stage; hasher << arg0.requiredSubgroupSize; - switch (arg0.stage) + if (!arg0.entries.empty()) { - case hlsl::ShaderStage::ESS_COMPUTE: - hasher << arg0.requireFullSubgroups; - break; - default: - break; - } - if (arg0.entries) - { - for (const auto& specConstant : *arg0.entries) + for (const auto& specConstant : arg0.entries) { hasher << specConstant.first; - hasher.update(specConstant.second.data, specConstant.second.size); + hasher.update(specConstant.second.data(), specConstant.second.size()); } } } @@ -1108,6 +1103,8 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_thostBuild; hasher << lookup.patch->compactAfterBuild; // finally the contents + if (lookup.asset->getContentHash()==NoContentHash) + return false; hasher << lookup.asset->getContentHash(); return true; } @@ -1120,6 +1117,7 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_thostBuild; hasher << lookup.patch->compactAfterBuild; + hasher << (lookup.patch->isMotion ? 
lookup.patch->maxInstances:0u); const auto instances = asset->getInstances(); hasher << instances.size(); AssetVisitor> visitor = { @@ -1186,6 +1184,8 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t look creationFlags |= create_flags_t::ECF_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT; hasher << creationFlags; // finally the contents + if (lookup.asset->getContentHash()==NoContentHash) + return false; hasher << lookup.asset->getContentHash(); return true; } @@ -1289,6 +1289,8 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_tdata(),entry.first.meta->size()); } + if (lookup.asset->getContentHash()==NoContentHash) + return false; hasher << lookup.asset->getContentHash(); return true; } @@ -1303,6 +1305,8 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_tgetCachedCreationParams(); + hasher << params.requireFullSubgroups; return true; } bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) @@ -1611,8 +1615,7 @@ template<> class GetDependantVisit : public GetDependantVisitBase { public: - // because of zero access to the lifetime tracking between TLASes and BLASes, do nothing - //core::smart_refctd_ptr* const outBLASes; + CAssetConverter::SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t* instanceMap; protected: bool descend_impl( @@ -1624,7 +1627,7 @@ class GetDependantVisit : public GetDependant auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; - // outBLASes[instanceIndex] = std::move(depObj); + instanceMap->operator[](dep.asset) = std::move(depObj); return true; } }; @@ -1718,16 +1721,14 @@ class GetDependantVisit : public GetDependantVisitBase::value*/sizeof(IShader::E_SHADER_STAGE)*8> specInfo = {}; + ICPUPipelineBase::SShaderSpecInfo specInfo = {}; protected: bool descend_impl( @@ -1743,18 +1744,16 @@ class GetDependantVisit : public GetDependantVisitBase& user, const CAssetConverter::patch_t& userPatch, - const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const IPipelineBase::SShaderSpecInfo& inSpecInfo + const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const ICPUPipelineBase::SShaderSpecInfo& inSpecInfo, hlsl::ShaderStage stage ) { auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; - getSpecInfo(inSpecInfo.stage) = { - .shader = depObj.get(), + getSpecInfo() = ICPUPipelineBase::SShaderSpecInfo{ + .shader = depObj, .entryPoint = inSpecInfo.entryPoint, // warning: its a `string_view` now! 
- .stage = inSpecInfo.stage, .requiredSubgroupSize = inSpecInfo.requiredSubgroupSize, - .requireFullSubgroups = inSpecInfo.requireFullSubgroups, .entries = inSpecInfo.entries }; return true; @@ -1775,7 +1774,7 @@ class GetDependantVisit : public GetDependantVisitBase::value*/sizeof(IShader::E_SHADER_STAGE)*8> specInfo = {}; + std::array::value*/sizeof(IShader::E_SHADER_STAGE)*8> specInfo = {}; // optionals (done this way because inheritance chain with templated class hides protected methods) IGPURenderpass* renderpass = nullptr; @@ -1793,18 +1792,16 @@ class GetDependantVisit : public GetDependantVisitBase& user, const CAssetConverter::patch_t& userPatch, - const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const IPipelineBase::SShaderSpecInfo& inSpecInfo + const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const ICPUPipelineBase::SShaderSpecInfo& inSpecInfo, hlsl::ShaderStage stage ) { auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; - getSpecInfo(inSpecInfo.stage) = { - .shader = depObj.get(), + getSpecInfo(stage) = { + .shader = depObj, .entryPoint = inSpecInfo.entryPoint, // warning: its a `string_view` now! - .stage = inSpecInfo.stage, .requiredSubgroupSize = inSpecInfo.requiredSubgroupSize, - .requireFullSubgroups = 0, .entries = inSpecInfo.entries }; return true; @@ -1828,8 +1825,6 @@ class GetDependantVisit : public GetDependantVisitBase : public GetDependantVisitBase writes = {}; core::vector infos = {}; - core::vector deferredTLASWrites; + core::vector potentialTLASRewrites = {}; // has to be public because of aggregate init, but its only for internal usage! uint32_t lastBinding; uint32_t lastElement; @@ -1904,15 +1899,8 @@ class GetDependantVisit : public GetDependantVisitBase) - { - deferredTLASWrites.push_back({nullptr,binding.data,element,depObj}); - return true; - } // auto& outInfo = infos.emplace_back(); - outInfo.desc = std::move(depObj); // extra stuff auto argTuple = std::tuple(extraArgs...); if constexpr (std::is_same_v) @@ -1920,10 +1908,18 @@ class GetDependantVisit : public GetDependantVisitBase(argTuple); - outInfo.info.buffer.offset= std::get<0>(argTuple).offset; + outInfo.info.buffer.offset = std::get<0>(argTuple).offset; outInfo.info.buffer.size = std::get<0>(argTuple).size; } } + // mark potential TLAS rewrites (with compaction) so we don't have to scan entire descriptor set for potentially compacted TLASes + if constexpr (std::is_same_v) + if (depObj->getPendingBuildVer()==0) // means not built yet, so compactable by next `convert` run + { + auto storageOffset = std::get<0>(argTuple); + storageOffset.data += element; + potentialTLASRewrites.push_back(storageOffset); + } if constexpr (std::is_same_v) { outInfo.info.image.imageLayout = std::get<0>(argTuple); @@ -1934,25 +1930,12 @@ class GetDependantVisit : public GetDependantVisitBase -struct unique_conversion_t -{ - const AssetType* canonicalAsset = nullptr; - patch_index_t patchIndex = {}; - size_t firstCopyIx : 40 = 0u; - size_t copyCount : 24 = 1u; -}; - -// Map from ContentHash to canonical asset & patch and the list of uniqueCopyGroupIDs -template -using conversions_t = core::unordered_map>; - // Needed both for reservation and conversion class MetaDeviceMemoryAllocator final { @@ -1985,6 +1968,7 @@ class MetaDeviceMemoryAllocator final if ((memReqs.memoryTypeBits&memoryTypeConstraint)==0) { m_logger.log("Overconstrained the Memory Type Index bitmask %d with %d for 
%s",system::ILogger::ELL_ERROR,memReqs.memoryTypeBits,memoryTypeConstraint,gpuObj->getObjectDebugName()); + pGpuObj->value = nullptr; return false; } // @@ -2004,6 +1988,7 @@ class MetaDeviceMemoryAllocator final if (!allocation.isValid()) { m_logger.log("Failed to allocate and bind dedicated memory for %s",system::ILogger::ELL_ERROR,gpuObj->getObjectDebugName()); + pGpuObj->value = nullptr; return false; } } @@ -2244,6 +2229,244 @@ class MetaDeviceMemoryAllocator final core::map> allocationRequests; }; +// for dem ReBAR goodies +bool canHostWriteToMemoryRange(const IDeviceMemoryBacked::SMemoryBinding& binding, const size_t length) +{ + assert(binding.isValid()); + const auto* memory = binding.memory; + const auto& mappedRange = memory->getMappedRange(); + return memory->isCurrentlyMapped() && memory->getCurrentMappingAccess().hasFlags(IDeviceMemoryAllocation::EMCAF_WRITE) && mappedRange.offset<=binding.offset && binding.offset+length<=mappedRange.offset+mappedRange.length; +} + +// +template +struct unique_conversion_t +{ + const AssetType* canonicalAsset = nullptr; + patch_index_t patchIndex = {}; + size_t firstCopyIx : 40 = 0u; + size_t copyCount : 24 = 1u; +}; + +// +inline void setDebugName(const CAssetConverter* conv, IBackendObject* gpuObj, const core::blake3_hash_t& contentHash, const uint64_t uniqueCopyGroupID) +{ + std::ostringstream debugName; + debugName << "Created by Converter "; + debugName << std::hex; + debugName << conv; + debugName << " from Asset with hash "; + for (const auto& byte : contentHash.data) + debugName << uint32_t(byte) << " "; + debugName << "for Group " << uniqueCopyGroupID; + gpuObj->setObjectDebugName(debugName.str().c_str()); +} + +// Map from ContentHash to canonical asset & patch and the list of uniqueCopyGroupIDs +template +struct conversions_t +{ + public: + // Go through the dfsCache and work out each entry's content hashes, so that we can carry out unique conversions. + void gather(core::tuple_transform_t& dfsCaches, CAssetConverter::CHashCache* hashCache, const CAssetConverter::CCache* readCache) + { + auto& dfsCache = std::get>(dfsCaches); + dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + { + // compute the hash or look it up if it exists + // We mistrust every dependency such that the eject/update if needed. + // Its really important that the Deduplication gets performed Bottom-Up + auto& contentHash = created.contentHash; + PatchOverride patchOverride(*inputs,dfsCaches,instance.uniqueCopyGroupID); + contentHash = hashCache->hash( + {instance.asset,&created.patch}, + &patchOverride, + /*.mistrustLevel =*/ 1 + ); + // failed to hash all together (only possible reason is failure of `PatchGetter` to provide a valid patch) + if (contentHash==CAssetConverter::CHashCache::NoContentHash) + { + inputs->logger.log("Could not compute hash for asset %p in group %d, maybe an IPreHashed dependant's content hash is missing?",system::ILogger::ELL_ERROR,instance.asset,instance.uniqueCopyGroupID); + return; + } + const auto hashAsU64 = reinterpret_cast(contentHash.data); + { + inputs->logger.log("Asset (%p,%d) has hash %8llx%8llx%8llx%8llx",system::ILogger::ELL_DEBUG,instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3]); + } + // if we have a read cache, lets retry looking the item up! 
+ if (readCache) + { + // We can't look up "near misses" (supersets of patches) because they'd have different hashes + // and we can't afford to split hairs like finding overlapping buffer ranges, etc. + // Stuff like that would require a completely different hashing/lookup strategy (or multiple fake entries). + const auto found = readCache->find({contentHash,instance.uniqueCopyGroupID}); + if (found!=readCache->forwardMapEnd()) + { + created.gpuObj = found->second; + inputs->logger.log( + "Asset (%p,%d) with hash %8llx%8llx%8llx%8llx found its GPU Object in Read Cache",system::ILogger::ELL_DEBUG, + instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + return; + } + } + // The conversion request we insert needs an instance asset whose unconverted dependencies don't have missing content + // SUPER SIMPLIFICATION: because we hash and search for readCache items bottom up (BFS), we don't need a stack (DFS) here! + // Any dependant that's not getting a GPU object due to missing content or GPU cache object for its cache, will show up later during `getDependant` + // An additional optimization would be to improve the `PatchGetter` to check dependants (only deps) during hashing for missing dfs cache gpu Object (no read cache) and no conversion request. + auto* isPrehashed = dynamic_cast(instance.asset); + if (isPrehashed && isPrehashed->missingContent()) + { + inputs->logger.log( + "PreHashed Asset (%p,%d) with hash %8llx%8llx%8llx%8llx has missing content and no GPU Object in Read Cache!",system::ILogger::ELL_ERROR, + instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + return; + } + // then de-duplicate the conversions needed + const patch_index_t patchIx = {static_cast(std::distance(dfsCache.nodes.data(),&created))}; + auto [inSetIt,inserted] = contentHashToCanonical.emplace(contentHash,unique_conversion_t{.canonicalAsset=instance.asset,.patchIndex=patchIx}); + if (!inserted) + { + // If an element prevented insertion, the patch must be identical! + // Because the conversions don't care about groupIDs, the patches may be identical but not the same object in memory. 
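The copy-group mapping worked out next is a counting sort: every unique content hash records how many GPU copies it needs, an exclusive scan over those counts gives each request its first output slot, the fill pass bumps that slot as it places group IDs, and a second scan restores the offsets for later passes. A small sketch of the scan with illustrative names:

#include <cstddef>
#include <vector>

struct Request { std::size_t firstCopyIx = 0; std::size_t copyCount = 0; };

std::size_t exclusiveScan(std::vector<Request>& requests)
{
    std::size_t sum = 0;
    for (auto& r : requests)
    {
        r.firstCopyIx = sum; // where this request's copies start in the flat output array
        sum += r.copyCount;
    }
    return sum; // total number of GPU objects that need to be created
}

// Example: copy counts {2,1,3} map to firstCopyIx {0,2,3} and a total of 6 output slots.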
+ assert(inSetIt->second.patchIndex==patchIx || dfsCache.nodes[inSetIt->second.patchIndex.value].patch==dfsCache.nodes[patchIx.value].patch); + inSetIt->second.copyCount++; + } + } + ); + + // work out mapping of `conversionRequests` to multiple GPU objects and their copy groups via counting sort + { + // assign storage offsets via exclusive scan and put the `uniqueGroupID` mappings in sorted order + auto exclScanConvReqs = [&]()->size_t + { + size_t sum = 0; + for (auto& entry : contentHashToCanonical) + { + entry.second.firstCopyIx = sum; + sum += entry.second.copyCount; + } + return sum; + }; + gpuObjUniqueCopyGroupIDs.resize(exclScanConvReqs()); + // + dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + { + if (created.gpuObj) + return; + auto found = contentHashToCanonical.find(created.contentHash); + // may not find things because of unconverted dummy deps + if (found!=contentHashToCanonical.end()) + gpuObjUniqueCopyGroupIDs[found->second.firstCopyIx++] = instance.uniqueCopyGroupID; + else + { + inputs->logger.log( + "No conversion request made for Asset %p in group %d, it's impossible to convert.", + system::ILogger::ELL_ERROR,instance.asset,instance.uniqueCopyGroupID + ); + } + } + ); + // `{conversionRequests}.firstCopyIx` needs to be brought back down to exclusive scan form + exclScanConvReqs(); + } + + // we now know the size of our output array + gpuObjects.resize(gpuObjUniqueCopyGroupIDs.size()); + } + + // + template + void assign(const core::blake3_hash_t& contentHash, const size_t baseIx, const size_t copyIx, asset_cached_t::type&& gpuObj, const AssetType* asset=nullptr) + { + const auto hashAsU64 = reinterpret_cast(contentHash.data); + if constexpr (GPUObjectWhollyImmutable) // including any deps! 
+ if (copyIx==1) // Only warn once to reduce log spam + inputs->logger.log( + "Why are you creating multiple Objects for asset content %8llx%8llx%8llx%8llx, when they are a readonly GPU Object Type with no dependants!?", + system::ILogger::ELL_PERFORMANCE,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + // + if (!gpuObj) + { + inputs->logger.log( + "Failed to create GPU Object for asset content %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + return; + } + auto output = gpuObjects.data()+copyIx+baseIx; + output->value = std::move(gpuObj); + const uint64_t uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[copyIx+baseIx]; + if constexpr (std::is_same_v || std::is_same_v) + { + const auto constrainMask = inputs->constrainMemoryTypeBits(uniqueCopyGroupID,asset,contentHash,output->value.get()); + if (!deferredAllocator->request(output,constrainMask)) + return; + } + + if constexpr (!std::is_same_v) + { + // set debug names on everything + setDebugName(conv,output->get(),contentHash,uniqueCopyGroupID); + } + } + + // Since the dfsCache has the original asset pointers as keys, we map in reverse (multiple `instance_t` can map to the same unique content hash and GPU object) + void propagateToCaches(dfs_cache& dfsCache, CAssetConverter::SReserveResult::staging_cache_t& stagingCache) + { + assert(gpuObjUniqueCopyGroupIDs.empty()); + dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + { + // already found in read cache and not converted + if (created.gpuObj) + return; + + const auto uniqueCopyGroupID = instance.uniqueCopyGroupID; + const auto& contentHash = created.contentHash; + const auto hashAsU64 = reinterpret_cast(contentHash.data); + + auto found = contentHashToCanonical.find(contentHash); + // can happen if deps were unconverted dummies + if (found==contentHashToCanonical.end()) + { + if (contentHash!=CAssetConverter::CHashCache::NoContentHash) + inputs->logger.log( + "Could not find GPU Object for Asset %p in group %ull with Content Hash %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR, instance.asset, uniqueCopyGroupID, hashAsU64[0], hashAsU64[1], hashAsU64[2], hashAsU64[3] + ); + return; + } + // unhashables were not supposed to be added to conversion requests + assert(contentHash!=CAssetConverter::CHashCache::NoContentHash); + + const auto copyIx = found->second.firstCopyIx++; + auto& gpuObj = gpuObjects[copyIx]; + if (!gpuObj) + { + inputs->logger.log( + "Creation of GPU Object (or its dependents) for Content Hash %8llx%8llx%8llx%8llx Copy Index %d from Canonical Asset %p Failed.", + system::ILogger::ELL_ERROR, hashAsU64[0], hashAsU64[1], hashAsU64[2], hashAsU64[3], copyIx, found->second.canonicalAsset + ); + return; + } + // insert into staging cache + stagingCache.emplace(gpuObj.get(),CAssetConverter::SReserveResult::staging_cache_key{gpuObj.value,typename CAssetConverter::CCache::key_t(contentHash,uniqueCopyGroupID)}); + // propagate back to dfsCache + created.gpuObj = std::move(gpuObj); + } + ); + } + + const CAssetConverter* conv; + const CAssetConverter::SInputs* inputs; + MetaDeviceMemoryAllocator* deferredAllocator; + core::unordered_map> contentHashToCanonical; + core::vector gpuObjUniqueCopyGroupIDs; + core::vector> gpuObjects; +}; + // auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { @@ -2486,289 +2709,157 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // BLAS and TLAS creation is somewhat delayed by buffer creation and 
allocation struct DeferredASCreationParams { - asset_cached_t storage; - size_t scratchSize : 62 = 0; - size_t motionBlur : 1 = false; - size_t compactAfterBuild : 1 = false; -#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION - size_t inputSize = 0; - uint32_t maxInstanceCount = 0; -#endif + asset_cached_t storage = {}; + uint64_t scratchSize = 0; + uint64_t buildSize = 0; }; core::vector accelerationStructureParams[2]; // Deduplication, Creation and Propagation - auto dedupCreateProp = [&]()->void + auto dedupCreateProp = [&]()->conversions_t { - auto& dfsCache = std::get>(dfsCaches); // This map contains the assets by-hash, identical asset+patch hash the same. - conversions_t conversionRequests; + // It only has entries for GPU objects that need to be created + conversions_t conversionRequests = {this,&inputs,&deferredAllocator}; - // We now go through the dfsCache and work out each entry's content hashes, so that we can carry out unique conversions. + // const CCache* readCache = inputs.readCache ? (&std::get>(inputs.readCache->m_caches)):nullptr; - dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + conversionRequests.gather(dfsCaches,retval.m_hashCache.get(),readCache); + + // + GetDependantVisitBase visitBase = { + .inputs = inputs, + .dfsCaches = dfsCaches + }; + + // Dispatch to correct creation of GPU objects + auto& dfsCache = std::get>(dfsCaches); + if constexpr (std::is_same_v) + { + for (auto& entry : conversionRequests.contentHashToCanonical) + for (auto i=0ull; i(entry.first,entry.second.firstCopyIx,i,device->createSampler(entry.second.canonicalAsset->getParams())); + } + if constexpr (std::is_same_v) + { + for (auto& entry : conversionRequests.contentHashToCanonical) + for (auto i=0ull; ihash( - {instance.asset,&created.patch}, - &patchOverride, - /*.mistrustLevel =*/ 1 - ); - // failed to hash all together (only possible reason is failure of `PatchGetter` to provide a valid patch) - if (contentHash==CHashCache::NoContentHash) - { - inputs.logger.log("Could not compute hash for asset %p in group %d, maybe an IPreHashed dependant's content hash is missing?",system::ILogger::ELL_ERROR,instance.asset,instance.uniqueCopyGroupID); - return; - } - const auto hashAsU64 = reinterpret_cast(contentHash.data); + const ICPUBuffer* asset = entry.second.canonicalAsset; + const auto& patch = dfsCache.nodes[entry.second.patchIndex.value].patch; + // + IGPUBuffer::SCreationParams params = {}; + params.size = asset->getSize(); + params.usage = patch.usage; + // concurrent ownership if any + const auto outIx = i+entry.second.firstCopyIx; + const auto uniqueCopyGroupID = conversionRequests.gpuObjUniqueCopyGroupIDs[outIx]; + const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,asset,patch); + params.queueFamilyIndexCount = queueFamilies.size(); + params.queueFamilyIndices = queueFamilies.data(); + // if creation successful, we will request some memory allocation to bind to, and if thats okay we preliminarily request a conversion + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createBuffer(std::move(params)),asset); + } + } + if constexpr (std::is_same_v || std::is_same_v) + { + using mem_prop_f = IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS; + const auto deviceBuildMemoryTypes = device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(mem_prop_f::EMPF_DEVICE_LOCAL_BIT); + const auto hostBuildMemoryTypes = 
device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(mem_prop_f::EMPF_DEVICE_LOCAL_BIT|mem_prop_f::EMPF_HOST_WRITABLE_BIT|mem_prop_f::EMPF_HOST_CACHED_BIT); + + constexpr bool IsTLAS = std::is_same_v; + accelerationStructureParams[IsTLAS].resize(conversionRequests.gpuObjects.size()); + for (auto& entry : conversionRequests.contentHashToCanonical) + for (auto i=0ull; ivoid { - inputs.logger.log("Asset (%p,%d) has hash %8llx%8llx%8llx%8llx",system::ILogger::ELL_DEBUG,instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3]); - } - // if we have a read cache, lets retry looking the item up! - if (readCache) + // account for fragmentation and misalignment + buildSize += hlsl::max(size,minScratchAllocSize)+hlsl::max(minScratchAllocSize,alignment)*2; + }; + ILogicalDevice::AccelerationStructureBuildSizes sizes = {}; + const auto hashAsU64 = reinterpret_cast(entry.first.data); { - // We can't look up "near misses" (supersets of patches) because they'd have different hashes - // and we can't afford to split hairs like finding overlapping buffer ranges, etc. - // Stuff like that would require a completely different hashing/lookup strategy (or multiple fake entries). - const auto found = readCache->find({contentHash,instance.uniqueCopyGroupID}); - if (found!=readCache->forwardMapEnd()) - { - created.gpuObj = found->second; - inputs.logger.log( - "Asset (%p,%d) with hash %8llx%8llx%8llx%8llx found its GPU Object in Read Cache",system::ILogger::ELL_DEBUG, - instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return; - } - } - // The conversion request we insert needs an instance asset whose unconverted dependencies don't have missing content - // SUPER SIMPLIFICATION: because we hash and search for readCache items bottom up (BFS), we don't need a stack (DFS) here! - // Any dependant that's not getting a GPU object due to missing content or GPU cache object for its cache, will show up later during `getDependant` - // An additional optimization would be to improve the `PatchGetter` to check dependants (only deps) during hashing for missing dfs cache gpu Object (no read cache) and no conversion request. - auto* isPrehashed = dynamic_cast(instance.asset); - if (isPrehashed && isPrehashed->missingContent()) - { - inputs.logger.log( - "PreHashed Asset (%p,%d) with hash %8llx%8llx%8llx%8llx has missing content and no GPU Object in Read Cache!",system::ILogger::ELL_ERROR, - instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return; - } - // then de-duplicate the conversions needed - const patch_index_t patchIx = {static_cast(std::distance(dfsCache.nodes.data(),&created))}; - auto [inSetIt,inserted] = conversionRequests.emplace(contentHash,unique_conversion_t{.canonicalAsset=instance.asset,.patchIndex=patchIx}); - if (!inserted) - { - // If an element prevented insertion, the patch must be identical! - // Because the conversions don't care about groupIDs, the patches may be identical but not the same object in memory. 
- assert(inSetIt->second.patchIndex==patchIx || dfsCache.nodes[inSetIt->second.patchIndex.value].patch==dfsCache.nodes[patchIx.value].patch); - inSetIt->second.copyCount++; - } - } - ); - - // work out mapping of `conversionRequests` to multiple GPU objects and their copy groups via counting sort - const auto gpuObjUniqueCopyGroupIDs = [&]()->core::vector - { - core::vector retval; - // now assign storage offsets via exclusive scan and put the `uniqueGroupID` mappings in sorted order - auto exclScanConvReqs = [&]()->size_t - { - size_t sum = 0; - for (auto& entry : conversionRequests) - { - entry.second.firstCopyIx = sum; - sum += entry.second.copyCount; - } - return sum; - }; - retval.resize(exclScanConvReqs()); - // - dfsCache.for_each([&inputs,&retval,&conversionRequests](const instance_t& instance, dfs_cache::created_t& created)->void - { - if (created.gpuObj) - return; - auto found = conversionRequests.find(created.contentHash); - // may not find things because of unconverted dummy deps - if (found!=conversionRequests.end()) - retval[found->second.firstCopyIx++] = instance.uniqueCopyGroupID; - else - { - inputs.logger.log( - "No conversion request made for Asset %p in group %d, its impossible to convert.", - system::ILogger::ELL_ERROR,instance.asset,instance.uniqueCopyGroupID - ); - } - } - ); - // `{conversionRequests}.firstCopyIx` needs to be brought back down to exclusive scan form - exclScanConvReqs(); - return retval; - }(); - - core::vector> gpuObjects(gpuObjUniqueCopyGroupIDs.size()); - // Only warn once to reduce log spam - auto assign = [&](const core::blake3_hash_t& contentHash, const size_t baseIx, const size_t copyIx, asset_cached_t::type&& gpuObj)->bool - { - const auto hashAsU64 = reinterpret_cast(contentHash.data); - if constexpr (GPUObjectWhollyImmutable) // including any deps! 
- if (copyIx==1) - inputs.logger.log( - "Why are you creating multiple Objects for asset content %8llx%8llx%8llx%8llx, when they are a readonly GPU Object Type with no dependants!?", - system::ILogger::ELL_PERFORMANCE,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - // - if (!gpuObj) - { - inputs.logger.log( - "Failed to create GPU Object for asset content %8llx%8llx%8llx%8llx", - system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return false; - } - gpuObjects[copyIx+baseIx].value = std::move(gpuObj); - return true; - }; - - GetDependantVisitBase visitBase = { - .inputs = inputs, - .dfsCaches = dfsCaches - }; - // Dispatch to correct creation of GPU objects - if constexpr (std::is_same_v) - { - for (auto& entry : conversionRequests) - for (auto i=0ull; i(entry.first,entry.second.firstCopyIx,i,device->createSampler(entry.second.canonicalAsset->getParams())); - } - if constexpr (std::is_same_v) - { - for (auto& entry : conversionRequests) - for (auto i=0ull; igetSize(); - params.usage = patch.usage; - // concurrent ownership if any - const auto outIx = i+entry.second.firstCopyIx; - const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; - const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,entry.second.canonicalAsset,patch); - params.queueFamilyIndexCount = queueFamilies.size(); - params.queueFamilyIndices = queueFamilies.data(); - // if creation successful, we will upload - assign(entry.first,entry.second.firstCopyIx,i,device->createBuffer(std::move(params))); - } - } - if constexpr (std::is_same_v || std::is_same_v) - { - using mem_prop_f = IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS; - const auto deviceBuildMemoryTypes = device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(mem_prop_f::EMPF_DEVICE_LOCAL_BIT); - const auto hostBuildMemoryTypes = device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(mem_prop_f::EMPF_DEVICE_LOCAL_BIT|mem_prop_f::EMPF_HOST_WRITABLE_BIT|mem_prop_f::EMPF_HOST_CACHED_BIT); - - constexpr bool IsTLAS = std::is_same_v; - accelerationStructureParams[IsTLAS].resize(gpuObjects.size()); - for (auto& entry : conversionRequests) - for (auto i=0ull; iusesMotion(); - ILogicalDevice::AccelerationStructureBuildSizes sizes = {}; -#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION - // we will need to temporarily store the build input buffers somewhere - size_t inputSize = 0; - { - const auto buildFlags = patch.getBuildFlags(as); if constexpr (IsTLAS) { - AssetVisitor> visitor = { - {visitBase}, - {asset,uniqueCopyGroupID}, - patch - }; - if (!visitor()) - continue; + // TLAS can't check for the BLASes existing yet, because they haven't had their backing buffers allocated yet const auto instanceCount = as->getInstances().size(); sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,instanceCount); - inputSize = (motionBlur ? sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance):sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance))*instanceCount; + // all instances need to be aligned to 16 bytes so alignment irrelevant (everything can be tightly packed) and implicit + const uint64_t worstCaseInstanceSize = motionBlur ? 
IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance::LargestUnionMemberSize:sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); + // worst case approximation is fine here + incrementBuildSize(worstCaseInstanceSize*instanceCount,16); + incrementBuildSize(sizeof(uint64_t)*instanceCount,alignof(uint64_t)); } else { - const uint32_t* pMaxPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); - // the code here is not pretty, but DRY-ing is of this is for later + const uint32_t* pPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); if (buildFlags.hasFlags(ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) { const auto geoms = as->getAABBGeometries(); - if (patch.hostBuild) - { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() - }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); - } - else - { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() - }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); - // TODO: check if the strides need to be aligned to 4 bytes for AABBs - for (const auto& geom : geoms) - if (const auto aabbCount=*(pMaxPrimitiveCounts++); aabbCount) - inputSize = core::roundUp(inputSize,sizeof(float))+aabbCount*geom.stride; - } + sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,geoms,pPrimitiveCounts); + for (const auto& geom : geoms) + if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) + incrementBuildSize(aabbCount*geom.stride,alignof(float)); } else { - core::map allocationsPerStride; const auto geoms = as->getTriangleGeometries(); - if (patch.hostBuild) - { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() - }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); - } - else + sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,geoms,pPrimitiveCounts); + for (const auto& geom : geoms) + if (const auto triCount=*(pPrimitiveCounts++); triCount) { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() - }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); - // TODO: check if the strides need to be aligned to 4 bytes for AABBs - for (const auto& geom : geoms) - if (const auto triCount=*(pMaxPrimitiveCounts++); triCount) + auto size = geom.vertexStride*(geom.vertexData[1] ? 
2:1)*(geom.maxVertex+1); + uint16_t alignment = hlsl::max(0x1u<(alignof(float),alignment); + } + uint16_t indexSize = 0; + switch (geom.indexType) + { + case E_INDEX_TYPE::EIT_16BIT: + indexSize = sizeof(uint16_t); + break; + case E_INDEX_TYPE::EIT_32BIT: + indexSize = sizeof(uint32_t); + break; + default: + break; + } + if (indexSize) + { + size = core::alignUp(size,indexSize)+triCount*3*indexSize; + alignment = hlsl::max(indexSize,alignment); } + //inputs.logger.log("%p Triangle Data Size %d Align %d",system::ILogger::ELL_DEBUG,as,size,alignment); + incrementBuildSize(size,alignment); } - for (const auto& entry : allocationsPerStride) - inputSize = core::roundUp(inputSize,entry.first)+entry.first*entry.second; } } } - if (!sizes) + if (buildSize==0 || sizes.buildScratchSize==0) + { + inputs.logger.log( + "Build Size Input is 0 or failed the call to `ILogicalDevice::getAccelerationStructureBuildSizes` for Acceleration Structure %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); continue; -#endif + } + // + incrementBuildSize(sizes.buildScratchSize,device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment); + //inputs.logger.log("%p Scratch Size %d Combined %d",system::ILogger::ELL_DEBUG,as,sizes.buildScratchSize,buildSize); + // we need to save the buffer in a side-channel for later auto& out = accelerationStructureParams[IsTLAS][entry.second.firstCopyIx+i]; // this is where it gets a bit weird, we need to create a buffer to back the acceleration structure @@ -2778,23 +2869,24 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult params.size = core::roundUp(sizes.accelerationStructureSize,MinASBufferAlignment); params.usage = IGPUBuffer::E_USAGE_FLAGS::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT|IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; // concurrent ownership if any - const auto outIx = i + entry.second.firstCopyIx; - const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,as,patch); params.queueFamilyIndexCount = queueFamilies.size(); params.queueFamilyIndices = queueFamilies.data(); out.storage.value = device->createBuffer(std::move(params)); + if (out.storage) + { + nbl::video::setDebugName(this,out.storage.value.get(),entry.first,uniqueCopyGroupID); + if (!deferredAllocator.request(&out.storage,patch.hostBuild ? hostBuildMemoryTypes:deviceBuildMemoryTypes)) + continue; + } } out.scratchSize = sizes.buildScratchSize; - out.motionBlur = motionBlur; - out.compactAfterBuild = patch.compactAfterBuild; - if (out.storage && !deferredAllocator.request(&out.storage,patch.hostBuild ? 
hostBuildMemoryTypes:deviceBuildMemoryTypes)) - out.storage.value = nullptr; + out.buildSize = buildSize; } } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; i SReserveResult } // concurrent ownership if any const auto outIx = i+entry.second.firstCopyIx; - const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; + const auto uniqueCopyGroupID = conversionRequests.gpuObjUniqueCopyGroupIDs[outIx]; const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,asset,patch); params.queueFamilyIndexCount = queueFamilies.size(); params.queueFamilyIndices = queueFamilies.data(); // gpu image specifics params.tiling = static_cast(patch.linearTiling); params.preinitialized = false; - // if creation successful, we check what queues we need if uploading - if (assign(entry.first,entry.second.firstCopyIx,i,device->createImage(std::move(params))) && !asset->getRegions().empty()) - { - // for now until host_image_copy - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - // Best effort guess, without actually looking at all regions - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 - if (isDepthOrStencilFormat(patch.format) && (patch.usageFlags|patch.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; - // only if we upload some data can we recompute the mips - if (patch.recomputeMips) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; - } + // if creation successful, we will request some memory allocation to bind to + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createImage(std::move(params)),asset); } } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUBufferView* asset = entry.second.canonicalAsset; const auto& patch = dfsCache.nodes[entry.second.patchIndex.value].patch; for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -2900,13 +2981,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (!visitor()) continue; // no format promotion for buffer views - assign(entry.first,entry.second.firstCopyIx,i,device->createBufferView(visitor.underlying,asset->getFormat())); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createBufferView(visitor.underlying,asset->getFormat())); } } } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUImageView* asset = entry.second.canonicalAsset; const auto& cpuParams = asset->getCreationParameters(); @@ -2914,7 +2995,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -2943,7 +3024,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // if underlying image had mip-chain extended then we extend our own if (imageParams.mipLevels!=visitor.oldMipCount) params.subresourceRange.levelCount = imageParams.mipLevels-params.subresourceRange.baseMipLevel; - assign(entry.first,entry.second.firstCopyIx,i,device->createImageView(std::move(params))); + 
conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createImageView(std::move(params))); } } } @@ -2955,22 +3036,18 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult .writeCache = inputs.writeShaderCache }; - // no one depend on the converted IShaders so we need to hold a smart ptr into them somewhere. - // This is to prevent m_stagingCache to hold a dangling pointer into IShader - retval.m_shaders.reserve(gpuObjUniqueCopyGroupIDs.size()); - - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; icompileShader(createParams); retval.m_shaders.push_back(shader); - assign(entry.first,entry.second.firstCopyIx,i,std::move(shader)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(shader)); } } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUDescriptorSetLayout* asset = entry.second.canonicalAsset; // there is no patching possible for this asset @@ -3019,7 +3096,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { { @@ -3031,7 +3108,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }; if (!visitor()) continue; - assign(entry.first,entry.second.firstCopyIx,i,device->createDescriptorSetLayout(bindings)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createDescriptorSetLayout(bindings)); } } } @@ -3039,7 +3116,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { core::vector pcRanges; pcRanges.reserve(CSPIRVIntrospector::MaxPushConstantsSize); - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUPipelineLayout* asset = entry.second.canonicalAsset; const auto& patch = dfsCache.nodes[entry.second.patchIndex.value].patch; @@ -3074,7 +3151,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3083,13 +3160,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (!visitor()) continue; auto layout = device->createPipelineLayout(pcRanges,std::move(visitor.dsLayouts[0]),std::move(visitor.dsLayouts[1]),std::move(visitor.dsLayouts[2]),std::move(visitor.dsLayouts[3])); - assign(entry.first,entry.second.firstCopyIx,i,std::move(layout)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(layout)); } } } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUPipelineCache* asset = entry.second.canonicalAsset; // there is no patching possible for this asset @@ -3097,20 +3174,20 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // since we don't have dependants we don't care about our group ID // we create threadsafe pipeline caches, because we have no idea how they may be used - assign.template operator()(entry.first,entry.second.firstCopyIx,i,device->createPipelineCache(asset,false)); + conversionRequests.template assign(entry.first,entry.second.firstCopyIx,i,device->createPipelineCache(asset,false)); } } } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUComputePipeline* asset = entry.second.canonicalAsset; // there 
is no patching possible for this asset for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3120,21 +3197,22 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult continue; // ILogicalDevice::createComputePipelines is rather aggressive on the spec constant validation, so we create one pipeline at a time core::smart_refctd_ptr ppln; + IGPUPipelineBase::SShaderEntryMap entryMap; { // no derivatives, special flags, etc. IGPUComputePipeline::SCreationParams params = {}; params.layout = visitor.layout; // while there are patches possible for shaders, the only patch which can happen here is changing a stage from UNKNOWN to COMPUTE - params.shader = visitor.getSpecInfo(IShader::E_SHADER_STAGE::ESS_COMPUTE); + params.shader = IGPUPipelineBase::SShaderSpecInfo::create(visitor.getSpecInfo(), &entryMap); device->createComputePipelines(inputs.pipelineCache,{¶ms,1},&ppln); } - assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); } } } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPURenderpass* asset = entry.second.canonicalAsset; // there is no patching possible for this asset @@ -3142,22 +3220,20 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // since we don't have dependants we don't care about our group ID // we create threadsafe pipeline caches, because we have no idea how they may be used - assign.template operator()(entry.first,entry.second.firstCopyIx,i,device->createRenderpass(asset->getCreationParameters())); + conversionRequests.template assign(entry.first,entry.second.firstCopyIx,i,device->createRenderpass(asset->getCreationParameters())); } } } if constexpr (std::is_same_v) { - core::vector tmpSpecInfo; - tmpSpecInfo.reserve(5); - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUGraphicsPipeline* asset = entry.second.canonicalAsset; // there is no patching possible for this asset for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3170,24 +3246,28 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // no derivatives, special flags, etc. 
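The per-stage entry maps introduced in this hunk presumably keep the specialization-constant data referenced by the created spec infos alive until the pipeline is actually built. A rough raw-Vulkan illustration of that flattening (an assumed layout, not the IGPUPipelineBase::SShaderEntryMap type):

#include <vulkan/vulkan.h>
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

struct FlattenedSpecInfo
{
    std::vector<VkSpecializationMapEntry> entries;
    std::vector<std::byte> data;
    VkSpecializationInfo info{};
};

// turn a map of constant ID -> raw bytes into the structure pipeline creation consumes;
// the vectors must stay alive until the vkCreate*Pipelines call that reads them returns
FlattenedSpecInfo flatten(const std::map<std::uint32_t,std::vector<std::byte>>& constants)
{
    FlattenedSpecInfo out;
    for (const auto& [id,bytes] : constants)
    {
        out.entries.push_back({id,static_cast<std::uint32_t>(out.data.size()),bytes.size()});
        out.data.insert(out.data.end(),bytes.begin(),bytes.end());
    }
    out.info = {
        .mapEntryCount = static_cast<std::uint32_t>(out.entries.size()),
        .pMapEntries = out.entries.data(),
        .dataSize = out.data.size(),
        .pData = out.data.data()
    };
    return out;
}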
IGPUGraphicsPipeline::SCreationParams params = {}; + using SShaderEntryMap = IGPUPipelineBase::SShaderEntryMap; + SShaderEntryMap vertexEntryMap; + SShaderEntryMap tesselationControlEntryMap; + SShaderEntryMap tesselationEvaluationEntryMap; + SShaderEntryMap geometryEntryMap; + SShaderEntryMap fragmentEntryMap; bool depNotFound = false; { params.layout = visitor.layout; params.renderpass = visitor.renderpass; // while there are patches possible for shaders, the only patch which can happen here is changing a stage from UNKNOWN to match the slot here - tmpSpecInfo.clear(); using stage_t = hlsl::ShaderStage; - for (stage_t stage : {stage_t::ESS_VERTEX,stage_t::ESS_TESSELLATION_CONTROL,stage_t::ESS_TESSELLATION_EVALUATION,stage_t::ESS_GEOMETRY,stage_t::ESS_FRAGMENT}) - { - auto& info = visitor.getSpecInfo(stage); - if (info.shader) - tmpSpecInfo.push_back(std::move(info)); - } - params.shaders = tmpSpecInfo; + using GPUShaderSpecInfo = IGPUPipelineBase::SShaderSpecInfo; + params.vertexShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_VERTEX), &vertexEntryMap); + params.tesselationControlShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_CONTROL), &tesselationControlEntryMap); + params.tesselationEvaluationShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_EVALUATION), &tesselationEvaluationEntryMap); + params.geometryShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_GEOMETRY), &geometryEntryMap); + params.fragmentShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_FRAGMENT), &fragmentEntryMap); } params.cached = asset->getCachedCreationParams(); device->createGraphicsPipelines(inputs.pipelineCache,{¶ms,1},&ppln); - assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); } } } @@ -3198,13 +3278,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // Descriptor Pools have large up-front slots reserved for all descriptor types, if we were to merge // multiple descriptor sets to be allocated from one pool, dropping any set wouldn't result in the // reclamation of the memory used, it would at most (with the FREE pool create flag) return to pool. 
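Given the pooling rationale spelled out above, each descriptor set gets its own exactly-sized pool so dropping the set releases all of its descriptor memory at once. A rough raw-Vulkan equivalent (not the Nabla IGPUDescriptorPool wrapper), assuming the pool sizes were already tallied per descriptor type from the set layout:

#include <vulkan/vulkan.h>
#include <cstdint>

// one pool per descriptor set: reclaiming memory is then just destroying (or resetting) the pool
VkDescriptorPool createPoolForSingleSet(VkDevice device, const VkDescriptorPoolSize* sizes, std::uint32_t sizeCount)
{
    VkDescriptorPoolCreateInfo info = {};
    info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
    info.maxSets = 1;               // exactly one set will ever be allocated from this pool
    info.poolSizeCount = sizeCount; // counts per descriptor type, derived from the layout
    info.pPoolSizes = sizes;
    VkDescriptorPool pool = VK_NULL_HANDLE;
    return vkCreateDescriptorPool(device,&info,nullptr,&pool)==VK_SUCCESS ? pool : VK_NULL_HANDLE;
}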
- for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUDescriptorSet* asset = entry.second.canonicalAsset; for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3229,196 +3309,153 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult ds = nullptr; } else - retval.m_deferredTLASDescriptorWrites.insert(visitor.deferredTLASWrites.begin(),visitor.deferredTLASWrites.end()); + for (const auto storageIx : visitor.potentialTLASRewrites) + retval.m_potentialTLASRewrites.insert({ds.get(),storageIx}); } else inputs.logger.log("Failed to create Descriptor Pool suited for Layout %s",system::ILogger::ELL_ERROR,layout->getObjectDebugName()); - assign(entry.first,entry.second.firstCopyIx,i,std::move(ds)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(ds)); } } } - // Propagate the results back, since the dfsCache has the original asset pointers as keys, we map in reverse - // This gets deferred till AFTER the Buffer Memory Allocations and Binding for Acceleration Structures - if constexpr (!std::is_same_v && !std::is_same_v) - dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void - { - auto& stagingCache = std::get>(retval.m_stagingCaches); - // already found in read cache and not converted - if (created.gpuObj) - return; - - const auto& contentHash = created.contentHash; - auto found = conversionRequests.find(contentHash); - - const auto uniqueCopyGroupID = instance.uniqueCopyGroupID; - - const auto hashAsU64 = reinterpret_cast(contentHash.data); - // can happen if deps were unconverted dummies - if (found==conversionRequests.end()) - { - if (contentHash!=CHashCache::NoContentHash) - inputs.logger.log( - "Could not find GPU Object for Asset %p in group %ull with Content Hash %8llx%8llx%8llx%8llx", - system::ILogger::ELL_ERROR,instance.asset,uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return; - } - // unhashables were not supposed to be added to conversion requests - assert(contentHash!=CHashCache::NoContentHash); - - const auto copyIx = found->second.firstCopyIx++; - // the counting sort was stable - assert(uniqueCopyGroupID==gpuObjUniqueCopyGroupIDs[copyIx]); - - auto& gpuObj = gpuObjects[copyIx]; - if (!gpuObj) - { - inputs.logger.log( - "Conversion for Content Hash %8llx%8llx%8llx%8llx Copy Index %d from Canonical Asset %p Failed.", - system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3],copyIx,found->second.canonicalAsset - ); - return; - } - // set debug names on everything! 
- { - std::ostringstream debugName; - debugName << "Created by Converter "; - debugName << std::hex; - debugName << this; - debugName << " from Asset with hash "; - for (const auto& byte : contentHash.data) - debugName << uint32_t(byte) << " "; - debugName << "for Group " << uniqueCopyGroupID; - - // IShader is ethereal not really a persistent gpu object - if constexpr (std::is_base_of_v) - gpuObj.get()->setObjectDebugName(debugName.str().c_str()); - } - // insert into staging cache - stagingCache.emplace(gpuObj.get(),typename CCache::key_t(contentHash,uniqueCopyGroupID)); - // propagate back to dfsCache - created.gpuObj = std::move(gpuObj); - // record if a device memory allocation will be needed - if constexpr (std::is_base_of_v::video_t>) - { - const auto constrainMask = inputs.constrainMemoryTypeBits(uniqueCopyGroupID,instance.asset,contentHash,created.gpuObj.get()); - if (!deferredAllocator.request(&created.gpuObj,constrainMask)) - { - created.gpuObj.value = nullptr; - return; - } - } - // - if constexpr (std::is_same_v) - retval.m_bufferConversions.emplace_back(SReserveResult::SConvReqBuffer{core::smart_refctd_ptr(instance.asset),created.gpuObj.get()}); - if constexpr (std::is_same_v) - { - const uint16_t recomputeMips = created.patch.recomputeMips; - retval.m_imageConversions.emplace_back(SReserveResult::SConversionRequestBase{core::smart_refctd_ptr(instance.asset),created.gpuObj.get()},recomputeMips); - } -// TODO: BLAS and TLAS requests - } - ); - + // clear what we don't need + if constexpr (!std::is_base_of_v) + conversionRequests.gpuObjUniqueCopyGroupIDs.clear(); + // This gets deferred till AFTER the Buffer Memory Allocations and Binding + if constexpr (!std::is_base_of_v && !std::is_base_of_v::video_t>) + { + conversionRequests.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); + return {}; + } + return conversionRequests; }; - // The order of these calls is super important to go BOTTOM UP in terms of hashing and conversion dependants. - // Both so we can hash in O(Depth) and not O(Depth^2) but also so we have all the possible dependants ready. - // If two Asset chains are independent then we order them from most catastrophic failure to least. 
- dedupCreateProp.template operator()(); - dedupCreateProp.template operator()(); - dedupCreateProp.template operator()(); - dedupCreateProp.template operator()(); - // now allocate the memory for buffers and images - deferredAllocator.finalize(); - - // can remove buffers from conversion requests which can be written to directly - { - core::vector flushRanges; - flushRanges.reserve(retval.m_bufferConversions.size()); - std::erase_if(retval.m_bufferConversions,[&flushRanges](const SReserveResult::SConvReqBuffer& conv)->bool - { - const auto boundMemory = conv.gpuObj->getBoundMemory(); - auto* const memory = boundMemory.memory; - if (!boundMemory.memory->isMappable()) - return false; - const size_t size = conv.gpuObj->getSize(); - const IDeviceMemoryAllocation::MemoryRange range = {boundMemory.offset,size}; - // slightly inefficient but oh well - void* dst = memory->map(range,IDeviceMemoryAllocation::EMCAF_WRITE); - memcpy(dst,conv.canonical->getPointer(),size); - if (boundMemory.memory->haveToMakeVisible()) - flushRanges.emplace_back(memory,range.offset,range.length,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); - return true; - } - ); - if (!flushRanges.empty()) - device->flushMappedMemoryRanges(flushRanges); - if (!retval.m_bufferConversions.empty()) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - } - // Deal with Deferred Creation of Acceleration structures + // scope so the conversion requests go our of scope early { - for (auto asLevel=0; asLevel<2; asLevel++) + // The order of these calls is super important to go BOTTOM UP in terms of hashing and conversion dependants. + // Both so we can hash in O(Depth) and not O(Depth^2) but also so we have all the possible dependants ready. + // If two Asset chains are independent then we order them from most catastrophic failure to least. + auto bufferConversions = dedupCreateProp.template operator()(); + auto blasConversions = dedupCreateProp.template operator()(); + auto tlasConversions = dedupCreateProp.template operator()(); + auto imageConversions = dedupCreateProp.template operator()(); + // now allocate the memory for buffers and images + deferredAllocator.finalize(); + + // enqueue successfully created buffers for conversion + for (auto& entry : bufferConversions.contentHashToCanonical) + for (auto i=0ull; i(entry.second.canonicalAsset)}); + assert(inserted); + } + bufferConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); + // Deal with Deferred Creation of Acceleration structures { - // each of these stages must have a barrier inbetween - size_t scratchSizeFullParallelBuild = 0; - size_t scratchSizeFullParallelCompact = 0; - // we collect that stats AFTER making sure that the BLAS / TLAS can actually be created - for (const auto& deferredParams : accelerationStructureParams[asLevel]) + auto createAccelerationStructures = [&](conversions_t& requests)->void { - // buffer failed to create/allocate - if (!deferredParams.storage) - continue; -#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION - IGPUAccelerationStructure::SCreationParams baseParams; - { - auto* buf = deferredParams.storage.get(); - const auto bufSz = buf->getSize(); - using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; - baseParams = { - .bufferRange = {.offset=0,.size=bufSz,.buffer=smart_refctd_ptr(buf)}, - .flags = deferredParams.motionBlur ? 
create_f::MOTION_BIT:create_f::NONE - }; - } - smart_refctd_ptr as; - if (asLevel) - { - as = device->createBottomLevelAccelerationStructure({baseParams,deferredParams.maxInstanceCount}); - } + constexpr bool IsTLAS = std::is_same_v; + // + std::conditional_t* pConversions; + if constexpr (IsTLAS) + pConversions = retval.m_tlasConversions; else + pConversions = retval.m_blasConversions; + // we enqueue the conversions AFTER making sure that the BLAS / TLAS can actually be created + for (auto& entry : requests.contentHashToCanonical) + for (auto i=0ull; icreateTopLevelAccelerationStructure({baseParams,deferredParams.maxInstanceCount}); + const auto reqIx = entry.second.firstCopyIx+i; + if (const auto& deferredParams=accelerationStructureParams[IsTLAS][reqIx]; deferredParams.storage) + { + const auto* canonical = entry.second.canonicalAsset; + const auto& dfsNode = std::get>(dfsCaches).nodes[entry.second.patchIndex.value]; + const auto& patch = dfsNode.patch; + // create the AS + const auto bufSz = deferredParams.storage.get()->getSize(); + IGPUAccelerationStructure::SCreationParams baseParams; + { + using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; + baseParams = { + .bufferRange = {.offset=0,.size=bufSz,.buffer=deferredParams.storage.value}, + .flags = patch.isMotion ? create_f::MOTION_BIT:create_f::NONE + }; + } + smart_refctd_ptr::video_t> as; + CAssetConverter::SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t blasInstanceMap; + if constexpr (IsTLAS) + { + // check if the BLASes we want to use for the instances were successfully allocated and created + AssetVisitor> visitor = { + {inputs,dfsCaches,&blasInstanceMap}, + {canonical,requests.gpuObjUniqueCopyGroupIDs[reqIx]}, + patch + }; + if (!visitor()) + { + const auto hashAsU64 = reinterpret_cast(entry.first.data); + inputs.logger.log( + "Failed to find all GPU Bottom Level Acceleration Structures needed to build TLAS %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + continue; + } + as = device->createTopLevelAccelerationStructure({std::move(baseParams),patch.maxInstances}); + } + else + as = device->createBottomLevelAccelerationStructure(std::move(baseParams)); + if (!as) + { + inputs.logger.log("Failed to Create Acceleration Structure.",system::ILogger::ELL_ERROR); + continue; + } + // file the request for conversion + auto& request = pConversions[patch.hostBuild][as.get()]; + request.canonical = smart_refctd_ptr(canonical); + request.scratchSize = deferredParams.scratchSize; + request.compact = patch.compactAfterBuild; + request.buildFlags = static_cast(patch.getBuildFlags(canonical).value); + request.buildSize = deferredParams.buildSize; + if constexpr (IsTLAS) + request.instanceMap = std::move(blasInstanceMap); + requests.assign(entry.first,entry.second.firstCopyIx,i,std::move(as)); + } } - // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build -// TODO: compute with alignment - const auto buildSize = deferredParams.inputSize+deferredParams.scratchSize; - // sizes for building 1-by-1 vs parallel, note that - retval.m_minASBuildScratchSize = core::max(buildSize,retval.m_minASBuildScratchSize); - scratchSizeFullParallelBuild += buildSize; - // triangles, AABBs or Instance Transforms will need to be supplied from VRAM -#endif + requests.gpuObjUniqueCopyGroupIDs.clear(); + }; + createAccelerationStructures.template operator()(blasConversions); + 
blasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); + createAccelerationStructures.template operator()(tlasConversions); + tlasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); + } + // enqueue successfully created images with data to upload for conversion + auto& dfsCacheImages = std::get>(dfsCaches); + for (auto& entry : imageConversions.contentHashToCanonical) + for (auto i=0ull; igetRegions().empty()) + { + const bool recomputeMips = dfsCacheImages.nodes[entry.second.patchIndex.value].patch.recomputeMips; + auto [where,inserted] = retval.m_imageConversions.insert({gpuImg.get(),SReserveResult::SConvReqImage{core::smart_refctd_ptr(cpuImg),recomputeMips}}); + assert(inserted); } - // -// retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild,retval.m_maxASBuildScratchSize); } - // - if (retval.willDeviceASBuild()) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + imageConversions.propagateToCaches(dfsCacheImages,std::get>(retval.m_stagingCaches)); } - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); -// dedupCreateProp.operator()(); - + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); +// dedupCreateProp.template operator()(); } // write out results @@ -3445,12 +3482,14 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (const auto& gpuObj=found.gpuObj; gpuObj) { results[i] = gpuObj; +#ifdef _NBL_DEBUG // if something with this content hash is in the stagingCache, then it must match the `found->gpuObj` if (auto finalCacheIt=stagingCache.find(gpuObj.get()); finalCacheIt!=stagingCache.end()) { - const bool matches = finalCacheIt->second==typename CCache::key_t(found.contentHash,uniqueCopyGroupID); + const bool matches = finalCacheIt->second.cacheKey==typename CCache::key_t(found.contentHash,uniqueCopyGroupID); assert(matches); } +#endif } else inputs.logger.log("No GPU Object could be found or created for Root Asset %p in group %d",system::ILogger::ELL_ERROR,asset,uniqueCopyGroupID); @@ -3458,32 +3497,219 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }; core::for_each_in_tuple(inputs.assets,finalize); - retval.m_converter = core::smart_refctd_ptr(this); - retval.m_logger = system::logger_opt_smart_ptr(core::smart_refctd_ptr(inputs.logger.get())); - return retval; -} - -// -ISemaphore::future_t CAssetConverter::convert_impl(SReserveResult& reservations, SConvertParams& params) -{ - ISemaphore::future_t retval = IQueue::RESULT::OTHER_ERROR; - system::logger_opt_ptr logger = reservations.m_logger.get().get(); - if (!reservations.m_converter) + // A failed conversion can cause dangling GPU object pointers, and needless work for objects which will die soon after, so prune with a Top-Down pass anything thats not reachable from a root { - 
logger.log("Cannot call convert on an unsuccessful reserve result! Or are you attempting to do a double run of `convert` ?",system::ILogger::ELL_ERROR); - return retval; - } - assert(reservations.m_converter.get()==this); - auto device = m_params.device; + // we use a genious trick, if someone else is using the GPU object, the refcount must obviously be greater than 1 + auto pruneStaging = [&]()->void + { + auto& stagingCache = std::get>(retval.m_stagingCaches); + phmap::erase_if(stagingCache,[&retval](const auto& entry)->bool + { + if (entry.first->getReferenceCount()==1) + { + // I know what I'm doing, the hashmap is being annoying not letting you look up with const pointer key a non const pointer hashmap + auto* gpuObj = const_cast::video_t*>(entry.first); + if constexpr (std::is_same_v) + retval.m_bufferConversions.erase(gpuObj); + if constexpr (std::is_same_v) + for (auto i=0; i<2; i++) + retval.m_blasConversions[i].erase(gpuObj); + if constexpr (std::is_same_v) + for (auto i=0; i<2; i++) + retval.m_tlasConversions[i].erase(gpuObj); + if constexpr (std::is_same_v) + retval.m_imageConversions.erase(gpuObj); + // TODO: erase from `retval.m_gpuObjects` as well + return true; + } + // still referenced, keep it around + return false; + } + ); + }; + // The order these are called is paramount, the Higher Level User needs to die to let go of dependants and make our Garbage Collection work +// pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + } + + // only now get the queue flags + { + using q_fam_f = IQueue::FAMILY_FLAGS; + // acceleration structures, get scratch size + auto computeAccelerationStructureScratchSizes = [device,&retval]()->void + { + constexpr bool IsTLAS = std::is_same_v; + const auto& limits = device->getPhysicalDevice()->getLimits(); + const auto minScratchAlignment = limits.minAccelerationStructureScratchOffsetAlignment; + // index 0 is device build, 1 is host build + size_t scratchSizeFullParallelBuild[2] = {0,0}; + // + const std::conditional_t* pConversions; + if constexpr (IsTLAS) + pConversions = retval.m_tlasConversions; + else + pConversions = retval.m_blasConversions; + // we collect the stats AFTER making sure only needed TLAS and BLAS will be built + for (auto i=0; i<2; i++) + for (auto req : pConversions[i]) + { + const auto buildSize = req.second.buildSize; + // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently + retval.m_minASBuildScratchSize[i] = core::max(retval.m_minASBuildScratchSize[i],buildSize); + scratchSizeFullParallelBuild[i] = core::alignUp(scratchSizeFullParallelBuild[i],minScratchAlignment)+buildSize; + // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build + if (req.second.compact) + { + const auto asSize = req.first->getCreationParams().bufferRange.size; + assert(core::is_aligned_to(asSize,256)); + retval.m_compactedASMaxMemory += asSize; + } + } + // TLAS and BLAS can't build concurrently + 
retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]); + retval.m_maxASBuildScratchSize[1] = core::max(scratchSizeFullParallelBuild[1],retval.m_maxASBuildScratchSize[1]); + }; + computeAccelerationStructureScratchSizes.template operator()(); + computeAccelerationStructureScratchSizes.template operator()(); + if (retval.willDeviceASBuild() || retval.willCompactAS()) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + // images are trickier, we can't finish iterating until all possible flags are there + for (auto it=retval.m_imageConversions.begin(); !retval.m_queueFlags.hasFlags(q_fam_f::TRANSFER_BIT|q_fam_f::COMPUTE_BIT|q_fam_f::GRAPHICS_BIT) && it!=retval.m_imageConversions.end(); it++) + { + const auto boundMemory = it->first->getBoundMemory(); + assert(boundMemory.isValid()); + // Note: with `host_image_copy` this will get conditional + { + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + // Best effort guess, without actually looking at all regions + const auto& params = it->first->getCreationParameters(); + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 + if (isDepthOrStencilFormat(params.format) && (params.depthUsage|params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; + if (it->second.recomputeMips) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + } + } + // buffer conversions + for (auto it=retval.m_bufferConversions.begin(); !retval.m_queueFlags.hasFlags(q_fam_f::TRANSFER_BIT) && it!=retval.m_bufferConversions.end(); it++) + { + const auto boundMemory = it->first->getBoundMemory(); + assert(boundMemory.isValid()); + if (!canHostWriteToMemoryRange(boundMemory,it->first->getSize())) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + } + } + + retval.m_converter = core::smart_refctd_ptr(this); + retval.m_logger = system::logger_opt_smart_ptr(core::smart_refctd_ptr(inputs.logger.get())); + return retval; +} + +// +ISemaphore::future_t CAssetConverter::convert_impl(SReserveResult& reservations, SConvertParams& params) +{ + ISemaphore::future_t retval = IQueue::RESULT::OTHER_ERROR; + system::logger_opt_ptr logger = reservations.m_logger.get().get(); + if (!reservations.m_converter) + { + logger.log("Cannot call convert on an unsuccessful reserve result! 
Or are you attempting to do a double run of `convert` ?",system::ILogger::ELL_ERROR); + return retval; + } + assert(reservations.m_converter.get()==this); + auto device = m_params.device; + + auto hostBufferXferIt = reservations.m_bufferConversions.begin(); + core::vector memoryHostFlushRanges; + memoryHostFlushRanges.reserve(reservations.m_bufferConversions.size()); + auto hostUploadBuffers = [&](auto&& pred)->void + { + for (; hostBufferXferIt!=reservations.m_bufferConversions.end() && pred(); hostBufferXferIt++) + { + IGPUBuffer* buff = hostBufferXferIt->first; + const size_t size = buff->getSize(); + const auto boundMemory = buff->getBoundMemory(); + if (!canHostWriteToMemoryRange(boundMemory,size)) + continue; + auto* const memory = boundMemory.memory; + const IDeviceMemoryAllocation::MemoryRange range = {boundMemory.offset,size}; + memcpy(reinterpret_cast(memory->getMappedPointer())+range.offset,hostBufferXferIt->second->getPointer(),size); + // let go of canonical asset (may free RAM) + hostBufferXferIt->second = nullptr; + if (memory->haveToMakeVisible()) + memoryHostFlushRanges.emplace_back(memory,range.offset,range.length,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); + } + if (!memoryHostFlushRanges.empty()) + { + device->flushMappedMemoryRanges(memoryHostFlushRanges); + memoryHostFlushRanges.clear(); + } + }; + + // wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users) + core::unordered_map outputReverseMap; + core::for_each_in_tuple(reservations.m_gpuObjects,[&outputReverseMap](const auto& gpuObjects)->void + { + uint32_t i = 0; + for (const auto& gpuObj : gpuObjects) + outputReverseMap[gpuObj.value.get()] = i++; + } + ); + auto markFailure = [&reservations,&outputReverseMap,logger](const char* message, smart_refctd_ptr* canonical, typename SReserveResult::staging_cache_t::mapped_type* cacheNode)->void + { + // wipe the smart pointer to the canonical, make sure we release that memory ASAP if no other user is around + *canonical = nullptr; + // also drop the smart pointer from the output array so failures release memory quickly + const auto foundIx = outputReverseMap.find(cacheNode->gpuRef.get()); + if (foundIx!=outputReverseMap.end()) + { + auto& resultOutput = std::get>(reservations.m_gpuObjects); + resultOutput[foundIx->second].value = nullptr; + outputReverseMap.erase(foundIx); + } + logger.log("%s failed for \"%s\"",system::ILogger::ELL_ERROR,message,cacheNode->gpuRef->getObjectDebugName()); + // drop smart pointer + cacheNode->gpuRef = nullptr; + }; + + // want to check if deps successfully exist + struct SMissingDependent + { + // This only checks if whether we had to convert and failed, but the dependent might be in readCache of one or more converters, so if in doubt assume its okay + inline operator bool() const {return wasInStaging && gotWiped;} - // compacted TLASes need to be substituted in cache and Descriptor Sets + bool wasInStaging; + bool gotWiped; + }; + auto missingDependent = [&reservations](const typename asset_traits::video_t* dep)->SMissingDependent + { + const auto& stagingCache = std::get>(reservations.m_stagingCaches); + const auto found = stagingCache.find(dep); + SMissingDependent retval = {.wasInStaging=found!=stagingCache.end()}; + retval.gotWiped = retval.wasInStaging && !found->second.gpuRef; + return retval; + }; + + // Descriptor Sets need their TLAS descriptors substituted if they've been compacted core::unordered_map> compactedTLASMap; // Anything to do? 
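For reference, a minimal sketch of the host-upload path that `hostUploadBuffers` above takes, with hypothetical stand-in types in place of `IDeviceMemoryAllocation` and the mapped-range structs; the point is only the ordering of memcpy into the mapped pointer followed by a conditional flush of non-coherent memory.

    // Illustrative sketch (hypothetical stand-ins): copy into the mapped pointer at the
    // buffer's bound offset, record a flush range only when the memory is not host-coherent.
    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    struct MappedAllocation            // stand-in for the device memory allocation
    {
        uint8_t* mappedPtr;            // assumed already mapped for writing
        bool     hostCoherent;         // "have to make visible" == !hostCoherent
    };
    struct FlushRange { MappedAllocation* memory; std::size_t offset; std::size_t length; };

    void hostUpload(MappedAllocation& memory, const std::size_t boundOffset,
                    const void* src, const std::size_t size, std::vector<FlushRange>& flushRanges)
    {
        std::memcpy(memory.mappedPtr + boundOffset, src, size);
        if (!memory.hostCoherent)      // non-coherent memory needs an explicit flush before GPU reads
            flushRanges.push_back({&memory, boundOffset, size});
        // the caller batches flushRanges into a single flush call afterwards
    }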
- auto reqQueueFlags = reservations.m_queueFlags; - if (reqQueueFlags.value!=IQueue::FAMILY_FLAGS::NONE) + if (reservations.m_queueFlags.value!=IQueue::FAMILY_FLAGS::NONE) { // whether we actually get around to doing that depends on validity and success of transfers - const bool shouldDoSomeCompute = reqQueueFlags.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT); + const bool shouldDoSomeCompute = reservations.m_queueFlags.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT); auto invalidIntended = [device,logger](const IQueue::FAMILY_FLAGS flag, const SIntendedSubmitInfo* intended)->bool { if (!intended || !intended->valid()) @@ -3522,13 +3748,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return retval; } using buffer_usage_f = IGPUBuffer::E_USAGE_FLAGS; - constexpr buffer_usage_f asBuildInputFlags = buffer_usage_f::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|buffer_usage_f::EUF_TRANSFER_DST_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; - // we may use the staging buffer directly to skip an extra copy on small enough geometries - if (!params.utilities->getDefaultUpStreamingBuffer()->getBuffer()->getCreationParams().usage.hasFlags(asBuildInputFlags)) - { - logger.log("An Acceleration Structure will be built on Device but Default UpStreaming Buffer from IUtilities doesn't have required usage flags!",system::ILogger::ELL_ERROR); - return retval; - } + constexpr buffer_usage_f asBuildInputFlags = buffer_usage_f::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; constexpr buffer_usage_f asBuildScratchFlags = buffer_usage_f::EUF_STORAGE_BUFFER_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; auto* scratchBuffer = params.scratchForDeviceASBuild->getBuffer(); const auto& scratchParams = scratchBuffer->getCachedCreationParams(); @@ -3550,12 +3770,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul logger.log("Acceleration Structure Scratch Device Memory Allocator not large enough!",system::ILogger::ELL_ERROR); return retval; } + // this alignment is probably bigger than required by any Build Input const auto minScratchAlignment = device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; if (addrAlloc.max_alignment()(scratchBuffer->getBoundMemory().memory->getMappedPointer()); // Need to use Transfer Queue and copy via staging buffer @@ -3569,25 +3791,40 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const auto transferFamily = params.transfer->queue->getFamilyIndex(); // But don't want to have to do QFOTs between Transfer and Queue Families then if (transferFamily!=computeFamily) - if (!scratchParams.canBeUsedByQueueFamily(transferFamily)) + if (!scratchParams.isConcurrentSharing() || !scratchParams.canBeUsedByQueueFamily(transferFamily)) { logger.log("Acceleration Structure Scratch Device Memory Allocator not mapped and not concurrently share-able by Transfer Family %d!",system::ILogger::ELL_ERROR,transferFamily); return retval; } - reqQueueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + if (!scratchBuffer->getCreationParams().usage.hasFlags(buffer_usage_f::EUF_TRANSFER_DST_BIT)) + { + logger.log("Acceleration Structure Scratch Device Memory Allocator not mapped and doesn't the transfer destination usage flag!",system::ILogger::ELL_ERROR); + return retval; + } + // Right now we copy from staging to scratch, but in the future we may use the staging buffer directly to skip an extra copy on small enough geometries + if 
(!params.utilities->getDefaultUpStreamingBuffer()->getBuffer()->getCreationParams().usage.hasFlags(asBuildInputFlags|buffer_usage_f::EUF_TRANSFER_SRC_BIT)) + { + logger.log("An Acceleration Structure will be built on Device but Default UpStreaming Buffer from IUtilities doesn't have required usage flags!", system::ILogger::ELL_ERROR); + return retval; + } } } // the elusive and exotic host builds - if (reservations.willHostASBuild() && !params.scratchForHostASBuild) + if (reservations.willHostASBuild()) { - logger.log("An Acceleration Structure will be built on the Host but no Scratch Memory Allocator provided!", system::ILogger::ELL_ERROR); - return retval; + if (!params.scratchForHostASBuild) + { + logger.log("An Acceleration Structure will be built on the Host but no Scratch Memory Allocator provided!", system::ILogger::ELL_ERROR); + return retval; + } + // TODO: check everything else when we actually support host builds } // and compacting - if (reservations.willCompactAS() && !params.compactedASAllocator) + if (reservations.willCompactAS()) { - logger.log("An Acceleration Structure will be compacted but no Device Memory Allocator provided!", system::ILogger::ELL_ERROR); - return retval; + if (!params.compactedASAllocator) + logger.log("Acceleration Structures will be compacted using the ILogicalDevice as the memory allocator!", system::ILogger::ELL_WARNING); + // note that can't check the compacted AS allocator being large enough against `reservations.m_compactedASMaxMemory` } // @@ -3627,40 +3864,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return retval; } - // - auto findInStaging = [&reservations](const typename asset_traits::video_t* gpuObj)->core::blake3_hash_t* - { - auto& stagingCache = std::get>(reservations.m_stagingCaches); - const auto found = stagingCache.find(const_cast::video_t*>(gpuObj)); - assert(found!=stagingCache.end()); - return const_cast(&found->second.value); - }; - // wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users) - core::unordered_map outputReverseMap; - core::for_each_in_tuple(reservations.m_gpuObjects,[&outputReverseMap](const auto& gpuObjects)->void - { - uint32_t i = 0; - for (const auto& gpuObj : gpuObjects) - outputReverseMap[gpuObj.value.get()] = i++; - } - ); - auto markFailureInStaging = [&reservations,&outputReverseMap,logger](const char* message, smart_refctd_ptr& canonical, const typename asset_traits::video_t* gpuObj, core::blake3_hash_t* hash)->void - { - // wipe the smart pointer to the canonical, make sure we release that memory ASAP if no other user is around - canonical = nullptr; - logger.log("%s failed for \"%s\"",system::ILogger::ELL_ERROR,message,gpuObj->getObjectDebugName()); - // change the content hash on the reverse map to a NoContentHash - *hash = CHashCache::NoContentHash; - // also drop the smart pointer from the output array so failures release memory quickly - const auto foundIx = outputReverseMap.find(gpuObj); - if (foundIx!=outputReverseMap.end()) - { - auto& resultOutput = std::get>(reservations.m_gpuObjects); - resultOutput[foundIx->second].value = nullptr; - outputReverseMap.erase(foundIx); - } - }; - // core::bitflag submitsNeeded = IQueue::FAMILY_FLAGS::NONE; @@ -3712,6 +3915,15 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // some state so we don't need to look later auto xferCmdBuf = shouldDoSomeTransfer ? 
params.transfer->getCommandBufferForRecording():nullptr; + // + auto findInStaging = [&reservations](const typename asset_traits::video_t* gpuObj)->auto + { + auto& stagingCache = std::get>(reservations.m_stagingCaches); + const auto found = stagingCache.find(gpuObj); + assert(found!=stagingCache.end()); + return found; + }; + using buffer_mem_barrier_t = IGPUCommandBuffer::SBufferMemoryBarrier; // upload Buffers auto& buffersToUpload = reservations.m_bufferConversions; @@ -3719,38 +3931,38 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul core::vector finalReleases; finalReleases.reserve(buffersToUpload.size()); // do the uploads - if (!buffersToUpload.empty()) + if (!buffersToUpload.empty() && xferCmdBuf) { xferCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Upload Buffers START"); xferCmdBuf->cmdbuf->endDebugMarker(); } for (auto& item : buffersToUpload) { - auto* buffer = item.gpuObj; - const SBufferRange range = { - .offset = 0, - .size = item.gpuObj->getCreationParams().size, - .buffer = core::smart_refctd_ptr(buffer) - }; - auto pFoundHash = findInStaging.template operator()(buffer); + auto* buffer = item.first; + const size_t size = buffer->getCreationParams().size; + // host will upload + if (canHostWriteToMemoryRange(buffer->getBoundMemory(),size)) + continue; + auto pFound = &findInStaging.template operator()(buffer)->second; // - const auto ownerQueueFamily = checkOwnership(buffer,params.getFinalOwnerQueueFamily(buffer,*pFoundHash),transferFamily); + const auto ownerQueueFamily = checkOwnership(buffer,params.getFinalOwnerQueueFamily(buffer,pFound->cacheKey.value),transferFamily); if (ownerQueueFamily==QueueFamilyInvalid) { - markFailureInStaging("invalid Final Queue Family given by user callback",item.canonical,buffer,pFoundHash); + markFailure("invalid Final Queue Family given by user callback",&item.second,pFound); continue; } // do the upload - const bool success = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,item.canonical->getPointer()); + const SBufferRange range = {.offset=0,.size=size,.buffer=core::smart_refctd_ptr(buffer)}; + const bool success = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,item.second->getPointer()); // current recording buffer may have changed xferCmdBuf = params.transfer->getCommandBufferForRecording(); if (!success) { - markFailureInStaging("Data Upload",item.canonical,buffer,pFoundHash); + markFailure("Data Upload",&item.second,pFound); continue; } // let go of canonical asset (may free RAM) - item.canonical = nullptr; + item.second = nullptr; submitsNeeded |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; // enqueue ownership release if necessary if (ownerQueueFamily!=IQueue::FamilyIgnored) @@ -3767,12 +3979,11 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul .range = range }); } - if (!buffersToUpload.empty()) + if (!buffersToUpload.empty() && xferCmdBuf) { xferCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Upload Buffers END"); xferCmdBuf->cmdbuf->endDebugMarker(); } - buffersToUpload.clear(); // release ownership if (!finalReleases.empty()) pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers=finalReleases},"Ownership Releases of Buffers Failed"); @@ -3782,7 +3993,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // whenever transfer needs to do a submit overflow because it ran out of memory for streaming, we can already submit the recorded compute shader dispatches auto computeCmdBuf = shouldDoSomeCompute ? 
params.compute->getCommandBufferForRecording():nullptr; - auto drainCompute = [¶ms,&computeCmdBuf](const std::span extraSignal={})->auto + auto drainCompute = [¶ms,shouldDoSomeTransfer,&computeCmdBuf](const std::span extraSignal={})->auto { if (!computeCmdBuf || computeCmdBuf->cmdbuf->empty()) return IQueue::RESULT::SUCCESS; @@ -3790,15 +4001,18 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto& waitSemaphoreSpan = params.compute->waitSemaphores; std::unique_ptr patchedWaits; // the transfer scratch semaphore value, is from the last submit, not the future value we're enqueing all the deferred memory releases with - if (waitSemaphoreSpan.empty()) - waitSemaphoreSpan = {¶ms.transfer->scratchSemaphore,1}; - else + if (shouldDoSomeTransfer) { - const auto origCount = waitSemaphoreSpan.size(); - patchedWaits.reset(new IQueue::SSubmitInfo::SSemaphoreInfo[origCount+1]); - std::copy(waitSemaphoreSpan.begin(),waitSemaphoreSpan.end(),patchedWaits.get()); - patchedWaits[origCount] = params.transfer->scratchSemaphore; - waitSemaphoreSpan = {patchedWaits.get(),origCount+1}; + if (waitSemaphoreSpan.empty()) + waitSemaphoreSpan = {¶ms.transfer->scratchSemaphore,1}; + else + { + const auto origCount = waitSemaphoreSpan.size(); + patchedWaits.reset(new IQueue::SSubmitInfo::SSemaphoreInfo[origCount+1]); + std::copy(waitSemaphoreSpan.begin(),waitSemaphoreSpan.end(),patchedWaits.get()); + patchedWaits[origCount] = params.transfer->scratchSemaphore; + waitSemaphoreSpan = {patchedWaits.get(),origCount+1}; + } } // don't worry about resetting old `waitSemaphores` because they get cleared to an empty span after overflow submit IQueue::RESULT res = params.compute->submit(computeCmdBuf,extraSignal); @@ -3810,15 +4024,20 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return IQueue::RESULT::OTHER_ERROR; return res; }; - // compose our overflow callback on top of what's already there, only if we need to ofc - auto origXferStallCallback = params.transfer->overflowCallback; - if (shouldDoSomeCompute) - params.transfer->overflowCallback = [&origXferStallCallback,&drainCompute](const ISemaphore::SWaitInfo& tillScratchResettable)->void + + // We want to be doing Host operations while stalled for GPU, compose our overflow callback on top of what's already there, only if we need to ofc + std::function origXferStallCallback; + if (shouldDoSomeTransfer) + { + origXferStallCallback = std::move(params.transfer->overflowCallback); + params.transfer->overflowCallback = [device,&hostUploadBuffers,&origXferStallCallback,&drainCompute](const ISemaphore::SWaitInfo& tillScratchResettable)->void { drainCompute(); if (origXferStallCallback) origXferStallCallback(tillScratchResettable); + hostUploadBuffers([device,&tillScratchResettable]()->bool{return device->waitForSemaphores({&tillScratchResettable,1},false,0)==ISemaphore::WAIT_RESULT::TIMEOUT;}); }; + } // when overflowing compute resources, we need to submit the Xfer before submitting Compute auto drainBoth = [¶ms,&xferCmdBuf,&drainCompute](const std::span extraSignal={})->auto { @@ -3893,7 +4112,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return true; }; - // because of the layout transitions + // because of the layout transitions (TODO: conditional when host_image_copy gets implemented) params.transfer->scratchSemaphore.stageMask |= PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; // TODO:: Shall we rewrite? e.g. 
we upload everything first, extra submit for QFOT pipeline barrier & transition in overflow callback, then record compute commands, and submit them, plus their final QFOTs // Lets analyze sync cases: @@ -3910,9 +4129,9 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul for (auto& item : imagesToUpload) { // basiscs - const auto* cpuImg = item.canonical.get(); - auto* image = item.gpuObj; - auto pFoundHash = findInStaging.template operator()(image); + auto& cpuImg = item.second.canonical; + auto* image = item.first; + auto pFound = &findInStaging.template operator()(image)->second; // get params const auto& creationParams = image->getCreationParameters(); const auto format = creationParams.format; @@ -3930,7 +4149,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }); IGPUImageView::E_TYPE viewType = IGPUImageView::E_TYPE::ET_2D_ARRAY; // create Mipmapping source Image View, allocate its place in the descriptor set and write it - if (item.recomputeMips) + if (item.second.recomputeMips) { switch (creationParams.type) { @@ -3962,7 +4181,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } if (!quickWriteDescriptor(SrcMipBinding,srcIx,std::move(srcView))) { - markFailureInStaging("Source Mip Level Descriptor Write",item.canonical,image,pFoundHash); + markFailure("Source Mip Level Descriptor Write",&cpuImg,pFound); continue; } } @@ -3971,7 +4190,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { // Transfer and Compute barriers get recorded for image individually (see the TODO why its horrible) // so we only need to worry about QFOTs for current image if they even exist - if (item.recomputeMips && !transferBarriers.empty()) + if (item.second.recomputeMips && !transferBarriers.empty()) { // so now we need a immeidate QFOT Release cause we already recorded some compute mipmapping for current image if (pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=transferBarriers},"Recording QFOT Release from Transfer Queue Family after overflow failed")) @@ -3983,7 +4202,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } else { - markFailureInStaging("Image QFOT Pipeline Barrier",item.canonical,image,pFoundHash); + markFailure("Image QFOT Pipeline Barrier",&cpuImg,pFound); return false; } return true; @@ -3999,6 +4218,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul computeBarriers.clear(); const bool concurrentSharing = image->getCachedCreationParams().isConcurrentSharing(); uint8_t lvl = 0; + const auto recomputeMipMask = item.second.recomputeMips; bool _prevRecompute = false; for (; lvl CAssetConverter::convert_impl(SReserveResul // if any op, it will always be a release (Except acquisition of first source mip in compute) barrier.ownershipOp = ownership_op_t::RELEASE; // if we're recomputing this mip level - const bool recomputeMip = lvl && (item.recomputeMips&(0x1u<<(lvl-1))); + const bool recomputeMip = lvl && (recomputeMipMask&(0x1u<<(lvl-1))); // query final layout from callback - const auto finalLayout = params.getFinalLayout(image,*pFoundHash,lvl); + const auto finalLayout = params.getFinalLayout(image,pFound->cacheKey.value,lvl); // get region data for upload auto regions = cpuImg->getRegions(lvl); // basic error checks @@ -4042,7 +4262,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul logger.log("What are you doing requesting layout UNDEFINED for mip level % of image %s after Upload or Mip 
Recomputation!?",system::ILogger::ELL_ERROR,lvl,image->getObjectDebugName()); break; } - const auto suggestedFinalOwner = params.getFinalOwnerQueueFamily(image,*pFoundHash,lvl); + const auto suggestedFinalOwner = params.getFinalOwnerQueueFamily(image,pFound->cacheKey.value,lvl); // if we'll recompute the mipmap, then do the layout transition on the compute queue (there's one less potential QFOT) if (recomputeMip) { @@ -4228,7 +4448,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; // whether next mip will need to read from this one to recompute itself - const bool sourceForNextMipCompute = item.recomputeMips&(0x1u<general transition tmp.newLayout = sourceForNextMipCompute ? layout_t::GENERAL : layout_t::TRANSFER_DST_OPTIMAL; // fire off the pipeline barrier so we can start uploading right away @@ -4297,18 +4517,18 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // failed in the for-loop if (lvl != creationParams.mipLevels) { - markFailureInStaging("Compute Mip Mapping",item.canonical,image,pFoundHash); + markFailure("Compute Mip Mapping",&cpuImg,pFound); continue; } // let go of canonical asset (may free RAM) - item.canonical = nullptr; + cpuImg = nullptr; } // here we only record barriers that do final layout transitions and release ownership to final queue family if (!transferBarriers.empty()) { if (!pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=transferBarriers},"Final Pipeline Barrier recording to Transfer Command Buffer failed")) { - markFailureInStaging("Image Data Upload Pipeline Barrier",item.canonical,image,pFoundHash); + markFailure("Image Data Upload Pipeline Barrier",&cpuImg,pFound); continue; } // even if no uploads performed, we do layout transitions on empty images from Xfer Queue @@ -4320,7 +4540,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul dsAlloc->multi_deallocate(SrcMipBinding,1,&srcIx,params.compute->getFutureScratchSemaphore()); if (!pipelineBarrier(computeCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=computeBarriers},"Final Pipeline Barrier recording to Compute Command Buffer failed")) { - markFailureInStaging("Compute Mip Mapping Pipeline Barrier",item.canonical,image,pFoundHash); + markFailure("Compute Mip Mapping Pipeline Barrier",&cpuImg,pFound); continue; } } @@ -4344,258 +4564,302 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto& tlasesToBuild = reservations.m_tlasConversions[0]; const auto blasCount = blasesToBuild.size(); const auto tlasCount = tlasesToBuild.size(); - const auto maxASCount = hlsl::max(tlasCount,blasCount); ownershipTransfers.reserve(blasCount+tlasCount); - auto* scratchBuffer = params.scratchForDeviceASBuild->getBuffer(); - core::vector flushRanges; - const bool manualFlush = scratchBuffer->getBoundMemory().memory->haveToMakeVisible(); - if (manualFlush) // BLAS builds do max 3 writes each TLAS builds do max 2 writes each - flushRanges.reserve(hlsl::max(blasCount*3,tlasCount*2)); // Right now we build all BLAS first, then all TLAS // (didn't fancy horrible concurrency managment taking compactions into account) auto queryPool = device->createQueryPool({.queryCount=hlsl::max(blasCount,tlasCount),.queryType=IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE}); - const asset::SMemoryBarrier readGeometryOrInstanceInASBuildBarrier = { - // the last use of the source BLAS could have been a build or a compaction 
- .srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, - .srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT, - .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, - .dstAccessMask = ACCESS_FLAGS::STORAGE_READ_BIT - }; - // lambdas! - auto streamDataToScratch = [&](const size_t offset, const size_t size,IUtilities::IUpstreamingDataProducer& callback) -> bool - { - if (deviceASBuildScratchPtr) - { - callback(deviceASBuildScratchPtr+offset,0ull,size); - if (manualFlush) - flushRanges.emplace_back(scratchBuffer->getBoundMemory().memory,offset,size,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); - return true; - } - else if (const SBufferRange range={.offset=offset,.size=size,.buffer=smart_refctd_ptr(scratchBuffer)}; params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,callback)) - return true; - else - return false; - }; - // - auto recordBuildCommandsBase = [&](auto& buildInfos, auto& rangeInfos)->void - { - if (buildInfos.empty()) - return; - // Lets analyze sync cases: - // - Mapped Host write = no barrier, flush & optional submit sufficient - // - Single Queue = Global Memory Barrier - // - Two distinct Queues = no barrier, semaphore signal-wait is sufficient - // - Two distinct Queue Families Exclusive Sharing mode = QFOT necessary but we require concurrent sharing on the scratch buffer ! - bool success = !uniQueue || !deviceASBuildScratchPtr || pipelineBarrier(computeCmdBuf,{.memBarriers={&readGeometryOrInstanceInASBuildBarrier,1}},"Pipeline Barriers of Acceleration Structure backing Buffers failed!"); - // - success = success && computeCmdBuf->cmdbuf->buildAccelerationStructures({buildInfos},rangeInfos.data()); - if (!success) - for (const auto& info : buildInfos) - { - const auto pFoundHash = findInStaging.template operator()(info.dstAS); - smart_refctd_ptr dummy; // already null at this point - markFailureInStaging("AS Build Command Recording",dummy,info.dstAS,pFoundHash); - } - buildInfos.clear(); - rangeInfos.clear(); - }; - - // Not messing around with listing AS backing buffers individually, ergonomics of that are null - const asset::SMemoryBarrier readASInASCompactBarrier = { - .srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, - .srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT, - .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT, - .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT - }; - - // Device BLAS builds - if (blasCount) - { - core::vector compactions; - // build - { - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build BLASes START"); - computeCmdBuf->cmdbuf->endDebugMarker(); -#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION - constexpr auto GeometryIsAABBFlag = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; - - core::vector buildInfos; buildInfos.reserve(blasCount); - core::vector rangeInfo; rangeInfo.reserve(blasCount); - core::vector> triangles; - core::vector> aabbs; - { - size_t totalTriGeoCount = 0; - size_t totalAABBGeoCount = 0; - for (auto& item : blasToBuild) - { - const size_t geoCount = item.canonical->getGeometryCount(); - if (item.canonical->getBuildFlags().hasFlags(GeometryIsAABBFlag)) - totalAABBGeoCount += geoCount; - else - totalTriGeoCount += geoCount; - } - triangles.reserve(totalTriGeoCount); - triangles.reserve(totalAABBGeoCount); - } - for (auto& item : blasToBuild) + // leftover for TLAS builds + using compacted_blas_map_t = unordered_map>; + compacted_blas_map_t compactedBLASMap; + bool 
failedBLASBarrier = false; + // returns a map of compacted Acceleration Structures + auto buildAndCompactASes = [&](auto& asesToBuild)->unordered_map> { - auto* as = item.gpuObj; - auto pFoundHash = findInStaging.template operator()(as); - if (item.asBuildParams.host) - { - auto dOp = device->createDeferredOperation(); - // - if (!device->buildAccelerationStructure(dOp.get(),info,range)) - { - markFailureInStaging("BLAS Build Command Recording",item.canonical,gpuObj,pFoundHash); - continue; - } - } - else - { - auto& buildInfo = buildInfo.emplace_back({ - .buildFlags = item.buildFlags, - .geometryCount = item.canonical->getGeometryCount(), - // this is not an update - .srcAS = nullptr, - .dstAS = as.get() - }); - if (item.canonical->getBuildFlags().hasFlags(GeometryIsAABBFlag)) - buildInfo.aabbs = nullptr; - else - buildInfo.triangles = nullptr; - computeCmdBuf->cmdbuf->buildAccelerationStructures(buildInfo,rangeInfo); - } - } -#endif - if (!compactions.empty()) - { - // submit cause host needs to read the queries - drainCompute(); - } - // want to launch the BLAS builds in a separate submit, so the scratch semaphore can signal and free the scratch so more is available for TLAS builds - else if (tlasCount) - drainCompute(); - blasesToBuild.clear(); - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build BLASes END"); - computeCmdBuf->cmdbuf->endDebugMarker(); - } - // compact - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact BLASes START"); - computeCmdBuf->cmdbuf->endDebugMarker(); - { - // the already compacted BLASes need to be written into the TLASes using them, want to swap them out ASAP -//reservations.m_blasBuildMap[canonical].gpuBLAS = compacted; - } - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact BLASes END"); - computeCmdBuf->cmdbuf->endDebugMarker(); - } + const auto asCount = asesToBuild.size(); + if (asCount==0) + return {}; + + constexpr bool IsTLAS = std::is_same_v; + using CPUAccelerationStructure = std::conditional_t; - // Device TLAS builds - if (tlasCount) - { - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build TLASes START"); - computeCmdBuf->cmdbuf->endDebugMarker(); - // A single pipeline barrier to ensure BLASes build before TLASes is needed - const asset::SMemoryBarrier readBLASInTLASBuildBarrier = { - // the last use of the source BLAS could have been a build or a compaction - .srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT, - .srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT, - .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, - .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT - }; - // either we built no BLASes (remember we could retrieve already built ones from cache) or we barrier for the previous compactions or builds - const bool failedBLASBarrier = blasCount && !pipelineBarrier(computeCmdBuf,{.memBarriers={&readBLASInTLASBuildBarrier,1}},"Failed to sync BLAS with TLAS build!"); - // TLAS compactions to do later core::vector compactions; // 0xffFFffFFu when not releasing ownership, otherwise index into `ownershipTransfers` where the ownership release for the old buffer was core::vector compactedOwnershipReleaseIndices; - compactions.reserve(tlasCount); - compactedOwnershipReleaseIndices.reserve(tlasCount); + compactions.reserve(asCount); + compactedOwnershipReleaseIndices.reserve(asCount); // build { - // - core::vector buildInfos; - buildInfos.reserve(tlasCount); - 
core::vector rangeInfos; - rangeInfos.reserve(tlasCount); - core::vector> trackedBLASes; - trackedBLASes.reserve(maxASCount); - auto recordBuildCommands = [&]()->void + auto* scratchBuffer = params.scratchForDeviceASBuild->getBuffer(); + core::vector flushRanges; + const bool manualFlush = scratchBuffer->getBoundMemory().memory->haveToMakeVisible(); + if (deviceASBuildScratchPtr && manualFlush) // TLAS builds do max 2 writes each and BLAS do much more anyway + flushRanges.reserve(asCount*2); + // lambdas! + auto streamDataToScratch = [&](const size_t offset, const size_t size,IUtilities::IUpstreamingDataProducer& callback) -> bool { - // rewrite the trackedBLASes pointers - for (auto& info : buildInfos) + if (deviceASBuildScratchPtr) { - const auto offset = info.trackedBLASes.data(); - const auto correctPtr = trackedBLASes.data()+reinterpret_cast(offset); - info.trackedBLASes = {reinterpret_cast(correctPtr),info.trackedBLASes.size()}; + callback(deviceASBuildScratchPtr+offset,0ull,size); + if (manualFlush) + flushRanges.emplace_back(scratchBuffer->getBoundMemory().memory,offset,size,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); + return true; + } + else + { + const SBufferRange range={.offset=offset,.size=size,.buffer=smart_refctd_ptr(scratchBuffer)}; + const bool retval = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,callback); + // current recording buffer may have changed + xferCmdBuf = params.transfer->getCommandBufferForRecording(); + return retval; } - recordBuildCommandsBase(buildInfos,rangeInfos); - trackedBLASes.clear(); }; // + core::vector buildInfos; + buildInfos.reserve(asCount); + using build_range_info_t = std::conditional_t; + core::vector rangeInfos; + rangeInfos.reserve(asCount); using scratch_allocator_t = std::remove_reference_t; using addr_t = typename scratch_allocator_t::size_type; + core::vector allocOffsets; + allocOffsets.reserve(asCount); + core::vector allocSizes; + allocSizes.reserve(asCount); + // BLAS and TLAS specific things + core::vector geometryRangeInfo; + core::vector> triangles; + core::vector> aabbs; + core::vector> trackedBLASes; + if constexpr (IsTLAS) + trackedBLASes.reserve(asCount); + else // would have to count total geometries in BLASes to initialize properly, and we probably don't want to over-reserve + { + geometryRangeInfo.reserve(asCount); + triangles.reserve(asCount); + aabbs.reserve(asCount); + } + // + core::vector alignments; + alignments.reserve(asCount*2); + constexpr auto GeometryIsAABBFlag = IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; + auto recordBuildCommands = [&]()->void + { + bool success = !buildInfos.empty(); + // Lets analyze sync cases: + // - Mapped Host write = no barrier, flush & optional submit sufficient + // - Single Queue = Global Memory Barrier + // - Two distinct Queues = no barrier, semaphore signal-wait is sufficient + // - Two distinct Queue Families Exclusive Sharing mode = QFOT necessary but we require concurrent sharing on the scratch buffer ! 
+ if (success) + { + const asset::SMemoryBarrier readGeometryOrInstanceInASBuildBarrier = { + // the last use of the source BLAS could have been a build or a compaction + .srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + .srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT, + .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, + .dstAccessMask = ACCESS_FLAGS::STORAGE_READ_BIT + }; + success = !uniQueue || deviceASBuildScratchPtr || pipelineBarrier(computeCmdBuf,{.memBarriers={&readGeometryOrInstanceInASBuildBarrier,1}},"Pipeline Barriers of Acceleration Structure backing Buffers failed!"); + } + // + constexpr bool IsTLAS = std::is_same_v; + if (success) + { + // rewrite the based pointers + if constexpr (IsTLAS) + for (auto& info : buildInfos) + { + const auto offset = info.trackedBLASes.data(); + const auto correctPtr = trackedBLASes.data()+reinterpret_cast(offset); + info.trackedBLASes = {reinterpret_cast(correctPtr),info.trackedBLASes.size()}; + } + else + { + for (auto& info : buildInfos) + { + if (info.buildFlags.hasFlags(GeometryIsAABBFlag)) + info.aabbs = aabbs.data()+reinterpret_cast(info.aabbs); + else + info.triangles = triangles.data()+reinterpret_cast(info.triangles); + } + for (auto& rangeInfo : rangeInfos) + rangeInfo = geometryRangeInfo.data()+reinterpret_cast(rangeInfo); + } + success = computeCmdBuf->cmdbuf->buildAccelerationStructures({buildInfos},rangeInfos.data()); + } + // account for the in-progress allocation (we may be called from an overflow submit) + const auto oldAllocCount = allocOffsets.size()-alignments.size(); + if (success) + { + submitsNeeded |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + // queue up a deferred allocation + if (oldAllocCount) + params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data(),params.compute->getFutureScratchSemaphore()); + } + else + { + // release right away + if (oldAllocCount) + params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data()); + for (const auto& info : buildInfos) + { + const auto stagingFound = findInStaging.template operator()(info.dstAS); + smart_refctd_ptr dummy; // already null at this point + markFailure("AS Build Command Recording",&dummy,&stagingFound->second); + } + } + allocOffsets.erase(allocOffsets.begin(),allocOffsets.begin()+oldAllocCount); + allocSizes.erase(allocSizes.begin(),allocSizes.begin()+oldAllocCount); + buildInfos.clear(); + rangeInfos.clear(); + if constexpr (IsTLAS) + trackedBLASes.clear(); + else + { + geometryRangeInfo.clear(); + triangles.clear(); + aabbs.clear(); + } + }; + + computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build Acceleration Structures START"); + computeCmdBuf->cmdbuf->endDebugMarker(); const auto& limits = physDev->getLimits(); - core::unordered_set> dedupBLASesUsed; - dedupBLASesUsed.reserve(reservations.m_blasBuildMap.size()); - for (auto& tlasToBuild : tlasesToBuild) + for (auto& asToBuild : asesToBuild) { - dedupBLASesUsed.clear(); - const auto as = tlasToBuild.gpuObj; - const auto pFoundHash = findInStaging.template operator()(as); + auto& canonical = asToBuild.second.canonical; + const auto as = asToBuild.first; + const auto pFound = &findInStaging.template operator()(as)->second; const auto& backingRange = as->getCreationParams().bufferRange; // checking ownership for the future on old buffer, but compacted will be made with same sharing creation parameters - const auto finalOwnerQueueFamily = 
checkOwnership(backingRange.buffer.get(),params.getFinalOwnerQueueFamily(as,*pFoundHash),computeFamily); + const auto finalOwnerQueueFamily = checkOwnership(backingRange.buffer.get(),params.getFinalOwnerQueueFamily(as,pFound->cacheKey.value),computeFamily); if (finalOwnerQueueFamily==QueueFamilyInvalid) { - markFailureInStaging("invalid Final Queue Family given by user callback",tlasToBuild.canonical,as,pFoundHash); + markFailure("invalid Final Queue Family given by user callback",&canonical,pFound); continue; } - const auto instances = tlasToBuild.canonical->getInstances(); - const auto instanceCount = static_cast(instances.size()); - size_t instanceDataSize = 0; - // gather total input size and check dependants exist - for (const auto& instance : instances) + // clean up the allocation if we fail to make it to the end of loop for whatever reason + alignments.clear(); + auto allocCount = 0; + auto deallocSrc = core::makeRAIIExiter([¶ms,&allocOffsets,&allocSizes,&alignments,&allocCount]()->void + { + const auto beginIx = allocSizes.size()-allocCount; + // if got to end of loop queue up the release of memory, otherwise release right away + if (allocCount) + params.scratchForDeviceASBuild->multi_deallocate(allocCount,allocOffsets.data()+beginIx,allocSizes.data()+beginIx); + allocOffsets.resize(beginIx); + allocSizes.resize(beginIx); + alignments.clear(); + } + ); + allocSizes.push_back(asToBuild.second.scratchSize); + alignments.push_back(limits.minAccelerationStructureScratchOffsetAlignment); + const bitflag buildFlags = asToBuild.second.getBuildFlags(); + if constexpr (IsTLAS) { - // failed BLAS builds erase themselves from this map, so this checks if some BLAS used but which had to be built failed the build - const auto found = reservations.m_blasBuildMap.find(instance.getBase().blas.get()); - if (found==reservations.m_blasBuildMap.end() || failedBLASBarrier && found->second.buildDuringConvertCall) + const auto instances = canonical->getInstances(); + // gather total input size and check dependants exist + size_t instanceDataSize = 0; + bool dependsOnBLASBuilds = false; + const auto& instanceMap = asToBuild.second.instanceMap; + for (const auto& instance : instances) { - instanceDataSize = 0; - break; + auto found = instanceMap.find(instance.getBase().blas.get()); + assert(instanceMap.end()!=found); + const auto depInfo = missingDependent.template operator()(found->second.get()); + if (depInfo) + { + instanceDataSize = 0; + break; + } + if (depInfo.wasInStaging) + dependsOnBLASBuilds = true; + instanceDataSize += ITopLevelAccelerationStructure::getInstanceSize(instance.getType()); + } + // problem with building some Dependent BLASes + if (failedBLASBarrier && dependsOnBLASBuilds) + { + markFailure("building BLASes which current TLAS build wants to instance",&canonical,pFound); + continue; + } + // problem with finding the dependents (BLASes) + if (instanceDataSize==0) + { + markFailure("finding valid Dependant GPU BLASes for TLAS build",&canonical,pFound); + continue; + } + allocSizes.push_back(instanceDataSize); + alignments.push_back(16); + if (as->usesMotion()) + { + allocSizes.push_back(sizeof(void*)*instances.size()); + alignments.push_back(alignof(uint64_t)); } - instanceDataSize += ITopLevelAccelerationStructure::getInstanceSize(instance.getType()); } - // problem with finding the dependents (BLASes) - if (instanceDataSize==0) + else { - markFailureInStaging("finding valid Dependant GPU BLASes for TLAS build",tlasToBuild.canonical,as,pFoundHash); - continue; + const uint32_t* 
pPrimitiveCounts = canonical->getGeometryPrimitiveCounts().data(); + if (buildFlags.hasFlags(GeometryIsAABBFlag)) + { + for (const auto& geom : canonical->getAABBGeometries()) + if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) + { + allocSizes.push_back(aabbCount*geom.stride); + alignments.push_back(alignof(float)); + } + } + else + { + for (const auto& geom : canonical->getTriangleGeometries()) + if (const auto triCount=*(pPrimitiveCounts++); triCount) + { + auto size = geom.vertexStride*(geom.vertexData[1] ? 2:1)*(geom.maxVertex+1); + uint16_t alignment = hlsl::max(0x1u<(alignof(float),alignment); + } + uint16_t indexSize = 0u; + switch (geom.indexType) + { + case E_INDEX_TYPE::EIT_16BIT: + indexSize = alignof(uint16_t); + break; + case E_INDEX_TYPE::EIT_32BIT: + indexSize = alignof(uint32_t); + break; + default: + break; + } + if (indexSize) + { + size = core::alignUp(size,indexSize)+triCount*3*indexSize; + alignment = hlsl::max(indexSize,alignment); + } + allocSizes.push_back(size); + alignments.push_back(alignment); + const auto tmp = asToBuild.second.scratchSize; + //logger.log("%p Triangle Data Size %d Align %d Scratch Size %d",system::ILogger::ELL_DEBUG,canonical.get(),size,alignment,tmp); + } + } } - // allocate scratch and build inputs - constexpr uint32_t MaxAllocCount = 3; - addr_t offsets[MaxAllocCount] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value}; - const addr_t sizes[MaxAllocCount] = {tlasToBuild.scratchSize,instanceDataSize,sizeof(void*)*instanceCount}; + allocOffsets.resize(allocSizes.size(),scratch_allocator_t::invalid_value); + // allocate out scratch or submit overflow, if fail then flush and keep trying till space is made + auto* offsets = allocOffsets.data()+allocOffsets.size()-alignments.size(); + const auto* sizes = allocSizes.data()+allocSizes.size()-alignments.size(); + //logger.log("%p Combined Size %d",system::ILogger::ELL_DEBUG,canonical.get(),std::accumulate(sizes,sizes+alignments.size(),0)); + for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(alignments.size(),offsets,sizes,alignments.data())!=0; t++) { - const addr_t alignments[MaxAllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,8}; -/* TODO: move to reserve phase - prevent CPU hangs by making sure allocator big enough to service us -{ -addr_t worstSize = sizes[0]; -for (auto i=1u; iminScratchSize) - minScratchSize = worstSize; -}*/ - const auto AllocCount = as->usesMotion() ? 
2:3; - // if fail then flush and keep trying till space is made - for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(AllocCount,&offsets[0],&sizes[0],&alignments[0])!=0u; t++) if (t==1) // don't flush right away cause allocator not defragmented yet { recordBuildCommands(); + // the submit overflow deallocates old offsets and erases them from the temp arrays, pointer changes + offsets = allocOffsets.data(); + sizes = allocSizes.data(); // if writing to scratch directly, flush the writes if (!flushRanges.empty()) { @@ -4604,13 +4868,28 @@ if (worstSize>minScratchSize) } drainCompute(); } - // queue up a deferred allocation - params.scratchForDeviceASBuild->multi_deallocate(AllocCount,&offsets[0],&sizes[0],params.compute->getFutureScratchSemaphore()); + // we may be preventing ourselves from allocating memory, with one successful allocation still being alive and fragmenting our allocator + params.scratchForDeviceASBuild->multi_deallocate(alignments.size(),offsets,sizes); + std::fill_n(offsets,alignments.size(),scratch_allocator_t::invalid_value); } - // stream the instance/geometry input in + // now upon a failure, our allocations will need to be deallocated + allocCount = alignments.size(); + // prepare build infos + typename AccelerationStructure::DeviceBuildInfo buildInfo; + buildInfo.scratch = {.offset=offsets[0],.buffer=smart_refctd_ptr(scratchBuffer)}; + buildInfo.buildFlags = buildFlags; + buildInfo.dstAS = as; + // abortion backup + bool success = true; + const auto geometryRangeInfoOffset = geometryRangeInfo.size(); + const auto trianglesOffset = triangles.size(); + const auto aabbsOffset = aabbs.size(); + const size_t trackedBLASesOffset = trackedBLASes.size(); + if constexpr (IsTLAS) { - bool success = true; -// TODO: make sure the overflow submit work callback is doing some CPU work + const auto instances = canonical->getInstances(); + const auto instanceCount = static_cast(instances.size()); + // stream the instance/geometry input in { struct FillInstances : IUtilities::IUpstreamingDataProducer { @@ -4620,35 +4899,39 @@ if (worstSize>minScratchSize) assert(offsetInRange%16==0); uint32_t bytesWritten = 0; - while (true) + while (instanceIndex=blockSize) - return bytesWritten; - auto found = blasBuildMap->find(instance.getBase().blas.get()); - assert(found!=blasBuildMap->end()); - const auto& blas = found->second.gpuBLAS; - dst = IGPUTopLevelAccelerationStructure::writeInstance(dst,instance,blas.get()->getReferenceForDeviceOperations()); - dedupBLASesUsed->emplace(blas); - if (--found->second.remainingUsages == 0) - blasBuildMap->erase(found); + if (newWritten>blockSize) + break; + auto found = instanceMap->find(instance.getBase().blas.get()); + auto blas = found->second.get(); + if (auto found=compactedBLASMap->find(blas); found!=compactedBLASMap->end()) + blas = found->second.get(); + trackedBLASes->emplace_back(blas); + dst = IGPUTopLevelAccelerationStructure::writeInstance(dst,instance,blas->getReferenceForDeviceOperations()); bytesWritten = newWritten; } + return bytesWritten; } - SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap; - core::unordered_set>* dedupBLASesUsed; + const compacted_blas_map_t* compactedBLASMap; + core::vector>* trackedBLASes; + SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t* instanceMap; std::span instances; uint32_t instanceIndex = 0; }; FillInstances fillInstances; - fillInstances.blasBuildMap = &reservations.m_blasBuildMap; - fillInstances.dedupBLASesUsed = &dedupBLASesUsed; + fillInstances.compactedBLASMap = &compactedBLASMap; 
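// [Editor's illustrative sketch — not part of this patch] The FillInstances producer above appears to
// follow IUtilities::IUpstreamingDataProducer's chunked-upload contract: the utility repeatedly hands
// the producer a mapped block of the scratch buffer, the producer writes as many whole items as fit,
// returns the byte count, and resumes from its saved index on the next block. A stand-alone analogue
// of that contract, using only the C++ standard library and hypothetical names (not Nabla API), could
// look like the following; it is meant as a reading aid, not as lines to splice into this function:
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

struct ChunkedItemProducer
{
    // Writes whole items into `dst` until `blockSize` would be exceeded and returns the bytes written;
    // `nextItem` persists across calls so the next mapped block continues where this one stopped.
    uint32_t operator()(void* dst, uint32_t blockSize)
    {
        uint32_t written = 0;
        auto* out = static_cast<unsigned char*>(dst);
        while (nextItem < items.size())
        {
            constexpr uint32_t itemSize = sizeof(uint64_t);
            if (written + itemSize > blockSize)
                break; // not enough room left, resume on the next block
            std::memcpy(out + written, &items[nextItem], itemSize);
            written += itemSize;
            ++nextItem;
        }
        return written;
    }

    std::vector<uint64_t> items;
    std::size_t nextItem = 0;
};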
+ fillInstances.trackedBLASes = &trackedBLASes; + fillInstances.instanceMap = &asToBuild.second.instanceMap; fillInstances.instances = instances; success = streamDataToScratch(offsets[1],sizes[1],fillInstances); + // provoke refcounting bugs right away + asToBuild.second.instanceMap.clear(); } if (success && as->usesMotion()) { @@ -4678,43 +4961,130 @@ if (worstSize>minScratchSize) fillInstancePointers.instanceAddress = scratchBuffer->getDeviceAddress()+offsets[1]; success = streamDataToScratch(offsets[2],sizes[2],fillInstancePointers); } - // current recording buffer may have changed - xferCmdBuf = params.transfer->getCommandBufferForRecording(); - if (!success) + // + buildInfo.instanceDataTypeEncodedInPointersLSB = as->usesMotion(); + // note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones + buildInfo.instanceData = {.offset=offsets[as->usesMotion() ? 2:1],.buffer=smart_refctd_ptr(scratchBuffer)}; + // be based cause vectors can grow + using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**; + buildInfo.trackedBLASes = {reinterpret_cast(trackedBLASesOffset),trackedBLASes.size()-trackedBLASesOffset}; + // no special extra byte offset into the instance buffer + rangeInfos.emplace_back(instanceCount,0u); + } + else + { + buildInfo.geometryCount = canonical->getGeometryCount(); + const auto* offsetIt = offsets+1; + const auto* sizeIt = sizes+1; + const auto primitiveCounts = canonical->getGeometryPrimitiveCounts(); + for (const auto count : primitiveCounts) + geometryRangeInfo.push_back({ + .primitiveCount = count, + .primitiveByteOffset = 0, + .firstVertex = 0, + .transformByteOffset = 0 + }); + const uint32_t* pPrimitiveCounts = primitiveCounts.data(); + IUtilities::CMemcpyUpstreamingDataProducer memcpyCallback; + if (buildFlags.hasFlags(GeometryIsAABBFlag)) { - markFailureInStaging("Uploading Instance Data for TLAS build failed",tlasToBuild.canonical,as,pFoundHash); - continue; + for (const auto& geom : canonical->getAABBGeometries()) + if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) + { + auto offset = *(offsetIt++); + memcpyCallback.data = reinterpret_cast(geom.data.buffer->getPointer())+geom.data.offset; + if (!streamDataToScratch(offset,*(sizeIt++),memcpyCallback)) + break; + aabbs.push_back({ + .data = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}, + .stride = geom.stride, + .geometryFlags = geom.geometryFlags + }); + } + buildInfo.aabbs = reinterpret_cast* const&>(aabbsOffset); + } + else + { + for (const auto& geom : canonical->getTriangleGeometries()) + if (const auto triCount=*(pPrimitiveCounts++); triCount) + { + auto& outGeom = triangles.emplace_back(); + const auto origSize = *(sizeIt++); + const auto origOffset = *(offsetIt++); + auto offset = origOffset; + auto size = geom.vertexStride*(geom.maxVertex+1); + for (auto i=0; i<2; i++) + if (geom.vertexData[i]) // could assert that it must be true for i==0 + { + outGeom.vertexData[i] = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + memcpyCallback.data = reinterpret_cast(geom.vertexData[i].buffer->getPointer())+geom.vertexData[i].offset; + if (!streamDataToScratch(offset,size,memcpyCallback)) + break; + offset += size; + } + if (geom.hasTransform()) + { + offset = core::alignUp(offset,alignof(float)); + outGeom.transform = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + memcpyCallback.data = &geom.transform; + if 
(!streamDataToScratch(offset,sizeof(geom.transform),memcpyCallback)) + break; + offset += sizeof(geom.transform); + } + switch (geom.indexType) + { + case E_INDEX_TYPE::EIT_16BIT: [[fallthrough]]; + case E_INDEX_TYPE::EIT_32BIT: + { + const auto alignment = geom.indexType==E_INDEX_TYPE::EIT_16BIT ? alignof(uint16_t):alignof(uint32_t); + offset = core::alignUp(offset,alignment); + outGeom.indexData = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + size = triCount*3*alignment; + memcpyCallback.data = reinterpret_cast(geom.indexData.buffer->getPointer())+geom.indexData.offset; + success = streamDataToScratch(offset,size,memcpyCallback); + offset += size; + break; + } + default: + break; + } + assert(offset-origOffset<=origSize); + if (!success) + break; + outGeom.maxVertex = geom.maxVertex; + outGeom.vertexStride = geom.vertexStride; + outGeom.vertexFormat = geom.vertexFormat; + outGeom.indexType = geom.indexType; + outGeom.geometryFlags = geom.geometryFlags; + } + buildInfo.triangles = reinterpret_cast* const&>(trianglesOffset); } - // let go of canonical asset (may free RAM) - tlasToBuild.canonical = nullptr; + success = pPrimitiveCounts==primitiveCounts.data()+primitiveCounts.size(); + rangeInfos.push_back(reinterpret_cast(geometryRangeInfoOffset)); } - // prepare build infos - auto& buildInfo = buildInfos.emplace_back(); - buildInfo.scratch = {.offset=offsets[0],.buffer=smart_refctd_ptr(scratchBuffer)}; - buildInfo.buildFlags = tlasToBuild.getBuildFlags(); - buildInfo.instanceDataTypeEncodedInPointersLSB = as->usesMotion(); - buildInfo.dstAS = as; - // note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones - buildInfo.instanceData = {.offset=offsets[as->usesMotion() ? 
2:1],.buffer=smart_refctd_ptr(scratchBuffer)}; - // be based cause vectors can grow + if (!success) { - const auto offset = trackedBLASes.size(); - using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**; - buildInfo.trackedBLASes = {reinterpret_cast(offset),dedupBLASesUsed.size()}; - for (auto& blas : dedupBLASesUsed) - trackedBLASes.emplace_back(std::move(blas)); - + rangeInfos.resize(buildInfos.size()); + geometryRangeInfo.resize(geometryRangeInfoOffset); + triangles.resize(trianglesOffset); + aabbs.resize(aabbsOffset); + trackedBLASes.resize(trackedBLASesOffset); + markFailure("Uploading Input Data for Accleration Structure build failed",&canonical,pFound); + continue; } - // no special extra byte offset into the instance buffer - rangeInfos.emplace_back(instanceCount,0u); + buildInfos.emplace_back(std::move(buildInfo)); + allocCount = 0; + // let go of canonical asset (may free RAM) + canonical = nullptr; // - const bool willCompact = tlasToBuild.compact(); + const bool willCompact = asToBuild.second.compact; if (willCompact) compactions.push_back(as); // enqueue ownership release if necessary if (finalOwnerQueueFamily!=IQueue::FamilyIgnored) { - compactedOwnershipReleaseIndices.push_back(ownershipTransfers.size()); + if (willCompact) + compactedOwnershipReleaseIndices.push_back(ownershipTransfers.size()); ownershipTransfers.push_back({ .barrier = { .dep = { @@ -4728,136 +5098,186 @@ if (worstSize>minScratchSize) .range = backingRange }); } - else + else if (willCompact) compactedOwnershipReleaseIndices.push_back(~0u); } - reservations.m_blasBuildMap.clear(); // finish the last batch recordBuildCommands(); + computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build Acceleration Structures END"); + computeCmdBuf->cmdbuf->endDebugMarker(); + // provoke refcounting bugs + asesToBuild.clear(); + // flush all ranged before potential submit if (!flushRanges.empty()) { device->flushMappedMemoryRanges(flushRanges); flushRanges.clear(); } - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact TLASes END"); - computeCmdBuf->cmdbuf->endDebugMarker(); } - tlasesToBuild.clear(); - // compact - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact TLASes START"); - computeCmdBuf->cmdbuf->endDebugMarker(); - // compact needs to wait for Build then record queries + + // Not messing around with listing AS backing buffers individually, ergonomics of that are null + const asset::SMemoryBarrier readASInASCompactBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, + .srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT, + // TODO: do queries or query retrieval have a stage? 
+ .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT, + .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT + }; if (!compactions.empty() && pipelineBarrier(computeCmdBuf,{.memBarriers={&readASInASCompactBarrier,1}},"Failed to sync Acceleration Structure builds with compactions!") && computeCmdBuf->cmdbuf->resetQueryPool(queryPool.get(),0,compactions.size()) && computeCmdBuf->cmdbuf->writeAccelerationStructureProperties(compactions,IQueryPool::TYPE::ACCELERATION_STRUCTURE_COMPACTED_SIZE,queryPool.get(),0) ) { - // submit cause host needs to read the queries - drainCompute(); + // clean AS builds, pipeline barrier, query reset and writes need to get executed before we start waiting on the results + drainBoth(); // get queries core::vector sizes(compactions.size()); - if (device->getQueryPoolResults( - queryPool.get(),0,compactions.size(),sizes.data(),sizeof(size_t), - bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT)|IQueryPool::RESULTS_FLAGS::_64_BIT - )) + if (!device->getQueryPoolResults(queryPool.get(),0,compactions.size(),sizes.data(),sizeof(size_t),bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT)|IQueryPool::RESULTS_FLAGS::_64_BIT)) { - auto logFail = [logger](const char* msg, const IGPUAccelerationStructure* as)->void - { - logger.log("Failed to %s for \"%s\"", system::ILogger::ELL_ERROR,as->getObjectDebugName()); - }; - // TODO: normally we'd iteratively record as many compactions as we can, but we don't have a mechanism to release already compacted TLASes, we'd need to defer the writing of the TLAS to the Descriptor Set till later - // create and allocate backing buffers for compacted TLASes - core::vector> backingBuffers(compactions.size()); + logger.log("Failed to Query %sLevelAccelerationStructure compacted sizes, skipping compaction!",system::ILogger::ELL_ERROR,IsTLAS ? "Top":"Bottom"); + return {}; + } + // + auto logFail = [logger](const char* msg, const IGPUAccelerationStructure* as)->void + { + logger.log("Failed to %s for \"%s\"",system::ILogger::ELL_ERROR,msg,as->getObjectDebugName()); + }; + // try to allocate memory for + core::vector> backingBuffers(compactions.size()); + { + MetaDeviceMemoryAllocator deferredAllocator(params.compactedASAllocator ? 
params.compactedASAllocator:device,logger); + // create + for (size_t i=0; i(compactions[i]); + assert(as); + // silently skip if not worth it + if (!params.confirmCompact(sizes[i],as)) + { + logger.log("Compaction not confirmed for \"%s\" would be compacted size is %d, original %d.",system::ILogger::ELL_DEBUG,as->getObjectDebugName(),sizes[i],as->getCreationParams().bufferRange.size); + continue; + } + // create backing buffer and request an allocation for it { - const auto* as = static_cast(compactions[i]); - assert(as); - // silently skip if not worth it - if (!params.confirmCompact(sizes[i],as)) + const auto* oldBuffer = as->getCreationParams().bufferRange.buffer.get(); + assert(oldBuffer); + // This is a Spec limit/rpomise we don't even expose it + constexpr size_t MinASBufferAlignment = 256u; + using usage_f = IGPUBuffer::E_USAGE_FLAGS; + IGPUBuffer::SCreationParams creationParams = { {.size=core::roundUp(sizes[i],MinASBufferAlignment),.usage=usage_f::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT|usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT},{}}; + // same sharing setup as the previous AS buffer + creationParams.queueFamilyIndexCount = oldBuffer->getCachedCreationParams().queueFamilyIndexCount; + creationParams.queueFamilyIndices = oldBuffer->getCachedCreationParams().queueFamilyIndices; + auto buf = device->createBuffer(std::move(creationParams)); + if (!buf) + { + logFail("create Buffer backing the Compacted Acceleration Structure",as); continue; - smart_refctd_ptr buff; + } + auto bufReqs = buf->getMemoryReqs(); + backingBuffers[i].value = std::move(buf); + // allocate new memory - definitely don't want to be raytracing from across the PCIE slot + if (!deferredAllocator.request(backingBuffers.data()+i,physDev->getDeviceLocalMemoryTypeBits())) { - const auto* oldBuffer = as->getCreationParams().bufferRange.buffer.get(); - assert(oldBuffer); - // - constexpr size_t MinASBufferAlignment = 256u; - using usage_f = IGPUBuffer::E_USAGE_FLAGS; - IGPUBuffer::SCreationParams creationParams = { {.size=core::roundUp(sizes[i],MinASBufferAlignment),.usage = usage_f::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT|usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT},{}}; - creationParams.queueFamilyIndexCount = oldBuffer->getCachedCreationParams().queueFamilyIndexCount; - creationParams.queueFamilyIndices = oldBuffer->getCachedCreationParams().queueFamilyIndices; - auto buf = device->createBuffer(std::move(creationParams)); - if (!buf) - { - logFail("create Buffer backing the Compacted Acceleration Structure",as); - continue; - } - // allocate new memory - auto bufReqs = buff->getMemoryReqs(); - // definitely don't want to be raytracing from across the PCIE slot - if (!deferredAllocator.request(backingBuffers.data()+i,physDev->getDeviceLocalMemoryTypeBits())) - { - logFail("request of a Memory Allocation for the Buffer backing the Compacted Acceleration Structure",as); - continue; - } - backingBuffers[i].value = std::move(buf); + logFail("request of a Memory Allocation for the Buffer backing the Compacted Acceleration Structure",as); + continue; } } - // allocate memory for the buffers - deferredAllocator.finalize(); } + // allocate memory for the buffers + deferredAllocator.finalize(); + unordered_map> retval; + retval.reserve(compactions.size()); // recreate Acceleration Structures for (size_t i=0; i(compactions[i]); + const auto* srcAS = static_cast(compactions[i]); auto& backingBuffer = backingBuffers[i].value; if (!backingBuffer->getBoundMemory().isValid()) { - logFail("allocate Memory for the Buffer backing the Compacted 
Acceleration Structure",as); - continue; // reason to end a batch, see the TODO above + logFail("allocate Memory for the Buffer backing the Compacted Acceleration Structure",srcAS); + continue; + } + smart_refctd_ptr compactedAS; + { + typename AccelerationStructure::SCreationParams creationParams = {srcAS->getCreationParams()}; + creationParams.bufferRange = {.offset=0,.size=sizes[i],.buffer=std::move(backingBuffer)}; + if constexpr (IsTLAS) + { + creationParams.maxInstanceCount = srcAS->getMaxInstanceCount(); + compactedAS = device->createTopLevelAccelerationStructure(std::move(creationParams)); + } + else + compactedAS = device->createBottomLevelAccelerationStructure(std::move(creationParams)); } - IGPUTopLevelAccelerationStructure::SCreationParams creationParams = {as->getCreationParams()}; - creationParams.bufferRange = {.offset=0,.size=sizes[i],.buffer=std::move(backingBuffer)}; - creationParams.maxInstanceCount = as->getMaxInstanceCount(); - auto compactedAS = device->createTopLevelAccelerationStructure(std::move(creationParams)); if (!compactedAS) { - logFail("create the Compacted Acceleration Structure",as); + logFail("create the Compacted Acceleration Structure",srcAS); continue; } // set the debug name { - std::string debugName = as->getObjectDebugName(); + std::string debugName = srcAS->getObjectDebugName(); debugName += " compacted"; compactedAS->setObjectDebugName(debugName.c_str()); } // record compaction - if (!computeCmdBuf->cmdbuf->copyAccelerationStructure({.src=as,.dst=compactedAS.get(),.mode=IGPUAccelerationStructure::COPY_MODE::COMPACT})) + if (!computeCmdBuf->cmdbuf->copyAccelerationStructure({.src=srcAS,.dst=compactedAS.get(),.compact=true})) { logFail("record Acceleration Structure compaction",compactedAS.get()); continue; } - // modify the ownership release + // modify the ownership release to be for the final compacted AS if (const auto ix=compactedOwnershipReleaseIndices[i]; ixgetCreationParams().bufferRange; // swap out the conversion result - const auto foundIx = outputReverseMap.find(as); + const auto foundIx = outputReverseMap.find(srcAS); if (foundIx!=outputReverseMap.end()) { - auto& resultOutput = std::get>(reservations.m_gpuObjects); + auto& resultOutput = std::get>(reservations.m_gpuObjects); resultOutput[foundIx->second].value = compactedAS; } + // overwrite staging cache + auto pFound = findInStaging.template operator()(srcAS); + pFound->second.gpuRef = compactedAS; // insert into compaction map - compactedTLASMap[as] = std::move(compactedAS); + retval[srcAS] = std::move(compactedAS); } + return retval; } + computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact Acceleration Structures START"); + computeCmdBuf->cmdbuf->endDebugMarker(); + computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact Acceleration Structures END"); + computeCmdBuf->cmdbuf->endDebugMarker(); + } + return {}; + }; + + // compacted BLASes need to be substituted in cache and TLAS Build Inputs + compactedBLASMap = buildAndCompactASes.template operator()(blasesToBuild); + // Device TLAS builds + if (tlasCount) + { + // either we built no BLASes (remember we could retrieve already built ones from cache) + if (blasCount) + { + // Or we barrier for the previous compactions or builds (a single pipeline barrier to ensure BLASes build before TLASes is needed) + const asset::SMemoryBarrier readBLASInTLASBuildBarrier = { + // the last use of the source BLAS could have been a build or a compaction + .srcStageMask = 
PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT, + .srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT, + .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, + .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT + }; + // submit because we want to launch BLAS builds in a separate submit, so the scratch semaphore can signal and free the scratch and more is available for TLAS builds + if (pipelineBarrier(computeCmdBuf,{.memBarriers={&readBLASInTLASBuildBarrier,1}},"Failed to sync BLAS with TLAS build!")) + drainBoth(); + else + failedBLASBarrier = true; } - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact TLASes END"); - computeCmdBuf->cmdbuf->endDebugMarker(); + compactedTLASMap = buildAndCompactASes.template operator()(tlasesToBuild); } // release ownership @@ -4880,7 +5300,7 @@ if (worstSize>minScratchSize) retval.set({params.transfer->scratchSemaphore.semaphore,params.transfer->scratchSemaphore.value}); } // reset original callback - params.transfer->overflowCallback = origXferStallCallback; + params.transfer->overflowCallback = std::move(origXferStallCallback); // Its too dangerous to leave an Intended Transfer Submit hanging around that needs to be submitted for Compute to make forward progress outside of this utility, // and doing transfer-signals-after-compute-wait timeline sema tricks are not and option because: @@ -4898,187 +5318,147 @@ if (worstSize>minScratchSize) } } + // finish host tasks if not done yet + hostUploadBuffers([]()->bool{return true;}); + // in the future we'll also finish host image copies - // Descriptor Sets need their TLAS descriptors substituted if they've been compacted - // want to check if deps successfully exist - auto missingDependent = [&reservations](const typename asset_traits::video_t* dep)->bool - { - auto& stagingCache = std::get>(reservations.m_stagingCaches); - auto found = stagingCache.find(const_cast::video_t*>(dep)); - // this only checks if whether we had to convert and failed - if (found!=stagingCache.end() && found->second.value==CHashCache::NoContentHash) - return true; - // but the dependent might be in readCache of one or more converters, so if in doubt assume its okay - return false; - }; - // insert items into cache if overflows handled fine and commandbuffers ready to be recorded - auto mergeCache = [&]()->void + // check dependents before inserting into cache + if (reservations.m_queueFlags.value!=IQueue::FAMILY_FLAGS::NONE) { - auto& stagingCache = std::get>(reservations.m_stagingCaches); - auto& cache = std::get>(m_caches); - cache.m_forwardMap.reserve(cache.m_forwardMap.size()+stagingCache.size()); - cache.m_reverseMap.reserve(cache.m_reverseMap.size()+stagingCache.size()); - constexpr bool IsTLAS = std::is_same_v; - for (auto& item : stagingCache) - if (item.second.value!=CHashCache::NoContentHash) // didn't get wiped + auto checkDependents = [&]()->void { - // rescan all the GPU objects and find out if they depend on anything that failed, if so add to failure set - bool depsMissing = false; - // only go over types we could actually break via missing upload/build (i.e. 
pipelines are unbreakable) - if constexpr (IsTLAS) - { - // A built TLAS cannot be queried about the BLASes it contains, so just trust the pre-TLAS-build input validation did its job - } - - if constexpr (std::is_same_v) - depsMissing = missingDependent.template operator()(item.first->getUnderlyingBuffer()); - if constexpr (std::is_same_v) - depsMissing = missingDependent.template operator()(item.first->getCreationParameters().image.get()); - if constexpr (std::is_same_v) - { - const IGPUDescriptorSetLayout* layout = item.first->getLayout(); - // check samplers - { - const auto count = layout->getTotalMutableCombinedSamplerCount(); - const auto* samplers = item.first->getAllMutableCombinedSamplers(); - for (auto i=0u; !depsMissing && i(samplers[i].get()); - } - for (auto i=0u; !depsMissing && i(asset::IDescriptor::E_TYPE::ET_COUNT); i++) + auto& stagingCache = std::get>(reservations.m_stagingCaches); + phmap::erase_if(stagingCache,[&](auto& item)->bool { - const auto type = static_cast(i); - const auto count = layout->getTotalDescriptorCount(type); - auto* psDescriptors = item.first->getAllDescriptors(type); - if (!psDescriptors) - continue; - for (auto i=0u; !depsMissing && i) + depsMissing = missingDependent.template operator()(pGpuObj->getUnderlyingBuffer()); + if constexpr (std::is_same_v) + depsMissing = missingDependent.template operator()(pGpuObj->getCreationParameters().image.get()); + if constexpr (std::is_same_v) { - auto* untypedDesc = psDescriptors[i].get(); - if (untypedDesc) - switch (asset::IDescriptor::GetTypeCategory(type)) + const IGPUDescriptorSetLayout* layout = pGpuObj->getLayout(); + // check samplers { - case asset::IDescriptor::EC_BUFFER: - depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); - break; - case asset::IDescriptor::EC_SAMPLER: - depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); - break; - case asset::IDescriptor::EC_IMAGE: - depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); - break; - case asset::IDescriptor::EC_BUFFER_VIEW: - depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); - break; - case asset::IDescriptor::EC_ACCELERATION_STRUCTURE: + const auto count = layout->getTotalMutableCombinedSamplerCount(); + const auto* samplers = pGpuObj->getAllMutableCombinedSamplers(); + for (auto i=0u; !depsMissing && i(samplers[i].get()); + } + for (auto i=0u; !depsMissing && i(asset::IDescriptor::E_TYPE::ET_COUNT); i++) + { + const auto type = static_cast(i); + const auto count = layout->getTotalDescriptorCount(type); + auto* psDescriptors = pGpuObj->getAllDescriptors(type); + if (!psDescriptors) + continue; + for (auto i=0u; !depsMissing && i(untypedDesc); - // successfully written a TLAS into the binding, nothing to check - if (tlas) - break; - // we have a null TLAS in the binding, and we have to check if we were supposed to have one in it - using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; - const redirect_t& redirect = layout->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); - const auto bindingRange = redirect.findBindingStorageIndex(redirect_t::storage_offset_t(i)); - const auto firstElementOffset = redirect.getStorageOffset(bindingRange).data; - const auto foundWrite = reservations.m_deferredTLASDescriptorWrites.find({ - .dstSet = item.first, - .binding = redirect.getBinding(bindingRange).data, - .arrayElement = i-firstElementOffset - }); - // was scheduled to write some TLAS to this binding, but TLAS is now null - 
depsMissing = foundWrite!=reservations.m_deferredTLASDescriptorWrites.end() && !foundWrite->tlas; - break; + auto* untypedDesc = psDescriptors[i].get(); + if (untypedDesc) + switch (asset::IDescriptor::GetTypeCategory(type)) + { + case asset::IDescriptor::EC_BUFFER: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + case asset::IDescriptor::EC_SAMPLER: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + case asset::IDescriptor::EC_IMAGE: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + case asset::IDescriptor::EC_BUFFER_VIEW: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + case asset::IDescriptor::EC_ACCELERATION_STRUCTURE: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + default: + assert(false); + depsMissing = true; + break; + } } - default: - assert(false); - depsMissing = true; - break; } } + if (depsMissing) + { + smart_refctd_ptr dummy; + // I know what I'm doing (breaking the promise of the `erase_if` to not mutate the inputs) + markFailure("because conversion of a dependant failed!",&dummy,&item.second); + } + return depsMissing; } - } - auto* pGpuObj = item.first; - if (depsMissing) - { - const auto* hashAsU64 = reinterpret_cast(item.second.value.data); - logger.log("GPU Obj %s not writing to final cache because conversion of a dependant failed!", system::ILogger::ELL_ERROR, getLoggingLabel(*pGpuObj)); - // wipe self, to let users know - item.second.value = {}; - continue; - } - // The BLASes don't need to do this, because no-one checks for them as dependents and we can substitute the `item.first` in the staging cache right away - // For TLASes we need to write the compacted TLAS and not the intermediate build to the Cache - if constexpr (IsTLAS) + ); + }; + // Bottom up, only go over types we could actually break via missing upload/build (i.e. 
pipelines are unbreakable) + // A built TLAS cannot be queried about the BLASes it contains, so just trust the pre-TLAS-build input validation did its job + checkDependents.template operator()(); + checkDependents.template operator()(); + checkDependents.template operator()(); +// mergeCache.template operator()(); + // overwrite the compacted TLASes in Descriptor Sets + if (auto& tlasRewriteSet=reservations.m_potentialTLASRewrites; !tlasRewriteSet.empty()) + { + core::vector writes; + writes.reserve(tlasRewriteSet.size()); + core::vector infos(tlasRewriteSet.size()); + auto* pInfo = infos.data(); + for (auto& entry : tlasRewriteSet) { - auto found = compactedTLASMap.find(pGpuObj); - if (found!=compactedTLASMap.end()) - pGpuObj = found->second.get(); - + auto* const dstSet = entry.dstSet; + // we need to check if the descriptor set itself didn't get deleted in the meantime + if (missingDependent.template operator()(dstSet)) + continue; + // rewtrieve the binding from the TLAS + const auto* const tlas = static_cast(dstSet->getAllDescriptors(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE)[entry.storageOffset.data].get()); + assert(tlas); + // only rewrite if successfully compacted + if (const auto foundCompacted=compactedTLASMap.find(tlas); foundCompacted!=compactedTLASMap.end()) + { + pInfo->desc = foundCompacted->second; + using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; + const redirect_t& redirect = dstSet->getLayout()->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); + const auto bindingRange = redirect.findBindingStorageIndex(entry.storageOffset); + const auto firstElementOffset = redirect.getStorageOffset(bindingRange); + writes.push_back({ + .dstSet = dstSet, + .binding = redirect.getBinding(bindingRange).data, + .arrayElement = entry.storageOffset.data-firstElementOffset.data, + .count = 1, + .info = pInfo++ + }); + } } - // We have success now, but ask callback if we write to the new cache. - if (!params.writeCache(item.second)) // TODO: let the user know the pointer to the GPU Object too? 
- continue; - asset_cached_t cached; - cached.value = core::smart_refctd_ptr::video_t>(pGpuObj); - cache.m_reverseMap.emplace(pGpuObj,item.second); - cache.m_forwardMap.emplace(item.second,std::move(cached)); + // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) + if (!writes.empty() && !device->updateDescriptorSets(writes,{})) + logger.log("Failed to write one of the compacted TLASes into a Descriptor Set, all Descriptor Sets will still use non-compacted TLASes",system::ILogger::ELL_ERROR); } - }; - // again, need to go bottom up so we can check dependencies being successes - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - // write the TLASes into Descriptor Set finally - if (auto& tlasWriteMap=reservations.m_deferredTLASDescriptorWrites; !tlasWriteMap.empty()) + } + + // insert items into cache if overflows handled fine and commandbuffers ready to be recorded + core::for_each_in_tuple(reservations.m_stagingCaches,[&](SReserveResult::staging_cache_t& stagingCache)->void { - core::vector writes; - writes.reserve(tlasWriteMap.size()); - core::vector infos(writes.size()); - auto* pInfo = infos.data(); - for (auto& inWrite : tlasWriteMap) + auto& cache = std::get>(m_caches); + cache.m_forwardMap.reserve(cache.m_forwardMap.size()+stagingCache.size()); + cache.m_reverseMap.reserve(cache.m_reverseMap.size()+stagingCache.size()); + for (auto& item : stagingCache) + if (item.second.gpuRef) // not wiped { - // I know what I'm doing, this member has no influence on the set key hash - auto& tlas = const_cast&>(inWrite.tlas); - assert(tlas); - if (missingDependent.template operator()(tlas.get())) - { - tlas = nullptr; + // We have success now, but ask callback if we write to the new cache. + if (!params.writeCache(item.second.cacheKey)) // TODO: let the user know the pointer to the GPU Object too? 
continue; - } - if (const auto foundCompacted=compactedTLASMap.find(tlas.get()); foundCompacted!=compactedTLASMap.end()) - tlas = foundCompacted->second; - pInfo->desc = tlas; - writes.push_back({ - .dstSet = inWrite.dstSet, - .binding = inWrite.binding, - .arrayElement = inWrite.arrayElement, - .count = 1, - .info = pInfo++ - }); + asset_cached_t cached; + cached.value = std::move(item.second.gpuRef); + cache.m_reverseMap.emplace(item.first,item.second.cacheKey); + cache.m_forwardMap.emplace(item.second.cacheKey,std::move(cached)); } - // not strictly necessary, just provoking refcounting bugs right away if they exist - compactedTLASMap.clear(); - // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) - if (!writes.empty() && !device->updateDescriptorSets(writes,{})) - for (auto& inWrite : tlasWriteMap) - const_cast&>(inWrite.tlas) = nullptr; - } - mergeCache.template operator()(); - // needed for the IGPUDescriptorSets to check if TLAS exists/was written, can be released now - reservations.m_deferredTLASDescriptorWrites.clear(); -// mergeCache.template operator()(); + // provoke refcounting bugs ASAP + stagingCache.clear(); + }); // no submit was necessary, so should signal the extra semaphores from the host if (!retval.blocking()) diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index 4c3bbaa03c..924c337cbe 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -39,7 +39,7 @@ auto CComputeBlit::createAndCachePipelines(const SPipelinesCreateInfo& info) -> const auto sharedMemoryPerInvocation = core::max(singlePixelStorage*4,info.sharedMemoryPerInvocation); retval.sharedMemorySize = sharedMemoryPerInvocation*retval.workgroupSize; - const auto* layout = info.layout; + auto* layout = info.layout; // const auto common = [&]()->std::string @@ -66,7 +66,7 @@ struct ConstevalParameters }(); auto createPipeline = [&limits,layout,&common](const char* mainPath)->smart_refctd_ptr { - auto shader = make_smart_refctd_ptr( + auto shader = make_smart_refctd_ptr( (common+"\n#include \""+mainPath+"\"\n").c_str(), IShader::E_CONTENT_TYPE::ECT_HLSL, mainPath @@ -77,14 +77,16 @@ struct ConstevalParameters source->setContentHash(source->computeContentHash()); } - ICPUComputePipeline::SCreationParams params = {}; - params.layout = layout; - params.shader.entryPoint = "main"; - params.shader.shader = shader.get(); - params.shader.requiredSubgroupSize = static_cast(findMSB(limits.maxSubgroupSize)); - // needed for the prefix and reductions to work - params.shader.requireFullSubgroups = true; - return ICPUComputePipeline::create(params); + auto pipeline = ICPUComputePipeline::create(layout); + pipeline->getSpecInfo() = { + .shader = shader, + .entryPoint = "main", + .requiredSubgroupSize = static_cast(findMSB(limits.maxSubgroupSize)), + }; + pipeline->getCachedCreationParams() = { + .requireFullSubgroups = true, + }; + return pipeline; }; // create blit pipeline cpuPplns[0] = createPipeline("nbl/builtin/hlsl/blit/default_blit.comp.hlsl"); diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index 1582e9ecd6..bcdcbca531 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -6,8 +6,11 @@ set(GODBOLT_BINARY_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/compiler-explorer") set(GODBOLT_BINARY_PRETEST_DIRECTORY "${GODBOLT_BINARY_DIRECTORY}/pre-test") set(NBL_NSC_COMPILE_DIRECTORY "${GODBOLT_BINARY_PRETEST_DIRECTORY}/.compile/$") 
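# [Editor's illustrative sketch — not part of this patch] The CMakeLists changes below pre-create the
# preinstall and Docker context directories at configure time and later probe the Docker host (see the
# PROMOTE_PROCESS_ISOLATION function further down) to decide whether Windows containers can run with
# process isolation instead of Hyper-V. A minimal stand-alone version of that probe, with a
# hypothetical image tag, is sketched here purely for illustration:
find_program(DOCKER_EXE NAMES docker REQUIRED)
execute_process(
    COMMAND "${DOCKER_EXE}" run --rm --isolation process --entrypoint cmd example/base-image:tag /c exit 0
    RESULT_VARIABLE PROCESS_ISOLATION_PROBE
    OUTPUT_QUIET ERROR_QUIET
)
if(PROCESS_ISOLATION_PROBE EQUAL 0)
    message(STATUS "Process isolation available on this host")
else()
    message(STATUS "Process isolation unavailable, the host kernel is older than the base image")
endif()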
set(NBL_NSC_PREINSTALL_DIRECTORY "${GODBOLT_BINARY_PRETEST_DIRECTORY}/.preinstall") +make_directory("${NBL_NSC_PREINSTALL_DIRECTORY}") set(NBL_DOCKER_CT_NSC_VOLUME_SOURCE "${GODBOLT_BINARY_DIRECTORY}/install") +set(NBL_DOCKER_CTX_DIR "${GODBOLT_BINARY_DIRECTORY}/.ctx") +make_directory("${NBL_DOCKER_CTX_DIR}") set(NBL_DOCKER_INSTALL_BAT_FILENAME install-production.bat) set(NBL_DOCKER_CT_NSC_INSTALL_BAT "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}/${NBL_DOCKER_INSTALL_BAT_FILENAME}") @@ -56,424 +59,304 @@ add_test(NAME NBL_NSC_DUMP_BUILD_INFO_TEST if(NBL_ENABLE_DOCKER_INTEGRATION) -find_program(DOCKER_EXE - NAMES docker - REQUIRED -) - -find_program(SPIRV_DIS_EXE - NAMES spirv-dis - HINTS "$ENV{VULKAN_SDK_INSTALL_DIRECTORY}/Bin" - HINTS "$ENV{VK_SDK_PATH}/Bin" - HINTS "$ENV{VULKAN_SDK}/Bin" - REQUIRED -) +find_program(DOCKER_EXE NAMES docker REQUIRED) +set(BASE_IMAGE ghcr.io/devsh-graphics-programming/compiler-explorer-docker:nano-2022) +set(CORE_IMAGE mcr.microsoft.com/windows/servercore:ltsc2022) + +function(PROMOTE_PROCESS_ISOLATION BASE VAR) + set(${VAR} True) + + macro(INSPECT IMAGE) + execute_process(COMMAND "${DOCKER_EXE}" inspect --format={{.OsVersion}} ${IMAGE} + RESULT_VARIABLE INSPECTION_OK + OUTPUT_VARIABLE TARGET_KERNEL + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + endmacro() + + macro(TO_PROCESS IMAGE TARGET_KERNEL) + execute_process(COMMAND "${DOCKER_EXE}" run --rm --isolation process --entrypoint cmd ${BASE} /K + RESULT_VARIABLE PROCESS_ISOLATION_OK + OUTPUT_QUIET ERROR_QUIET + ) + + if(${PROCESS_ISOLATION_OK} EQUAL 0) + message(STATUS "Promoting \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation") + else() + set(${VAR} False) + message(STATUS "Cannot promote \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation, requires falling back to HyperV. Please update your docker host OS.") + endif() + endmacro() + + INSPECT(${BASE}) + + if(${INSPECTION_OK} EQUAL 0) + TO_PROCESS(${BASE} ${TARGET_KERNEL}) + else() + message(STATUS "\"${BASE}\" not found in local registry, pulling...") + execute_process(COMMAND "${DOCKER_EXE}" pull ${BASE}) + + INSPECT(${BASE}) + TO_PROCESS(${BASE} ${TARGET_KERNEL}) + endif() + + set(${VAR} ${${VAR}} PARENT_SCOPE) +endfunction() -cmake_path(GET Vulkan_INCLUDE_DIR PARENT_PATH VULKAN_SDK_INSTALL_DIRECTORY) -get_filename_component(VULKAN_SDK_VERSION "${VULKAN_SDK_INSTALL_DIRECTORY}" NAME) +PROMOTE_PROCESS_ISOLATION(${BASE_IMAGE} USE_PROCESS_ISOLATION) -if(NOT EXISTS "${VULKAN_SDK_INSTALL_DIRECTORY}") - message(FATAL_ERROR "Internal error, VULKAN_SDK_INSTALL_DIRECTORY doesn't exist") +if(NOT USE_PROCESS_ISOLATION) + # NOTE: we would need to use GET_RUNTIME_DEPENDENCIES which uses objdump + # https://cmake.org/cmake/help/latest/command/file.html#get-runtime-dependencies + # to collect *all* missing deps and copy (FROM at least server core) to destination nano + # image, it will fail currently if we fully isolate it with VM due to lack of certain DLLs + # BUT it means violating EULA, hence we are not going to support it, also (**) + message(FATAL_ERROR "HyperV is NOT supported! 
Update your OS!") endif() -find_program(CTEST_EXE - NAMES ctest - REQUIRED -) +function(GET_LABEL BASE_IMAGE LABEL VAR) + set(FORMAT "{{ index .Config.Labels \"${LABEL}\" }}") + execute_process(COMMAND ${DOCKER_EXE} inspect --format=${FORMAT} ${BASE_IMAGE} + OUTPUT_VARIABLE OUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_VARIABLE ERR + RESULT_VARIABLE RES + ) -set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}/hlsl.local.properties.cmake") + if(NOT RES EQUAL 0) + message(WARNING "Could not get \"${LABEL}\" label from \"${BASE_IMAGE}\" image, it doesn't exist!") + endif() -set(NBL_DOCKER_CT_NSC_VOLUME_TARGET "C:\\\\nsc\\\\install") -string(GENEX_STRIP "${NBL_PACKAGE_RUNTIME_EXE_DIR_PATH}" NBL_RELATIVE_ENTRY) -set(NSC_RELEASE_BUILD_INFO "${NBL_NSC_PREINSTALL_DIRECTORY}/${NBL_RELATIVE_ENTRY}/${NBL_NSC_BUILD_INFO_FILENAME}") -set(NSC_RELWITHDEBINFO_BUILD_INFO "${NBL_NSC_PREINSTALL_DIRECTORY}/relwithdebinfo/${NBL_RELATIVE_ENTRY}/${NBL_NSC_BUILD_INFO_FILENAME}") -set(NSC_DEBUG_BUILD_INFO "${NBL_NSC_PREINSTALL_DIRECTORY}/debug/${NBL_RELATIVE_ENTRY}/${NBL_NSC_BUILD_INFO_FILENAME}") -cmake_path(NATIVE_PATH NSC_RELEASE_BUILD_INFO NORMALIZE NSC_RELEASE_BUILD_INFO) -cmake_path(NATIVE_PATH NSC_RELWITHDEBINFO_BUILD_INFO NORMALIZE NSC_RELWITHDEBINFO_BUILD_INFO) -cmake_path(NATIVE_PATH NSC_DEBUG_BUILD_INFO NORMALIZE NSC_DEBUG_BUILD_INFO) - -set(NBL_INSTALL_DIRECTORY "${NBL_DOCKER_CT_NSC_VOLUME_TARGET}") -cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_TARGET NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_TARGET) - -set(NBL_BUILD_INFO_POSTPROCESS_COMMAND - "${CMAKE_COMMAND}" - "-DNBL_EXECUTABLE_PATH=${NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH}" - "-DNBL_BUILD_INFO=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" - "-DNBL_OUTPUT_FILE=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" - "-DNBL_OUTPUT_EXE_OVERRIDE=$" # as in CT, it's *not* host exe location! 
- -P "${NBL_ROOT_PATH}/cmake/scripts/nbl/nablaBuildInfo.cmake" -) + set(${VAR} "${OUT}" PARENT_SCOPE) +endfunction() -cmake_path(GET SPIRV_DIS_EXE PARENT_PATH VULKAN_SDK_BIN_DIRECTORY) -cmake_path(NATIVE_PATH VULKAN_SDK_BIN_DIRECTORY NORMALIZE VULKAN_SDK_BIN_DIRECTORY) -cmake_path(GET SPIRV_DIS_EXE FILENAME SPIRV_DIS_EXE) -set(CT_SPIRV_DIS_EXE "C:\\vulkan\\${VULKAN_SDK_VERSION}\\bin\\${SPIRV_DIS_EXE}") -cmake_path(NATIVE_PATH CT_SPIRV_DIS_EXE NORMALIZE CT_SPIRV_DIS_EXE) - -set(NBL_CE_GENERATE_CONFIG_COMMAND - "${CMAKE_COMMAND}" - "-DSPIRV_DIS_EXE=${CT_SPIRV_DIS_EXE}" - "-DNSC_RELEASE_BUILD_INFO=${NSC_RELEASE_BUILD_INFO}" - "-DNSC_RELWITHDEBINFO_BUILD_INFO=${NSC_RELWITHDEBINFO_BUILD_INFO}" - "-DNSC_DEBUG_BUILD_INFO=${NSC_DEBUG_BUILD_INFO}" - "-DOUTPUT_CONFIG_FILE=${NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT}" - -P "${CMAKE_CURRENT_SOURCE_DIR}/ce-generate-config.cmake" -) +GET_LABEL(${BASE_IMAGE} org.opencontainers.image.title ORG_LABEL_TITLE) +GET_LABEL(${BASE_IMAGE} org.opencontainers.image.source ORG_LABEL_SOURCE) +GET_LABEL(${BASE_IMAGE} org.opencontainers.image.description ORG_LABEL_DESCRIPTION) -set(NBL_DOCKER_CE_COMPOSE_BASE "${NBL_ROOT_PATH}/docker/compiler-explorer/compose.yml") -cmake_path(NATIVE_PATH NBL_DOCKER_CE_COMPOSE_BASE NORMALIZE NBL_DOCKER_CE_COMPOSE_BASE) -set(NBL_DOCKER_CE_COMPOSE_TARGET "${GODBOLT_BINARY_DIRECTORY}/.dev-compose.yml") +find_program(CTEST_EXE NAMES ctest REQUIRED) +find_file(DXIL_DLL NAMES dxil.dll HINTS "$ENV{CMAKE_WINDOWS_KITS_10_DIR}/Redist/D3D/x64" "C:/Program Files (x86)/Windows Kits/10/Redist/D3D/x64" REQUIRED) -include(InstallRequiredSystemLibraries) +set(ICU_GLOBALIZATION_DIR C:\\Windows\\Globalization\\ICU) +find_file(UCRTBASED_DLL NAMES ucrtbased.dll HINTS ${UCRTBASED_DLL_DIR} REQUIRED) -string(REPLACE "v" "VC" TARGET_DCRT ${CMAKE_VS_PLATFORM_TOOLSET}) -set(DEBUG_CRT_RELATIVE debug_nonredist/x64/Microsoft.${TARGET_DCRT}.DebugCRT) -set(DEBUG_CRT_DIRECTORY_SOURCE "${MSVC_REDIST_DIR}/${DEBUG_CRT_RELATIVE}") -cmake_path(NATIVE_PATH MSVC_REDIST_DIR NORMALIZE NBL_REDIST_DIR) +find_program(SPIRV_DIS_EXE NAMES spirv-dis HINTS "${VULKAN_SDK}/Bin" REQUIRED) +cmake_path(GET SPIRV_DIS_EXE PARENT_PATH SPIRV_DIS_DIR) +cmake_path(NATIVE_PATH SPIRV_DIS_DIR NORMALIZE SPIRV_DIS_DIR) -if(NOT EXISTS "${DEBUG_CRT_DIRECTORY_SOURCE}") - message(FATAL_ERROR "DEBUG_CRT_DIRECTORY_SOURCE = \"${DEBUG_CRT_DIRECTORY_SOURCE}\" doesn't exist!") +if(MSVC_REDIST_BASE) # fallback to our toolset + set(MSVC_REDIST_DIR "${MSVC_REDIST_BASE}") +else() + include(InstallRequiredSystemLibraries) + if(NOT MSVC_REDIST_DIR) + message(FATAL_ERROR "Could not find MSVC_REDIST_DIR, define yourself!") + endif() endif() -set(DEBUG_CRT_DIRECTORY_TARGET "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}/.nonredist") -file(MAKE_DIRECTORY "${DEBUG_CRT_DIRECTORY_TARGET}") -file(GLOB CRT_FILES "${DEBUG_CRT_DIRECTORY_SOURCE}/*") +cmake_path(NATIVE_PATH MSVC_REDIST_DIR NORMALIZE TOOLSET_REDIST_PATH) -find_file(UCRTBASED_DLL_PATH - NAMES ucrtbased.dll - REQUIRED +file(GLOB_RECURSE VC_MODULES LIST_DIRECTORIES false + "${TOOLSET_REDIST_PATH}/x64/*.CRT/*.dll" + "${TOOLSET_REDIST_PATH}/debug_nonredist/x64/*.DebugCRT/*.dll" ) -# TODO: (***) ---> THIS GOES TO /docker to CMakeLists.txt file! 
- -set(BASE_IMAGE mcr.microsoft.com/windows/servercore:ltsc2022-amd64) # NOTE: HARDCODED CURRENTLY - -string(TOLOWER "dr.devsh.eu/nabla/cmake-host-dev-env/${CMAKE_SYSTEM_NAME}/package/vulkan:latest" DOCKER_VULKAN_TAG) -string(TOLOWER "dr.devsh.eu/nabla/cmake-host-dev-env/${CMAKE_SYSTEM_NAME}/toolset/redist/${CMAKE_CXX_COMPILER_ID}/crt:latest" DOCKER_CRT_TAG) -string(TOLOWER "dr.devsh.eu/nabla/cmake-host-dev-env/${CMAKE_SYSTEM_NAME}/build/${CMAKE_CXX_COMPILER_ID}/devel-compiler-explorer-nsc:latest" DOCKER_DEVEL_TAG) - -cmake_path(NATIVE_PATH MSVC_REDIST_DIR NORMALIZE TOOLSET_REDIST_PATH) -get_filename_component(REDIST_CRT_TOOLSET_VERSION "${TOOLSET_REDIST_PATH}" NAME) +if(NOT VC_MODULES) + message(FATAL_ERROR "Failed to GLOB for VC Redist modules!") +endif() -function(GEN_DOCKER_CONTENT _CTX_ _OUTPUT_DIRECTORY_ _EXTRA_DOCKERFILE_CONTENT_ _DOCKER_IGNORE_CONTENT_ _S_NAME_ _CT_NAME_ _IMAGE_NAME_ _WITH_BUILD_) +make_directory("${NBL_DOCKER_CTX_DIR}/Runtimes") +make_directory("${NBL_DOCKER_CTX_DIR}/Nabla") +execute_process( + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${DXIL_DLL}" "${NBL_DOCKER_CTX_DIR}/Runtimes" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${UCRTBASED_DLL}" "${NBL_DOCKER_CTX_DIR}/Runtimes" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${SPIRV_DIS_EXE}" "${NBL_DOCKER_CTX_DIR}/Runtimes" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different ${VC_MODULES} "${NBL_DOCKER_CTX_DIR}/Runtimes" +) -set(_OUTPUT_D_PATH_ "${_OUTPUT_DIRECTORY_}/Dockerfile") -set(_OUTPUT_C_PATH_ "${_OUTPUT_DIRECTORY_}/compose.yml") +set(CT_RUNTIMES C:/runtimes) +cmake_path(NATIVE_PATH CT_RUNTIMES NORMALIZE CT_RUNTIMES) -string(CONFIGURE "${_EXTRA_DOCKERFILE_CONTENT_}" _EXTRA_DOCKERFILE_CONTENT_EVAL_ @ONLY) -string(CONFIGURE "${_DOCKER_IGNORE_CONTENT_}" _DOCKER_IGNORE_CONTENT_EVAL_ @ONLY) +set(NBL_DOCKER_CT_NSC_VOLUME_TARGET "${CT_RUNTIMES}/Nabla") +cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_SOURCE NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_SOURCE) +cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_TARGET NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_TARGET) +cmake_path(NATIVE_PATH NBL_NSC_PREINSTALL_DIRECTORY NORMALIZE NBL_NSC_PREINSTALL_DIRECTORY) -unset(DOCKER_CONTENT) -string(APPEND DOCKER_CONTENT -[=[ +string(CONFIGURE [=[ +# syntax=docker/dockerfile:1 # escape=` -ARG BASE_IMAGE=@BASE_IMAGE@ -FROM ${BASE_IMAGE} -SHELL ["cmd", "/S", "/C"] -@_EXTRA_DOCKERFILE_CONTENT_EVAL_@ -]=] -) +# ---------------- COMPRESS STEP ---------------- +FROM @BASE_IMAGE@ as compress -string(CONFIGURE "${DOCKER_CONTENT}" DOCKER_CONTENT @ONLY) -file(WRITE "${_OUTPUT_D_PATH_}" "${DOCKER_CONTENT}") +COPY --link Runtimes/ C:/pack/Windows/System32/ +COPY --link Nabla/ C:/pack/runtimes/Nabla/ -set(_CTX_TARGET_ "${_OUTPUT_DIRECTORY_}/.ctx") +ARG IMPL_COMPRESSION_OPTIONS=-T0 +ARG IMPL_COMPRESSION_LEVEL=3 -if("${_CTX_}" STREQUAL "") +WORKDIR C:\pack +RUN ` +tar -cf - Windows | zstd %IMPL_COMPRESSION_OPTIONS% -%IMPL_COMPRESSION_LEVEL% -o windows-artifacts.tar.zst && ` +tar -cf - runtimes | zstd %IMPL_COMPRESSION_OPTIONS% -%IMPL_COMPRESSION_LEVEL% -o nabla-artifacts.tar.zst -else() - if(NOT EXISTS "${_CTX_}") - message(FATAL_ERROR "Invalid source context directory doesn't exist! 
_CTX_: \"${_CTX_}\"") - endif() +# ---------------- FINAL IMAGE ---------------- +FROM @BASE_IMAGE@ - file(COPY "${_CTX_}" DESTINATION "${_CTX_TARGET_}") -endif() +COPY --link --from=compress ["C:/pack/windows-artifacts.tar.zst", "C:/pack/"] +COPY --link --from=compress ["C:/pack/nabla-artifacts.tar.zst", "C:/pack/"] +COPY hlsl.local.properties.cmake C:/Compiler-Explorer/etc/config/hlsl.local.properties -set(_OUTPUT_I_PATH_ "${_CTX_TARGET_}/.dockerignore") - -unset(COMPOSE_CONTENT) -string(APPEND COMPOSE_CONTENT -[=[ -services: - @_S_NAME_@: - build: - context: ./.ctx - dockerfile: "@_OUTPUT_D_PATH_@" - image: @_IMAGE_NAME_@ - container_name: @_CT_NAME_@ - networks: - docker_default: - -networks: - docker_default: - external: true -]=] -) +ENV NBL_INSTALL_DIRECTORY=@NBL_DOCKER_CT_NSC_VOLUME_TARGET@ ` +NBL_EXPLICIT_MODULE_LOAD_LOG=ON -string(CONFIGURE "${COMPOSE_CONTENT}" COMPOSE_CONTENT @ONLY) -file(WRITE "${_OUTPUT_C_PATH_}" "${COMPOSE_CONTENT}") -file(WRITE "${_OUTPUT_I_PATH_}" "${_DOCKER_IGNORE_CONTENT_EVAL_}") +WORKDIR C:/Compiler-Explorer +ENTRYPOINT [ ` + "C:\\unpack.bat", "&&", ` + "copy", "C:\\mount\\Windows\\System32\\icu.dll", "C:\\Windows\\System32\\icu.dll", "&&", ` + "node", "--no-warnings", "--no-deprecation", "--import=tsx", "./app.js", "--language", "hlsl" ` +] -if(_WITH_BUILD_) - execute_process(COMMAND "${DOCKER_EXE}" compose -f "${_OUTPUT_C_PATH_}" build) -endif() -endfunction() +LABEL org.opencontainers.image.title="[Nabla Shader Compiler (NSC)]: @ORG_LABEL_TITLE@" +LABEL org.opencontainers.image.source=https://github.com/Devsh-Graphics-Programming/Nabla +LABEL org.opencontainers.image.description="[Nabla Shader Compiler (NSC)]: @ORG_LABEL_DESCRIPTION@" -# Vulkan -set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/docker/vulkan") -set(CT_VULKAN_TARGET vulkan) -GEN_DOCKER_CONTENT("${VULKAN_SDK_INSTALL_DIRECTORY}" "${OUTPUT_DIRECTORY}" -[=[ -COPY ./ "@CT_VULKAN_TARGET@" - -ENV VULKAN_SDK="C:/@CT_VULKAN_TARGET@" -ENV VULKAN_SDK_VERSION="@VULKAN_SDK_VERSION@" -LABEL VULKAN_SDK="C:/@CT_VULKAN_TARGET@" -LABEL VULKAN_SDK_VERSION="@VULKAN_SDK_VERSION@" -]=] -[=[ -* -!@VULKAN_SDK_VERSION@/Bin/*.dll -!@VULKAN_SDK_VERSION@/Bin/*spirv*.exe -]=] -nabla-dev-env-vulkan -nabla.dev.env.vulkan -${DOCKER_VULKAN_TAG} -ON -) +]=] INSTRUCTIONS @ONLY) -# CRT -set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/docker/crt") -set(CT_TOOLSET_REDIST_TARGET toolset_redist) -make_directory("${OUTPUT_DIRECTORY}/.ctx") -file(COPY "${UCRTBASED_DLL_PATH}" DESTINATION "${OUTPUT_DIRECTORY}/.ctx") -GEN_DOCKER_CONTENT("${TOOLSET_REDIST_PATH}" "${OUTPUT_DIRECTORY}" -[=[ -COPY ./ "/@CT_TOOLSET_REDIST_TARGET@" - -ENV REDIST_CRT_TOOLSET_VERSION="@REDIST_CRT_TOOLSET_VERSION@" -ENV TOOLSET_REDIST_PATH="C:/@CT_TOOLSET_REDIST_TARGET@" -LABEL REDIST_CRT_TOOLSET_VERSION="@REDIST_CRT_TOOLSET_VERSION@" -LABEL TOOLSET_REDIST_PATH="C:/@CT_TOOLSET_REDIST_TARGET@" -]=] -[=[ -* -!ucrtbased.dll -!@REDIST_CRT_TOOLSET_VERSION@/vc_redist.x64.exe -!@REDIST_CRT_TOOLSET_VERSION@/@DEBUG_CRT_RELATIVE@/*.dll -]=] -nabla-dev-env-crt -nabla.dev.env.crt -${DOCKER_CRT_TAG} -ON -) +set(DOCKERFILE "${NBL_DOCKER_CTX_DIR}/Dockerfile") +file(WRITE "${DOCKERFILE}" "${INSTRUCTIONS}") -# Devel, combined -set(BASE_IMAGE dr.devsh.eu/compiler-explorer/windows) +if(DEFINED ENV{NSC_IMAGE_NAME}) + set(NSC_IMAGE_NAME "$ENV{NSC_IMAGE_NAME}") +else() + set(NSC_IMAGE_NAME nano/godbolt/nsc) +endif() -# NOTE to self: could be all done with single docker file & compose file but buildkit works bad with windows driver, yet need to wait for stuff to be 
implemented -set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/docker/devel") -set(CT_REDIST_DIR "${CT_TOOLSET_REDIST_TARGET}/${REDIST_CRT_TOOLSET_VERSION}") -set(CT_NONREDIST_CTR_DIR "${CT_REDIST_DIR}/${DEBUG_CRT_RELATIVE}") -cmake_path(NATIVE_PATH CT_REDIST_DIR NORMALIZE CT_REDIST_DIR) -cmake_path(NATIVE_PATH CT_NONREDIST_CTR_DIR NORMALIZE CT_NONREDIST_CTR_DIR) -set(DEVEL_DOCKERFILE "${OUTPUT_DIRECTORY}/Dockerfile") +set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CTX_DIR}/hlsl.local.properties.cmake") +string(GENEX_STRIP "${NBL_PACKAGE_RUNTIME_EXE_DIR_PATH}" NBL_RELATIVE_ENTRY) +set(OUTPUT_CONFIG_FILE $) + +set(ICU_DIR C:\\Windows\\Globalization\\ICU) +set(ICU_DLL C:\\Windows\\System32\\icu.dll) +if(NOT EXISTS ${ICU_DIR} OR NOT EXISTS ${ICU_DLL}) + # fallback for CI purposes, NOTE: we do NOT distribute those in final image as we have host runner requirements (**) + message(STATUS "\"${ICU_DIR}\" or \"${ICU_DLL}\ not found, fallback: copying them to the runner from \"${CORE_IMAGE}\"") + execute_process(COMMAND "${DOCKER_EXE}" rm -f nano-orphan RESULT_VARIABLE res) + execute_process(COMMAND "${DOCKER_EXE}" run -di --isolation process --name nano-orphan --entrypoint cmd ${CORE_IMAGE} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND "${DOCKER_EXE}" cp nano-orphan:${ICU_DIR} ${ICU_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND "${DOCKER_EXE}" cp nano-orphan:${ICU_DLL} ${ICU_DLL} COMMAND_ERROR_IS_FATAL ANY) + message(STATUS "Fallback completed, runner patched!") +endif() -GEN_DOCKER_CONTENT("" "${OUTPUT_DIRECTORY}" -[=[ +set(ORPHAN nsc-orphan) -COPY --from=@DOCKER_VULKAN_TAG@ /@CT_VULKAN_TARGET@ /@CT_VULKAN_TARGET@ -COPY --from=@DOCKER_CRT_TAG@ /@CT_TOOLSET_REDIST_TARGET@ /@CT_TOOLSET_REDIST_TARGET@ +if(NOT DEFINED NBL_CE_PUBLISH_PORT) + set(NBL_CE_PUBLISH_PORT 80) +endif() -RUN .\@CT_REDIST_DIR@\vc_redist.x64.exe /quiet /install -RUN xcopy .\@CT_NONREDIST_CTR_DIR@\*.dll %SystemRoot%\System32 /Y -RUN xcopy .\@CT_TOOLSET_REDIST_TARGET@\ucrtbased.dll %SystemRoot%\System32 /Y +if(NBL_DOCKER_DIND_BUILD) + set(NBL_CE_URL http://${ORPHAN}:${NBL_CE_PUBLISH_PORT}) +else() + set(NBL_CE_URL http://localhost:${NBL_CE_PUBLISH_PORT}) +endif() -]=] -[=[ +set(NBL_CE_HEALTHY_CHECK_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/ce_healthy_check.py") +set(NBL_CE_ENDPOINT_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/endpoint.py") +set(NBL_NSC_BASIC_HLSL_JPAYLOAD "${CMAKE_CURRENT_SOURCE_DIR}/docker/godbolt/hlsl-basic-compile-payload.json") -]=] -nabla-dev-env-nsc -nabla.dev.env.nsc -${DOCKER_DEVEL_TAG} -OFF +# to avoid "too long input" errors we proxy build instructions to CMake script and write it to build directory +string(CONFIGURE [=[ +message(STATUS "Killing remaining NSC orphans") +execute_process(COMMAND "@DOCKER_EXE@" + rm -f "@ORPHAN@" + RESULT_VARIABLE res ) -# <---(***) - -set(NABLA_DEV_ENV_CT_NAME dev.nabla.env.${CMAKE_SYSTEM_NAME}.${CMAKE_CXX_COMPILER_ID}.base) -string(TOLOWER "${NABLA_DEV_ENV_CT_NAME}" NABLA_DEV_ENV_CT_NAME) - -set(COMPOSE_NSC_DEV_SERVICE compiler-explorer-nsc-dev) -string(TOLOWER "dr.devsh.eu/nabla/cmake-host-dev-env/${CMAKE_SYSTEM_NAME}/build/${CMAKE_CXX_COMPILER_ID}/compiler-explorer-nsc:latest" COMPOSE_NSC_DEV_IMAGE) -string(TOLOWER "dr.devsh.eu/compiler-explorer/production/windows/nsc/orphan-production-test:latest" COMPOSE_NSC_ORPHAN_PRODUCTION_TEST_IMAGE) -string(TOLOWER "dr.devsh.eu/compiler-explorer/production/windows/nsc/orphan-prodution-cache:latest" COMPOSE_NSC_PRODUCTION_CACHE_IMAGE) -string(TOLOWER 
"dr.devsh.eu/compiler-explorer/production/windows/nsc:latest" COMPOSE_NSC_PRODUCTION_IMAGE) - -string(APPEND COMPOSE_CONTENT -[=[ -services: - @COMPOSE_NSC_DEV_SERVICE@: - container_name: dev.ce.nsc.dev - extends: - file: @NBL_DOCKER_CE_COMPOSE_BASE@ - service: compiler-explorer - build: - context: ./.ctx - dockerfile: @DEVEL_DOCKERFILE@ - image: @COMPOSE_NSC_DEV_IMAGE@ - environment: - NBL_INSTALL_DIRECTORY: "@NBL_INSTALL_DIRECTORY@" - NBL_EXPLICIT_MODULE_LOAD_LOG: "ON" - entrypoint: - - "cmd" - - "/c" - - > - copy C:\\nsc\\install\\hlsl.local.properties.cmake %GIT_GODBOLT_REPOSITORY_PATH%\\etc\\config\\hlsl.local.properties - && npm --prefix %GIT_GODBOLT_REPOSITORY_PATH% run dev -- --language hlsl - volumes: - - type: bind - source: .\install - target: @NBL_DOCKER_CT_NSC_VOLUME_TARGET@ - read_only: true - -networks: - docker_default: - external: true -]=] +message(STATUS "Executing CTests") +execute_process(COMMAND "@CTEST_EXE@" + -C "$" --stop-on-failure + WORKING_DIRECTORY "@CMAKE_CURRENT_BINARY_DIR@" + COMMAND_ERROR_IS_FATAL ANY ) -string(CONFIGURE "${COMPOSE_CONTENT}" COMPOSE_CONTENT @ONLY) -file(WRITE "${NBL_DOCKER_CE_COMPOSE_TARGET}" "${COMPOSE_CONTENT}") -make_directory("${GODBOLT_BINARY_DIRECTORY}/.ctx") - -execute_process(COMMAND "${DOCKER_EXE}" compose -f "${NBL_DOCKER_CE_COMPOSE_BASE}" build) -execute_process(COMMAND "${DOCKER_EXE}" compose -f "${NBL_DOCKER_CE_COMPOSE_TARGET}" build) - -string(APPEND BAT_PRODUCTION_INSTALL -[=[ -@echo off -setlocal - -set BASE_PATH=C:\ - -xcopy "%BASE_PATH%target" "%BASE_PATH%nsc\install" /s /e /h /i /y /f -if %ERRORLEVEL% neq 0 ( - echo [ERROR] Failed to copy C:\target to C:\nsc\install - exit /b %ERRORLEVEL% +message(STATUS "Generating NSC build info") +execute_process(COMMAND "@CMAKE_COMMAND@" + "-DNBL_EXECUTABLE_PATH=@NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH@" + "-DNBL_BUILD_INFO=@NBL_NSC_PREINSTALL_TARGET_BUILD_INFO@" + "-DNBL_OUTPUT_FILE=@NBL_NSC_PREINSTALL_TARGET_BUILD_INFO@" + "-DNBL_OUTPUT_EXE_OVERRIDE=$" + -P "@NBL_ROOT_PATH@/cmake/scripts/nbl/nablaBuildInfo.cmake" + COMMAND_ERROR_IS_FATAL ANY ) -if "%GIT_GODBOLT_REPOSITORY_PATH%"=="" ( - echo [ERROR] Environment variable GIT_GODBOLT_REPOSITORY_PATH is not set! - exit /b 1 +message(STATUS "Generating NSC godbolt config") +execute_process(COMMAND "@CMAKE_COMMAND@" + "-DSPIRV_DIS_EXE=spirv-dis.exe" + "-DNSC_RELEASE_BUILD_INFO=$" + "-DNSC_RELWITHDEBINFO_BUILD_INFO=$" + "-DNSC_DEBUG_BUILD_INFO=$" + "-DOUTPUT_CONFIG_FILE=@OUTPUT_CONFIG_FILE@" + -P "@CMAKE_CURRENT_SOURCE_DIR@/ce-generate-config.cmake" + COMMAND_ERROR_IS_FATAL ANY ) -copy "%BASE_PATH%nsc\install\hlsl.local.properties.cmake" "%GIT_GODBOLT_REPOSITORY_PATH%\etc\config\hlsl.local.properties" -if %ERRORLEVEL% neq 0 ( - echo [ERROR] Failed to copy HLSL properties file - exit /b %ERRORLEVEL% +message(STATUS "Updating NSC package context") +execute_process(COMMAND "@CMAKE_COMMAND@" -E copy_directory_if_different + "$" + "@NBL_DOCKER_CTX_DIR@/Nabla" + COMMAND_ERROR_IS_FATAL ANY ) -echo [SUCCESS] All production files copied successfully. 
-exit /b 0 -]=] +message(STATUS "Building NSC Godbolt image") +string(TIMESTAMP BUILD_TIMESTAMP "%Y-%m-%dT%H:%M:%SZ" UTC) +execute_process(COMMAND "@DOCKER_EXE@" build --isolation process + --label=org.opencontainers.image.created="${BUILD_TIMESTAMP}" + -f "@DOCKERFILE@" -t @NSC_IMAGE_NAME@ "@NBL_DOCKER_CTX_DIR@" + COMMAND_ERROR_IS_FATAL ANY ) -string(CONFIGURE "${BAT_PRODUCTION_INSTALL}" BAT_PRODUCTION_INSTALL @ONLY) -file(WRITE "${NBL_DOCKER_CT_NSC_INSTALL_BAT}" "${BAT_PRODUCTION_INSTALL}") - -set(NBL_CE_URL http://localhost:80) -set(NBL_CE_HEALTHY_CHECK_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/ce_healthy_check.py") -set(NBL_CE_ENDPOINT_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/endpoint.py") -set(NBL_NSC_BASIC_HLSL_JPAYLOAD "${CMAKE_CURRENT_SOURCE_DIR}/docker/godbolt/hlsl-basic-compile-payload.json") +message(STATUS "Running new NSC orphan container") +execute_process(COMMAND "@DOCKER_EXE@" run -di -p @NBL_CE_PUBLISH_PORT@:10240 --isolation process + --name "@ORPHAN@" --network docker_default + -v $ + -v $ + @NSC_IMAGE_NAME@ + COMMAND_ERROR_IS_FATAL ANY +) -add_custom_target(run-compiler-explorer - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Performing Pre-Test..." - COMMAND "${CTEST_EXE}" -C $ --stop-on-failure - COMMAND ${NBL_BUILD_INFO_POSTPROCESS_COMMAND} - COMMAND "${DOCKER_EXE}" compose -f "${NBL_DOCKER_CE_COMPOSE_TARGET}" stop ${COMPOSE_NSC_DEV_SERVICE} - COMMAND ${NBL_CE_GENERATE_CONFIG_COMMAND} - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "OK! Performing executables hot-swap..." - COMMAND "${CMAKE_COMMAND}" -E copy_directory "${NBL_NSC_PREINSTALL_DIRECTORY}" "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}" - COMMAND "${DOCKER_EXE}" compose -f "${NBL_DOCKER_CE_COMPOSE_TARGET}" up -d ${COMPOSE_NSC_DEV_SERVICE} - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Checking health of Compiler Explorer service..." - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 10 --ticks 25 - COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "Compiler Explorer is running, type \"localhost\" in your browser!" - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Post-Checking if NSC is able to compile basic shader file..." - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_$>_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" - COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "OK! NSC is healthy." 
- WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" - VERBATIM - USES_TERMINAL +message(STATUS "Healthy check") +execute_process(COMMAND "@_Python3_EXECUTABLE@" "@NBL_CE_HEALTHY_CHECK_PY@" + --url "@NBL_CE_URL@" --interval 5 --ticks 12 + COMMAND_ERROR_IS_FATAL ANY ) -add_custom_target(is-compiler-explorer-running - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --ticks 1 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compilers - VERBATIM - USES_TERMINAL +message(STATUS "Post Basic NSC shader compile check") +execute_process(COMMAND "@_Python3_EXECUTABLE@" "@NBL_CE_ENDPOINT_PY@" + --url "@NBL_CE_URL@" + --endpoint /api/compiler/nsc_$>_upstream/compile + --method POST --json "@NBL_NSC_BASIC_HLSL_JPAYLOAD@" + COMMAND_ERROR_IS_FATAL ANY ) -# Production NSC image -set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/docker/nsc-production") -set(BASE_IMAGE "${COMPOSE_NSC_ORPHAN_PRODUCTION_TEST_IMAGE}") -set(NBL_DOCKER_TMP_PRODUCTION_TARGET "C:\\target") -GEN_DOCKER_CONTENT("" "${OUTPUT_DIRECTORY}" -[=[ -LABEL maintainer="Arkadiusz Lachowicz " ` - org.opencontainers.image.authors="Arkadiusz Lachowicz " ` - org.opencontainers.image.title="Compiler Explorer with Nabla Shader Compilers in Docker" ` - org.opencontainers.image.description="Docker image to run Compiler Explorer instance with Nabla Shader Compilers" ` - org.opencontainers.image.url="https://github.com/Devsh-Graphics-Programming/Nabla" ` - org.opencontainers.image.source="https://github.com/Devsh-Graphics-Programming/Nabla" ` - org.opencontainers.image.documentation="https://github.com/Devsh-Graphics-Programming/Nabla/tree/master/tools/nsc/docker" - -ENTRYPOINT ["powershell.exe", "-ExecutionPolicy", "Bypass", "-Command", "npm", "--prefix", "$env:GIT_GODBOLT_REPOSITORY_PATH", "start", "--", "--language", "hlsl"] -]=] -[=[ - -]=] -nsc-ce-production-cache-webpack -nsc.ce.production.cache.webpack -${COMPOSE_NSC_PRODUCTION_CACHE_IMAGE} -OFF +message(STATUS "Printing NSC container logs") +execute_process(COMMAND "@DOCKER_EXE@" + logs "@ORPHAN@" + COMMAND_ERROR_IS_FATAL ANY ) -set(NBL_CE_URL http://localhost:6969) - -add_custom_target(create-production-compiler-explorer - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Removing any remaining pre-test orphan containers..." - COMMAND "${DOCKER_EXE}" rm -f production-ce-orphan-run-test || "${CMAKE_COMMAND}" -E true - COMMAND "${DOCKER_EXE}" rm -f production-ce-orphan-cache-webpack || "${CMAKE_COMMAND}" -E true - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Creating pre-test production image..." - COMMAND "${DOCKER_EXE}" run -dit -v "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}:${NBL_DOCKER_TMP_PRODUCTION_TARGET}" --name production-ce-orphan-run-test --entrypoint "cmd" "${COMPOSE_NSC_DEV_IMAGE}" - COMMAND "${DOCKER_EXE}" exec production-ce-orphan-run-test "${NBL_DOCKER_TMP_PRODUCTION_TARGET}\\${NBL_DOCKER_INSTALL_BAT_FILENAME}" - COMMAND "${DOCKER_EXE}" stop production-ce-orphan-run-test - COMMAND "${DOCKER_EXE}" commit -m "Copy NSC install redists" production-ce-orphan-run-test "${COMPOSE_NSC_ORPHAN_PRODUCTION_TEST_IMAGE}" - COMMAND "${DOCKER_EXE}" compose build - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Running pre-test production image, caching webpack & running final checks..." 
- COMMAND "${DOCKER_EXE}" run -dit -p 6969:10240 --name production-ce-orphan-cache-webpack "${COMPOSE_NSC_PRODUCTION_CACHE_IMAGE}" - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 10 --ticks 35 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compilers --disable-cookies --timeout 69 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_release_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" --disable-cookies --timeout 69 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_relwithdebinfo_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" --disable-cookies --timeout 69 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_debug_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" --disable-cookies --timeout 69 - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Passed all tests! Creating final production image..." - COMMAND "${DOCKER_EXE}" stop production-ce-orphan-cache-webpack - COMMAND "${DOCKER_EXE}" commit -m "Perform tests, cache webpack build" production-ce-orphan-cache-webpack "${COMPOSE_NSC_PRODUCTION_IMAGE}" - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "Created final `${COMPOSE_NSC_PRODUCTION_IMAGE}` production image!" - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "To run the production image, execute: 'docker run -p 80:10240 ${COMPOSE_NSC_PRODUCTION_IMAGE}'," - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "'docker run -p 80:10240 ${COMPOSE_NSC_PRODUCTION_IMAGE}'." - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "The production image can be pushed safely to the public registry." - WORKING_DIRECTORY "${OUTPUT_DIRECTORY}" +message(STATUS "OK! NSC container is healthy.") +message(STATUS "Type \"@NBL_CE_URL@\" in your browser to use NSC with Godbolt!") +]=] INSTRUCTIONS @ONLY) + +set(SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/run-compiler-explorer-$.cmake") +file(GENERATE OUTPUT ${SCRIPT_FILE} CONTENT "${INSTRUCTIONS}") + +add_custom_target(run-compiler-explorer ALL + COMMAND "${CMAKE_COMMAND}" -P ${SCRIPT_FILE} VERBATIM - USES_TERMINAL + COMMAND_EXPAND_LISTS ) add_dependencies(run-compiler-explorer nsc) set_target_properties(run-compiler-explorer PROPERTIES FOLDER "Godbolt") -set_target_properties(is-compiler-explorer-running PROPERTIES FOLDER "Godbolt") -set_target_properties(create-production-compiler-explorer PROPERTIES FOLDER "Godbolt") endif() \ No newline at end of file diff --git a/tools/nsc/docker/README.md b/tools/nsc/docker/README.md index 21f8f4e06d..d44eea9f81 100644 --- a/tools/nsc/docker/README.md +++ b/tools/nsc/docker/README.md @@ -1,16 +1,105 @@ -# NSC Docker Godbolt +# NSC & Godbolt integration -## Run NSC tool straight from build directory in compiler explorer docker container! +## Run Compiler Explorer with NSC tool in docker container! -Currently only Windows platform with target *x86_64* architecture is supported. Tested with Hyper-V isolation mode. +https://github.com/user-attachments/assets/8d409477-92e4-4238-b5e5-637cfbdf7263 -### Requirements +

+Image Status · Build Status · License: Apache 2.0 · Join our Discord

-- [***Docker Desktop***](https://www.docker.com/products/docker-desktop/)
+## Requirements
-### How To
+- Configured [***Docker***](https://docs.docker.com/desktop/setup/install/windows-install/) for Windows Containers
+- [Windows, Windows Server Core or Windows Server]() with a **minimum** x86_64 build of 10.0.20348 (2022 distributions)
-Switch docker to windows containers, configure CMake with `NBL_ENABLE_DOCKER_INTEGRATION` option (recommended Visual Studio generator) & build `run-compiler-explorer` target. After the build completes type `localhost` in your browser.
+> [!TIP]
+> Type `cmd /ver` to see your build version
+
+> [!WARNING]
+> You cannot run it on Windows Home Edition as it doesn't have the `Containers` feature; visit the Microsoft [docs]() for more details
+
+> [!CAUTION]
+> Hyper-V isolation is **NOT** supported; you must run the NSC Godbolt container with process isolation
+
+## How to run image
+
+> [!IMPORTANT]
+> If you are using Docker Desktop, first make sure you have switched to `Containers for Windows`, see the image below. If you are a CLI user with a headless client & daemon, use the appropriate Windows build context. ![Containers for Windows](https://user-images.githubusercontent.com/65064509/152947300-affca592-35a7-4e4c-a7fc-2055ce1ba528.png)
+
+> [!CAUTION]
+> The examples below use `docker compose` to run the image, but if you want to `docker run` it instead, make sure to mount the required system directories and expose the port, otherwise it will fail at runtime; see the [compose]() file for more details
+
+### from container registry
+
+execute
+
+```powershell
+curl -L https://raw.githubusercontent.com/Devsh-Graphics-Programming/Nabla/master/compose.yml | docker compose -f - up
+```
+
+or in a Nabla checkout
+
+```powershell
+docker compose up
+```
+
+and type `localhost` in your browser.
+
+### from Nabla pipeline workflow artifacts
+
+> [!NOTE]
+> We publish container images to the GitHub Container Registry that include **only the Release variant** of NSC executables built with **MSVC**.
+> However, our CI pipelines **build and test all configurations**. Compressed images for each configuration are uploaded as **workflow artifacts**.
+> Look for artifacts named:
+> `-msvc--nsc-godbolt-image`
+
+> [!NOTE]
+> To decompress the image artifact you need [zstd]()
+
+Download the workflow image artifact, unzip it and
+
+```powershell
+zstd -d < -msvc--nsc-godbolt-image.tar.zst | docker load
+```
+
+<details>
+<summary>Docker load example (click to expand)</summary>
+
+```
+C:\Users\anastaziuk\Desktop\DevshGraphicsProgramming\Nabla\tools\nsc\docker>zstd -d < run-windows-17.13.6-msvc-Debug-nsc-godbolt-image.tar.zst | docker load
+b2ebf78c3627: Loading layer [==================================================>]  3.149MB/3.149MB
+4c201e14cc01: Loading layer [==================================================>]  77.4MB/77.4MB
+68a216251b8f: Loading layer [==================================================>]  61.95kB/61.95kB
+7a4e13ca4c4e: Loading layer [==================================================>]  52.74kB/52.74kB
+634001f55b21: Loading layer [==================================================>]  52.74kB/52.74kB
+6a609178bb9a: Loading layer [==================================================>]  52.74kB/52.74kB
+3d7afb042308: Loading layer [==================================================>]  52.74kB/52.74kB
+ca034d7bc58a: Loading layer [==================================================>]  52.74kB/52.74kB
+55b4134a1ae9: Loading layer [==================================================>]  52.74kB/52.74kB
+0648adff3faa: Loading layer [==================================================>]  52.74kB/52.74kB
+Loaded image: ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-build-msvc-debug-17.13.6
+```
+
+</details>
+
+copy `compose.yml` in the Nabla root directory to e.g. `override-compose.yml`, replace its `image` field value with the loaded image name (e.g. `ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-build-msvc-debug-17.13.6` like in the example), then execute
+
+```
+docker compose -f override-compose.yml up
+```
+
+and type `localhost` in your browser.
+
+## How to build image
+
+Configure CMake with `NBL_ENABLE_DOCKER_INTEGRATION` and build the `run-compiler-explorer` target.
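+
+A minimal sketch of that flow from a Nabla checkout, assuming a Visual Studio generator and a `build` output directory (the generator, directory and configuration below are illustrative, adjust them to your setup):
+
+```powershell
+# configure with the Docker/NSC Godbolt integration enabled
+cmake -S . -B build -G "Visual Studio 17 2022" -A x64 -DNBL_ENABLE_DOCKER_INTEGRATION=ON
+
+# build the target that packages NSC and brings up the Compiler Explorer container
+cmake --build build --target run-compiler-explorer --config Release
+```
+
+Once the target finishes, type `localhost` in your browser as described above.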