diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml new file mode 100644 index 0000000000..3e8e0b4dd0 --- /dev/null +++ b/.github/workflows/build-nabla.yml @@ -0,0 +1,269 @@ +name: Build Nabla Workflow + +on: + push: + pull_request: + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: push-lock-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-windows: + runs-on: windows-2022 + + env: + image: ghcr.io/devsh-graphics-programming/docker-nanoserver-msvc-winsdk + entry: pwsh.exe + cmd: -NoLogo -NoProfile -ExecutionPolicy Bypass + mount: C:\mount\nabla + binary: C:\mount\nabla\build-ct + install: build-ct\install + + strategy: + fail-fast: false + matrix: + # vendor: [msvc, clangcl] + # TODO: Yas please fix ClangCL, we have a few new compile errors + # if we build MSVC then build "run-compiler-explorer" target, for ClangCL build just "nsc" + vendor: [msvc] + config: [Release, Debug, RelWithDebInfo] + tag: ['17.13.6'] + + steps: + - name: Environment Setup + run: | + Add-MpPreference -ExclusionPath "${{ github.workspace }}" + Add-MpPreference -ExclusionExtension "*.*" + Add-MpPreference -ExclusionProcess "docker.exe" + Add-MpPreference -ExclusionProcess "dockerd.exe" + Set-MpPreference -RemediationScheduleDay 8 + Set-MpPreference -DisableRealtimeMonitoring $true + Set-MpPreference -DisableRemovableDriveScanning $true + Set-MpPreference -DisableArchiveScanning $true + Set-MpPreference -DisableScanningMappedNetworkDrivesForFullScan $true + + if (-not (docker network ls --format '{{.Name}}' | Where-Object { $_ -eq 'docker_default' })) { + docker network create --driver nat docker_default + if ($LASTEXITCODE -ne 0) { exit 1 } + } + + - name: Set prefix + id: set-prefix + shell: pwsh + run: | + $prefix = "run-windows-${{ matrix.tag }}-${{ matrix.vendor }}-${{ matrix.config }}" + $owner = "${{ github.repository_owner }}" + $package = "nabla-shader-compiler-godbolt" + $tag = "build-${{ matrix.vendor }}-${{ matrix.config }}-${{ matrix.tag }}" + $nscTargetTaggedImage = "ghcr.io/${owner}/${package}:${tag}".ToLower() + $nscTargetTaggedImageLatest = "ghcr.io/${owner}/${package}:latest".ToLower() + + $shouldPushImage = ( + "${{ github.ref }}" -eq "refs/heads/master" -and + "${{ matrix.vendor }}" -eq "msvc" -and + "${{ matrix.config }}" -eq "Release" + ) + + Write-Host "::notice::Should push image? 
$shouldPushImage" + + "prefix=$prefix" >> $env:GITHUB_OUTPUT + "nscTargetTaggedImage=$nscTargetTaggedImage" >> $env:GITHUB_OUTPUT + "nscTargetTaggedImageLatest=$nscTargetTaggedImageLatest" >> $env:GITHUB_OUTPUT + "shouldPushImage=$shouldPushImage" >> $env:GITHUB_OUTPUT + + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: 'recursive' + + - name: Pull Image + run: | + docker pull "${{ env.image }}:${{ matrix.tag }}" + + - name: Run Container + run: | + $ctx = docker context show + $dockerHost = (docker context inspect $ctx | ConvertFrom-Json).Endpoints.docker.Host + $pipeName = [regex]::Match($dockerHost, '/pipe/(?.+)$').Groups['n'].Value + $pipeHost = "\\.\pipe\$pipeName" + + docker run ` + --entrypoint ${{ env.entry }} -di --isolation process ` + --env-file .\docker\ci-windows.env ` + --env-file .\docker\ninja.env ` + --env "NSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" ` + --name orphan --network docker_default ` + -v "${{ github.workspace }}:${{ env.mount }}" ` + -v "${pipeHost}:\\.\pipe\dockerd" -e "DOCKER_HOST=npipe:////./pipe/dockerd" ` + -w "${{ env.mount }}" ` + "${{ env.image }}:${{ matrix.tag }}" ` + ${{ env.cmd }} + + - name: Inspect Container + run: | + docker inspect orphan + + - name: Container – Unpack Packages + run: | + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} C:\unpack.ps1 + + - name: Container – Configure Project with CMake + run: | + mkdir profiling + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} -Command cmake ` + --preset ci-configure-dynamic-${{ matrix.vendor }} ` + --profiling-output=profiling/cmake-profiling.json ` + --profiling-format=google-trace + + - name: Container – Build NSC + run: | + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} -Command cmake --build ` + --preset ci-build-dynamic-${{ matrix.vendor }} ` + -t run-compiler-explorer --config ${{ matrix.config }} + + - name: Container – Install NSC + run: | + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} -Command cmake --install ` + ${{ env.binary }} --config ${{ matrix.config }} ` + --component Runtimes --prefix ${{ env.install }} + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} -Command cmake --install ` + ${{ env.binary }} --config ${{ matrix.config }} ` + --component Executables --prefix ${{ env.install }} + + - name: Container – Save NSC Image + run: | + docker exec orphan ` + ${{ env.entry }} ${{ env.cmd }} -Command docker ` + save ${{ steps.set-prefix.outputs.nscTargetTaggedImage }} | zstd -T0 -3 -f -o ${{ steps.set-prefix.outputs.prefix }}-nsc-godbolt-image.tar.zst + + - name: Package left workflow artifacts + run: | + tar -cvf "${{ steps.set-prefix.outputs.prefix }}-profiling.tar" profiling + tar -cvf "${{ steps.set-prefix.outputs.prefix }}-install.tar" ${{ env.install }} + + - name: Upload NSC Godbolt Image artifact + uses: actions/upload-artifact@v4 + with: + name: ${{ steps.set-prefix.outputs.prefix }}-nsc-godbolt-image + path: ${{ steps.set-prefix.outputs.prefix }}-nsc-godbolt-image.tar.zst + compression-level: 0 + + - name: Upload profiling artifacts + uses: actions/upload-artifact@v4 + with: + name: ${{ steps.set-prefix.outputs.prefix }}-profiling + path: ${{ steps.set-prefix.outputs.prefix }}-profiling.tar + + - name: Upload install artifacts + uses: actions/upload-artifact@v4 + with: + name: ${{ steps.set-prefix.outputs.prefix }}-install + path: ${{ steps.set-prefix.outputs.prefix }}-install.tar + + - name: Login to GHCR + if: steps.set-prefix.outputs.shouldPushImage == 'True' + run: echo "${{ 
secrets.CR_PAT }}" | docker login ghcr.io -u $env:GITHUB_ACTOR --password-stdin + + - name: Tag Latest image + if: steps.set-prefix.outputs.shouldPushImage == 'True' + run: | + docker tag ${{ steps.set-prefix.outputs.nscTargetTaggedImage }} ${{ steps.set-prefix.outputs.nscTargetTaggedImageLatest }} + + - name: Push images to GHCR + if: steps.set-prefix.outputs.shouldPushImage == 'True' + run: | + docker push ${{ steps.set-prefix.outputs.nscTargetTaggedImageLatest }} + + update-badges: + name: Update Build & Image Badges + if: ${{ always() && github.ref == 'refs/heads/master' }} + needs: build-windows + runs-on: windows-2022 + permissions: + contents: write + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Create Build Badge + run: | + $jobStatus = "${{ needs.build-windows.result }}" + $buildMsg = if ($jobStatus -eq "success") { "passing" } else { "failing" } + $buildColor = if ($jobStatus -eq "success") { "brightgreen" } else { "red" } + + $buildBadge = @{ + schemaVersion = 1 + label = "build" + message = $buildMsg + color = $buildColor + } | ConvertTo-Json -Depth 2 + + $buildPath = ".badge-public/nabla" + New-Item -ItemType Directory -Path $buildPath -Force | Out-Null + $buildBadge | Set-Content -Path "$buildPath/build.json" -Encoding utf8 + + - name: Create Image Size Badge + run: | + $owner = "${{ github.repository_owner }}" + $package = "nabla-shader-compiler-godbolt" + $image = "ghcr.io/${owner}/${package}:latest".ToLower() + $manifest = docker manifest inspect $image | ConvertFrom-Json + + if ($manifest.manifests) { + $totalSize = ($manifest.manifests | Measure-Object -Property size -Sum).Sum + } elseif ($manifest.layers) { + $totalSize = ($manifest.layers | Measure-Object -Property size -Sum).Sum + } else { + Write-Error "No valid size information found in manifest." 
+ exit 1 + } + + $sizeMB = [Math]::Round($totalSize / 1MB, 2) + $size = "$sizeMB MB" + + $imageBadge = @{ + schemaVersion = 1 + label = $image + message = $size + color = "blue" + } | ConvertTo-Json -Depth 2 + + $imagePath = ".badge-public/packages/nabla-shader-compiler-nsc" + New-Item -ItemType Directory -Path $imagePath -Force | Out-Null + $imageBadge | Set-Content -Path "$imagePath/image-badge.json" -Encoding utf8 + + - name: Deploy Badges + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_branch: badges + publish_dir: .badge-public + keep_files: true + commit_message: "[CI] badges update" + + deploy-production: + name: Deploy to production host + if: ${{ always() && github.ref == 'refs/heads/master' }} + needs: build-windows + runs-on: ubuntu-latest + + steps: + - name: Pull latest images, re-run containers + uses: appleboy/ssh-action@v1 + with: + host: ${{ secrets.CE_HOST }} + username: ${{ secrets.CE_USER }} + key: ${{ secrets.CE_KEY }} + script: | + powershell -NoLogo -NoProfile -ExecutionPolicy Bypass -NoExit -File C:\Scripts\startup-docker.ps1 diff --git a/.github/workflows/run-nsc.yml b/.github/workflows/run-nsc.yml new file mode 100644 index 0000000000..d5f9f74c2b --- /dev/null +++ b/.github/workflows/run-nsc.yml @@ -0,0 +1,264 @@ +name: Run NSC Godbolt Container + +on: + workflow_dispatch: + inputs: + run_id: + description: "The id of the workflow run where the desired download artifact was uploaded from" + required: true + build_config: + description: "Build configuration (Release / RelWithDebInfo / Debug)" + required: true + default: "Release" + type: choice + options: + - Release + - RelWithDebInfo + - Debug + tunnelDurationHours: + description: "Hours amount the restricted tunnel should stay up" + required: true + default: "1" + type: choice + options: + - "1" + - "2" + - "3" + - "4" + - "5" + withDiscordMSG: + description: "Send Discord message after tunnel is up" + required: true + default: true + type: boolean + +jobs: + run-container: + runs-on: windows-2022 + env: + DISCORD_WEBHOOK: ${{ secrets.DC_ACTIONS_WEBHOOK }} + + steps: + - name: Environment Setup + run: | + Add-MpPreference -ExclusionPath "${{ github.workspace }}" + Add-MpPreference -ExclusionExtension "*.*" + Add-MpPreference -ExclusionProcess "docker.exe" + Add-MpPreference -ExclusionProcess "dockerd.exe" + Set-MpPreference -RemediationScheduleDay 8 + Set-MpPreference -DisableRealtimeMonitoring $true + Set-MpPreference -DisableRemovableDriveScanning $true + Set-MpPreference -DisableArchiveScanning $true + Set-MpPreference -DisableScanningMappedNetworkDrivesForFullScan $true + + if (-not (docker network ls --format '{{.Name}}' | Where-Object { $_ -eq 'docker_default' })) { + docker network create --driver nat docker_default + if ($LASTEXITCODE -ne 0) { exit 1 } + } + + $sendDiscord = "${{ inputs.withDiscordMSG }}" -eq "true" + Write-Host "::notice::Should send discord message? 
$sendDiscord" + + - name: Download Restricted Reverse Proxy binaries, setup NGINX config + run: | + Invoke-WebRequest -Uri https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -OutFile cloudflared.exe + Invoke-WebRequest -Uri "https://nginx.org/download/nginx-1.24.0.zip" -OutFile nginx.zip + Expand-Archive nginx.zip -DestinationPath nginx + + Remove-Item -Recurse -Force "nginx/nginx-1.24.0/conf" + New-Item -ItemType Directory -Path "nginx/nginx-1.24.0/conf" -Force | Out-Null + + '${{ secrets.NSC_BASIC_AUTH_HTPASSWD }}' | Out-File nginx/nginx-1.24.0/conf/.htpasswd -Encoding ascii + $htpasswdPath = (Resolve-Path "nginx/nginx-1.24.0/conf/.htpasswd").Path -replace '\\', '/' + + @" + events {} + + http { + server { + listen 10241; + + location / { + auth_basic "Restricted Compiler Explorer access for Development & NSC Artifact Tests, downloaded from Nabla actions pipeline"; + auth_basic_user_file "$htpasswdPath"; + + proxy_pass http://127.0.0.1:10240; + proxy_set_header Host `$host; + proxy_set_header X-Real-IP `$remote_addr; + } + } + } + "@ | Out-File nginx/nginx-1.24.0/conf/nginx.conf -Encoding ascii + + Write-Host "::group::Generated nginx.conf" + Get-Content nginx/nginx-1.24.0/conf/nginx.conf + Write-Host "::endgroup::" + + & "nginx/nginx-1.24.0/nginx.exe" -t -p "nginx/nginx-1.24.0" -c "conf/nginx.conf" + + - name: Download NSC Godbolt artifact + uses: actions/download-artifact@v4 + with: + run-id: ${{ inputs.run_id }} + pattern: run-windows-*-msvc-${{ inputs.build_config }}-nsc-godbolt-image + path: artifact + merge-multiple: true + github-token: ${{ secrets.READ_PAT }} + repository: Devsh-Graphics-Programming/Nabla + + - name: Decompress .tar.zst + run: | + Get-ChildItem artifact -Filter *.tar.zst | ForEach-Object { + $output = $_.FullName -replace '\.zst$', '' + zstd -d "$($_.FullName)" -o "$output" + } + + - name: Load Docker image + run: | + $image = Get-ChildItem artifact -Filter *.tar | Select-Object -First 1 + docker load -i $image.FullName + + - name: Generate and run Docker Compose with matched image + run: | + $imageName = docker image ls --format "{{.Repository}}:{{.Tag}}" | + Where-Object { $_ -like "ghcr.io/devsh-graphics-programming/nabla-shader-compiler-godbolt:build-*" } | + Select-Object -First 1 + + if (-not $imageName) { + Write-Error "Could not find image with tag matching ghcr.io/devsh-graphics-programming/nabla-shader-compiler-godbolt:build-*" + exit 1 + } + + Write-Host "Found image: $imageName" + + @" + services: + nsc: + container_name: nsc-godbolt + image: $imageName + isolation: process + ports: + - "10240:10240" + volumes: + - type: bind + source: C:\Windows\Globalization\ICU + target: C:\Windows\Globalization\ICU + read_only: true + - type: bind + source: C:\Windows\System32 + target: C:\mount\Windows\System32 + read_only: true + networks: + - docker_default + + networks: + docker_default: + external: true + "@ | Set-Content compose.generated.yml + + docker compose -f compose.generated.yml up -d + + - name: Wait for NSC container response on port + run: | + $maxRetries = 24 + $retryDelay = 5 + $success = $false + + for ($i = 0; $i -lt $maxRetries; $i++) { + try { + $response = Invoke-WebRequest -Uri "http://localhost:10240" -UseBasicParsing -TimeoutSec 5 + if ($response.StatusCode -eq 200) { + Write-Host "NSC container is up listening on port 10240 and responding." + $success = $true + break + } else { + Write-Host "Received HTTP $($response.StatusCode), retrying..." 
+ } + } catch { + Write-Host "NSC container is not responding on port 10240, retrying..." + } + Start-Sleep -Seconds $retryDelay + } + + if (-not $success) { + Write-Error "No response from NSC container on port 10240, timeout." + exit 1 + } + + - name: Print NSC container logs + run: | + docker logs nsc-godbolt + + - name: Start Restricted Tunnel + env: + DISCORD_ENABLED: ${{ inputs.withDiscordMSG }} + TUNNEL_DURATION_HOURS: ${{ inputs.tunnelDurationHours }} + run: | + Start-Process -NoNewWindow -FilePath .\nginx\nginx-1.24.0\nginx.exe -ArgumentList '-p', (Join-Path $PWD 'nginx/nginx-1.24.0'), '-c', 'conf/nginx.conf' + Start-Process -NoNewWindow -FilePath .\cloudflared.exe -ArgumentList "tunnel", "--url", "http://localhost:10241", "--logfile", "cf.log" + netstat -an | findstr 10241 + + $tries = 60 + $url = $null + + while ($tries -gt 0) { + if (Test-Path cf.log) { + $log = Get-Content cf.log + foreach ($line in $log) { + if ($line -match 'https:\/\/[a-zA-Z0-9\-]+\.trycloudflare\.com') { + $url = $Matches[0] + Write-Host "::notice title=Tunnel URL::$url" + break + } + } + if ($url) { break } + } + Start-Sleep -Seconds 1 + $tries -= 1 + } + + if (-not $url) { + Write-Error "Could not get tunnel URL from cloudflared log" + exit 1 + } + + $webhookUrl = "$env:DISCORD_WEBHOOK" + $thisWorkflowRunID = "${{ github.run_id }}" + $artifactWorkflowRunID = "${{ inputs.run_id }}" + $thisWorkflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/$thisWorkflowRunID" + $artifactWorkflowRunURL = "https://github.com/${{ github.repository }}/actions/runs/$artifactWorkflowRunID" + $actor = "$env:GITHUB_ACTOR" + $sendDiscord = "$env:DISCORD_ENABLED" -eq "true" + $hours = [int]$env:TUNNEL_DURATION_HOURS + $duration = $hours * 3600 + + Write-Host "Blocking job for $hours hours" + + $description = @" + - tunnel opened for $hours hours, click [here](<$url>) to connect + - requires authentication + - workflow [logs #$thisWorkflowRunID](<$thisWorkflowRunURL>) + - image downloaded from [run #$artifactWorkflowRunID](<$artifactWorkflowRunURL>) + - dispatched by $actor + "@ + + $payload = @{ + embeds = @( + @{ + title = "Running NSC Godbolt Container" + description = $description + color = 15844367 + footer = @{ text = "sent from GitHub Actions runner" } + timestamp = (Get-Date).ToString("o") + } + ) + } | ConvertTo-Json -Depth 10 + + if ($sendDiscord) { + Write-Host "Sending Discord webhook..." 
+ Invoke-RestMethod -Uri $webhookUrl -Method Post -ContentType 'application/json' -Body $payload + } else { + Write-Host "Discord webhook disabled" + } + + Start-Sleep -Seconds $duration diff --git a/.gitmodules b/.gitmodules index 8edc1cead9..00482441de 100644 --- a/.gitmodules +++ b/.gitmodules @@ -27,9 +27,6 @@ path = 3rdparty/libexpat url = git@github.com:Devsh-Graphics-Programming/libexpat.git branch = master -[submodule "3rdparty/glm"] - path = 3rdparty/glm - url = git@github.com:AnastaZIuk/glm.git [submodule "3rdparty/freetype2"] path = 3rdparty/freetype2 url = git@github.com:Devsh-Graphics-Programming/freetype.git @@ -54,6 +51,7 @@ [submodule "3rdparty/glTFSampleModels"] path = 3rdparty/glTFSampleModels url = git@github.com:Devsh-Graphics-Programming/glTF-Sample-Models.git + update = none [submodule "3rdparty/nbl_spirv_cross"] path = 3rdparty/nbl_spirv_cross url = git@github.com:devshgraphicsprogramming/SPIRV-Cross.git @@ -89,7 +87,7 @@ url = git@github.com:Devsh-Graphics-Programming/Nabla-Continous-Integration-Python-Framework.git [submodule "3rdparty/boost/superproject"] path = 3rdparty/boost/superproject - url = git@github.com:boostorg/boost.git + url = ../boost.git [submodule "3rdparty/argparse"] path = 3rdparty/argparse url = git@github.com:p-ranav/argparse.git @@ -117,3 +115,9 @@ [submodule "docker/compiler-explorer"] path = docker/compiler-explorer url = git@github.com:Devsh-Graphics-Programming/Compiler-Explorer-Docker.git +[submodule "3rdparty/glm"] + path = 3rdparty/glm + url = git@github.com:Devsh-Graphics-Programming/glm.git +[submodule "docker/msvc-winsdk"] + path = docker/msvc-winsdk + url = ../docker-nanoserver-msvc-winsdk diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 56752880ae..5bd2d6859f 100755 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -4,6 +4,9 @@ include(../cmake/common.cmake) +project(Nabla-3rdparty LANGUAGES CXX C) +enable_language(C CXX ASM ASM_NASM) + option(NBL_FORCE_RELEASE_3RDPARTY "Force map 3rdaprty's configuration regardless Nabla configuration to Release" OFF) option(NBL_FORCE_RELWITHDEBINFO_3RDPARTY "Force map 3rdaprty's configuration regardless Nabla configuration to RelWithDebInfo" OFF) @@ -231,7 +234,7 @@ if(_NBL_COMPILE_WITH_OPEN_EXR_) endif() -#gli +# gli option(_NBL_COMPILE_WITH_GLI_ "Build with GLI library" ON) if(_NBL_COMPILE_WITH_GLI_) set(_OLD_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) @@ -240,12 +243,23 @@ if(_NBL_COMPILE_WITH_GLI_) set(BUILD_SHARED_LIBS OFF) set(BUILD_STATIC_LIBS OFF) set(BUILD_TESTING OFF) + set(GLI_GLM_LOCATION "${CMAKE_CURRENT_SOURCE_DIR}/glm") add_subdirectory(gli gli EXCLUDE_FROM_ALL) set(BUILD_SHARED_LIBS ${_OLD_BUILD_SHARED_LIBS}) set(BUILD_STATIC_LIBS ${_OLD_BUILD_STATIC_LIBS}) set(BUILD_TESTING ${_OLD_BUILD_TESTING}) endif() +set(ENABLE_STATIC_LIB ON) +set(ENABLE_SHARED_LIB OFF) +set(ENABLE_EXAMPLES OFF) +set(ENABLE_DOCS OFF) +set(ENABLE_APP OFF) +set(ENABLE_LIB_ONLY ON) +set(ENABLE_TESTS OFF) +set(ENABLE_SUMMARY OFF) +add_subdirectory(bzip2 bzip2 EXCLUDE_FROM_ALL) + add_library(lzma OBJECT lzma/C/Alloc.c lzma/C/LzFind.c @@ -262,17 +276,6 @@ add_library(lz4 OBJECT lz4/lib/xxhash.c ) - -add_library(bzip2 OBJECT - bzip2/blocksort.c - bzip2/bzlib.c - bzip2/compress.c - bzip2/crctable.c - bzip2/decompress.c - bzip2/huffman.c - bzip2/randtable.c -) - add_library(spirv_cross OBJECT nbl_spirv_cross/spirv_cfg.cpp nbl_spirv_cross/spirv_cross.cpp @@ -419,12 +422,6 @@ add_library(aesGladman OBJECT add_subdirectory(argparse argparse EXCLUDE_FROM_ALL) 
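
A minimal sketch of how the consolidated MSVC runtime-library selection used further down in this file (and mirrored in 3rdparty/dxc/CMakeLists.txt) reads in full, assuming the new NBL_COMPILER_DYNAMIC_RUNTIME option is what gates the DLL suffix:

# sketch, not the verbatim hunk: one generator expression replaces the old
# NBL_DYNAMIC_MSVC_RUNTIME if()/else() branches; ${trgt} is the loop variable
# iterating NBL_3RDPARTY_TARGETS in the hunk below
set_property(TARGET ${trgt} PROPERTY MSVC_RUNTIME_LIBRARY
  "MultiThreaded$<$<CONFIG:Debug>:Debug>$<$<BOOL:${NBL_COMPILER_DYNAMIC_RUNTIME}>:DLL>")
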
-option(GLM_TEST_ENABLE_SIMD_SSE4_2 "Enable SSE 4.2 optimizations" ON) -option(GLM_TEST_ENABLE "Build unit tests" OFF) -#add_subdirectory(glm EXCLUDE_FROM_ALL) -set(BUILD_SHARED_LIBS ${_OLD_BUILD_SHARED_LIBS}) -set(BUILD_STATIC_LIBS ${_OLD_BUILD_STATIC_LIBS}) - if (NBL_BUILD_MITSUBA_LOADER) option(BUILD_tools "EXPAT: build the xmlwf tool for expat library" OFF) option(BUILD_examples "EXPAT: build the examples for expat library" OFF) @@ -465,7 +462,7 @@ set(NBL_3RDPARTY_TARGETS shaderc_util shaderc jpeg-static - bzip2 + bz2_static simdjson nlohmann_json glslang @@ -496,11 +493,7 @@ if (NBL_BUILD_IMGUI) endif() foreach(trgt IN LISTS NBL_3RDPARTY_TARGETS) - if(NBL_DYNAMIC_MSVC_RUNTIME) - set_property(TARGET ${trgt} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") - else() - set_property(TARGET ${trgt} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") - endif() + set_property(TARGET ${trgt} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>$<$:DLL>") if(MSVC AND NBL_SANITIZE_ADDRESS) set_property(TARGET ${trgt} PROPERTY COMPILE_OPTIONS /fsanitize=address) diff --git a/3rdparty/boost/CMakeLists.txt b/3rdparty/boost/CMakeLists.txt index f3460fe8d6..3c95234b8e 100644 --- a/3rdparty/boost/CMakeLists.txt +++ b/3rdparty/boost/CMakeLists.txt @@ -1,3 +1,81 @@ +get_filename_component(NBL_BOOST_WAVE_DEP_FILE "${CMAKE_CURRENT_SOURCE_DIR}/dep/wave.cmake" ABSOLUTE) + +# Boost uses it's own tool for generating dependency list for targets, therefore we +# can make sure manually added dependency subdirectories for a library are valid +# https://www.boost.org/doc/libs/1_83_0/tools/boostdep/doc/html/index.html#boostdep.introduction.building_boostdep + +if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs + if(NOT WIN32) + message(FATAL_ERROR "NBL_BOOST_GENERATE_DEP_LIST only for Windows host!") + endif() + + macro(NBL_BOOST_EXECUTE) + execute_process(COMMAND ${ARGV} WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/superproject") + endmacro() + + NBL_BOOST_EXECUTE(git config --file .gitmodules --get-regexp path OUTPUT_VARIABLE NBL_OUTPUT_VARIABLE) + string(REGEX REPLACE "\n" ";" NBL_SUBMODULE_CONFIG_LIST "${NBL_OUTPUT_VARIABLE}") + + foreach(NBL_SUBMODULE_NAME ${NBL_SUBMODULE_CONFIG_LIST}) + string(REGEX MATCH "submodule\\.(.*)\\.path" NBL_SUBMODULE_NAME "${NBL_SUBMODULE_NAME}") + list(APPEND BOOST_SUBMODULES "${CMAKE_MATCH_1}") + endforeach() + + # sync & force update of all boost modules first for the tool purpose (sry guys who use the tool, you need to clone all, I want to keep it simple) + NBL_BOOST_EXECUTE(git submodule sync) + list(APPEND BOOST_FORCE_ALL_CONFIG -c url.https://github.com/.insteadOf=git@github.com:) + foreach(SUBMODULE ${BOOST_SUBMODULES}) + list(APPEND BOOST_FORCE_ALL_CONFIG -c submodule.${SUBMODULE}.update=checkout) + endforeach() + + NBL_BOOST_EXECUTE(git ${BOOST_FORCE_ALL_CONFIG} submodule update --init --recursive -f) + + # build boost dep executable + set(NBL_BOOSTDEP_EXE "boostdep.exe") + set(NBL_BOOSTDEP_EXE_FILEPATH "${CMAKE_CURRENT_BINARY_DIR}/superproject/tools/boostdep/bin/${NBL_BOOSTDEP_EXE}") + if(NOT EXISTS "${NBL_BOOSTDEP_EXE_FILEPATH}") + NBL_BOOST_EXECUTE(cmd /C bootstrap.bat) + NBL_BOOST_EXECUTE(cmd /C b2.exe tools/boostdep/build) + NBL_BOOST_EXECUTE("${CMAKE_COMMAND}" -E copy "./dist/bin/${NBL_BOOSTDEP_EXE}" "${NBL_BOOSTDEP_EXE_FILEPATH}") + NBL_BOOST_EXECUTE(git clean -fdx) + NBL_BOOST_EXECUTE(git reset --hard) + endif() + + # get wave dependency info + NBL_BOOST_EXECUTE("${NBL_BOOSTDEP_EXE_FILEPATH}" --boost-root 
"${CMAKE_CURRENT_SOURCE_DIR}/superproject" --brief wave + OUTPUT_VARIABLE NBL_OUTPUT_VAR + ) + + file(WRITE "${NBL_BOOST_WAVE_DEP_FILE}" "${NBL_OUTPUT_VAR}") + + file(STRINGS "${NBL_BOOST_WAVE_DEP_FILE}" NBL_BOOST_LIBS) + set(NBL_BOOST_LIBS ${NBL_BOOST_LIBS}) + list(POP_FRONT NBL_BOOST_LIBS) + list(FILTER NBL_BOOST_LIBS EXCLUDE REGEX "#") + list(FILTER NBL_BOOST_LIBS EXCLUDE REGEX "(unknown)") + string(REPLACE "~" "/" NBL_BOOST_LIBS "${NBL_BOOST_LIBS}") + + # update boost .gitmodules configuration, discard all but modules reported by wave + # NOTE: you commit this file to version control AND boost's .gitmodules *if got changed*, + # use when updating boost to more recent version + file(WRITE "${NBL_BOOST_WAVE_DEP_FILE}" "set(NBL_BOOST_LIBS ${NBL_BOOST_LIBS})") + + message(STATUS "Updating boost .gitmodules") + foreach(SUBMODULE ${BOOST_SUBMODULES}) + # 1) fallback, ignore all + NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.${SUBMODULE}.update none) + endforeach() + + foreach(NAME ${NBL_BOOST_LIBS}) + string(REPLACE "/" "_" SUBMODULE "${NAME}") + message(STATUS "WAVE BOOST DEP SUBMODULE = ${SUBMODULE}") + # 2) pick only submodules reported by wave + NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.${SUBMODULE}.update checkout) + endforeach() + # 3) and the top module itself + NBL_BOOST_EXECUTE(git config --file .gitmodules submodule.wave.update checkout) +endif() + set(BOOST_PREPROCESSOR_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/superproject/libs/preprocessor/include" CACHE PATH "" FORCE) get_filename_component(_BOOST_PREPROCESSOR_BR_BUNDLE_SEARCH_DIRECTORY_ "${BOOST_PREPROCESSOR_INCLUDE}" ABSOLUTE) @@ -17,8 +95,6 @@ if(NBL_EMBED_BUILTIN_RESOURCES) ADD_CUSTOM_BUILTIN_RESOURCES(boostBuiltinResourceData BOOST_RESOURCES_TO_EMBED "${_BOOST_PREPROCESSOR_BR_BUNDLE_SEARCH_DIRECTORY_}" "boost" "boost::builtin" "${_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_HEADER_}" "${_BOOST_PREPROCESSOR_BR_OUTPUT_DIRECTORY_HEADER_}" "STATIC" "INTERNAL") endif() -get_filename_component(NBL_BOOST_WAVE_DEP_FILE "${CMAKE_CURRENT_SOURCE_DIR}/dep/wave.cmake" ABSOLUTE) - if(NOT EXISTS "${NBL_BOOST_WAVE_DEP_FILE}") message(FATAL_ERROR "Internal error, generate NBL_BOOST_WAVE_DEP_FILE by enabling NBL_BOOST_GENERATE_DEP_LIST!") endif() @@ -41,47 +117,4 @@ endforeach() set(NBL_BOOST_TARGETS ${NBL_BOOST_TARGETS} -PARENT_SCOPE) - -# Boost uses it's own tool for generating dependency list for targets, therefore we -# can make sure manually added dependnecy subdirectories for a library are valid -# https://www.boost.org/doc/libs/1_83_0/tools/boostdep/doc/html/index.html#boostdep.introduction.building_boostdep - -if(NBL_BOOST_GENERATE_DEP_LIST) # internal, for Nabla devs - if(WIN32) - set(NBL_BOOSTDEP_EXE "boostdep.exe") - else() - set(NBL_BOOSTDEP_EXE "boostdep") - endif() - - set(NBL_BOOSTDEP_EXE_FILEPATH "${CMAKE_CURRENT_BINARY_DIR}/superproject/tools/boostdep/bin/${NBL_BOOSTDEP_EXE}") - - if(NOT EXISTS "${NBL_BOOSTDEP_EXE_FILEPATH}") - macro(NBL_BOOST_EXECUTE) - execute_process(COMMAND ${ARGV} - WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/superproject" - ) - endmacro() - - NBL_BOOST_EXECUTE(cmd /C bootstrap.bat) - NBL_BOOST_EXECUTE(cmd /C b2.exe tools/boostdep/build) - NBL_BOOST_EXECUTE("${CMAKE_COMMAND}" -E copy "./dist/bin/${NBL_BOOSTDEP_EXE}" "${NBL_BOOSTDEP_EXE_FILEPATH}") - NBL_BOOST_EXECUTE(git clean -fdx) - NBL_BOOST_EXECUTE(git reset --hard) - endif() - - execute_process(COMMAND "${NBL_BOOSTDEP_EXE_FILEPATH}" --boost-root "${CMAKE_CURRENT_SOURCE_DIR}/superproject" --brief wave - OUTPUT_VARIABLE 
NBL_OUTPUT_VAR - ) - - file(WRITE "${NBL_BOOST_WAVE_DEP_FILE}" "${NBL_OUTPUT_VAR}") - - file(STRINGS "${NBL_BOOST_WAVE_DEP_FILE}" NBL_BOOST_LIBS) - set(NBL_BOOST_LIBS ${NBL_BOOST_LIBS}) - list(POP_FRONT NBL_BOOST_LIBS) - list(FILTER NBL_BOOST_LIBS EXCLUDE REGEX "#") - list(FILTER NBL_BOOST_LIBS EXCLUDE REGEX "(unknown)") - string(REPLACE "~" "/" NBL_BOOST_LIBS "${NBL_BOOST_LIBS}") - - file(WRITE "${NBL_BOOST_WAVE_DEP_FILE}" "set(NBL_BOOST_LIBS ${NBL_BOOST_LIBS})") -endif() +PARENT_SCOPE) \ No newline at end of file diff --git a/3rdparty/boost/superproject b/3rdparty/boost/superproject index 1c4d3531e4..3b9e116eee 160000 --- a/3rdparty/boost/superproject +++ b/3rdparty/boost/superproject @@ -1 +1 @@ -Subproject commit 1c4d3531e416a1f72b0e6a5e0f7173f93cf97e92 +Subproject commit 3b9e116eeee85ab8fd0d8e5a97364fff5f02eb86 diff --git a/3rdparty/bzip2 b/3rdparty/bzip2 index c4a14bb87e..f4301b0eac 160000 --- a/3rdparty/bzip2 +++ b/3rdparty/bzip2 @@ -1 +1 @@ -Subproject commit c4a14bb87ee395fb2c69ef5dbb50762fe862517e +Subproject commit f4301b0eac69eb109c5419813102be6f82d2b73a diff --git a/3rdparty/dxc/CMakeLists.txt b/3rdparty/dxc/CMakeLists.txt index 8b48c0e5a6..9432b4df07 100644 --- a/3rdparty/dxc/CMakeLists.txt +++ b/3rdparty/dxc/CMakeLists.txt @@ -41,6 +41,7 @@ list(APPEND NBL_DXC_CMAKE_OPTIONS "-DSPIRV_SKIP_EXECUTABLES:BOOL=ON") list(APPEND NBL_DXC_CMAKE_OPTIONS "-DHLSL_ENABLE_DEBUG_ITERATORS:BOOL=ON") list(APPEND NBL_DXC_CMAKE_OPTIONS "-DDXC_SPIRV_TOOLS_DIR=${DXC_SPIRV_TOOLS_DIR}") list(APPEND NBL_DXC_CMAKE_OPTIONS "-DDXC_SPIRV_HEADERS_DIR=${DXC_SPIRV_HEADERS_DIR}") +list(APPEND NBL_DXC_CMAKE_OPTIONS "-DDXC_ENABLE_ETW=OFF") if(NOT NBL_IS_MULTI_CONFIG) list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}") @@ -63,11 +64,7 @@ if(WIN32) endif() endif() -if(NBL_DYNAMIC_MSVC_RUNTIME) - list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_MSVC_RUNTIME_LIBRARY:STATIC=MultiThreaded$<$:Debug>DLL") -else() - list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_MSVC_RUNTIME_LIBRARY:STATIC=MultiThreaded$<$:Debug>") -endif() +list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_MSVC_RUNTIME_LIBRARY:STATIC=MultiThreaded$<$:Debug>$<$:DLL>") # perform DXC compile standard requirement test set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -89,18 +86,23 @@ endif() set(DXC_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/build" CACHE INTERNAL "") -if(MSVC AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja Multi-Config" AND NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja")) - execute_process(COMMAND "${CMAKE_COMMAND}" -C "${CMAKE_CURRENT_SOURCE_DIR}/dxc/cmake/caches/PredefinedParams.cmake" -S "${CMAKE_CURRENT_SOURCE_DIR}/dxc" -B "${DXC_BUILD_DIR}" -G "${CMAKE_GENERATOR}" "-Ax64" -T "${CMAKE_GENERATOR_TOOLSET}" ${NBL_DXC_CMAKE_OPTIONS} - RESULT_VARIABLE DXC_CMAKE_RESULT - OUTPUT_VARIABLE DXC_CMAKE_STREAM_PIPE - ) -else() - execute_process(COMMAND "${CMAKE_COMMAND}" -C "${CMAKE_CURRENT_SOURCE_DIR}/dxc/cmake/caches/PredefinedParams.cmake" -S "${CMAKE_CURRENT_SOURCE_DIR}/dxc" -B "${DXC_BUILD_DIR}" -G "${CMAKE_GENERATOR}" -T "${CMAKE_GENERATOR_TOOLSET}" ${NBL_DXC_CMAKE_OPTIONS} - RESULT_VARIABLE DXC_CMAKE_RESULT - OUTPUT_VARIABLE DXC_CMAKE_STREAM_PIPE - ) +if(NOT CMAKE_GENERATOR MATCHES "Ninja*") + list(APPEND NBL_DXC_CMAKE_OPTIONS -Ax64) +endif() + +if(CMAKE_GENERATOR_TOOLSET) + list(APPEND NBL_DXC_CMAKE_OPTIONS -T "${CMAKE_GENERATOR_TOOLSET}") endif() +if(CMAKE_TOOLCHAIN_FILE) + list(APPEND NBL_DXC_CMAKE_OPTIONS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") +endif() + +execute_process(COMMAND "${CMAKE_COMMAND}" -C 
"${CMAKE_CURRENT_SOURCE_DIR}/dxc/cmake/caches/PredefinedParams.cmake" -S "${CMAKE_CURRENT_SOURCE_DIR}/dxc" -B "${DXC_BUILD_DIR}" -G "${CMAKE_GENERATOR}" ${NBL_DXC_CMAKE_OPTIONS} + RESULT_VARIABLE DXC_CMAKE_RESULT + OUTPUT_VARIABLE DXC_CMAKE_STREAM_PIPE +) + if(NOT "${DXC_CMAKE_RESULT}" STREQUAL "0") message(FATAL_ERROR "${DXC_CMAKE_STREAM_PIPE}") endif() diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 5ab4d368b6..71f2766da9 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 5ab4d368b666d365217c751f5610b496b828ff96 +Subproject commit 71f2766da918d33d34fefac270fdee983a06dd20 diff --git a/3rdparty/gli b/3rdparty/gli index 559cbe1ec3..c4e6446d3b 160000 --- a/3rdparty/gli +++ b/3rdparty/gli @@ -1 +1 @@ -Subproject commit 559cbe1ec38878e182507d331e0780fbae5baf15 +Subproject commit c4e6446d3b646538026fd5a95533daed952878d4 diff --git a/3rdparty/glm b/3rdparty/glm index d162eee1e6..2d4c4b4dd3 160000 --- a/3rdparty/glm +++ b/3rdparty/glm @@ -1 +1 @@ -Subproject commit d162eee1e6f7c317a09229fe6ceab8ec6ab9a4b4 +Subproject commit 2d4c4b4dd31fde06cfffad7915c2b3006402322f diff --git a/CMakeLists.txt b/CMakeLists.txt index c819c644eb..c6664f8085 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,11 +1,24 @@ -# Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +# Copyright (C) 2018-2025 - DevSH Graphics Programming Sp. z O.O. # This file is part of the "Nabla Engine". # For conditions of distribution and use, see copyright notice in nabla.h.in or nabla.h +cmake_minimum_required(VERSION 3.31) -cmake_minimum_required(VERSION 3.29) -cmake_policy(SET CMP0112 NEW) -cmake_policy(SET CMP0141 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0141.html#policy:CMP0141 -cmake_policy(SET CMP0118 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0118.html#policy:CMP0118 +# TODO: Yas - once we deploy 4.x we will fire `cmake_policy(VERSION [...])` instead of manually picking policies +# https://cmake.org/cmake/help/latest/command/cmake_minimum_required.html#policy-version +# also we should update deps which throw warnings about < 3.10 compatibility + +macro(NBL_POLICY P S) +if(POLICY ${P}) + cmake_policy(SET ${P} ${S}) + set(CMAKE_POLICY_DEFAULT_${P} ${S}) +endif() +endmacro() + +NBL_POLICY(CMP0003 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0003.html#cmp0003 +NBL_POLICY(CMP0077 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0077.html#cmp0077 +NBL_POLICY(CMP0112 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0112.html#cmp0112 +NBL_POLICY(CMP0141 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0141.html#policy:CMP0141 +NBL_POLICY(CMP0118 NEW) # https://cmake.org/cmake/help/latest/policy/CMP0118.html#policy:CMP0118 set(NBL_BUILD_ANDROID OFF) @@ -20,29 +33,19 @@ if(MSVC) link_libraries(delayimp) endif() +# TODO: TO BE KILLED, keep both in one tree option(NBL_STATIC_BUILD "" OFF) # ON for static builds, OFF for shared -option(NBL_DYNAMIC_MSVC_RUNTIME "" ON) + +option(NBL_COMPILER_DYNAMIC_RUNTIME "" ON) option(NBL_SANITIZE_ADDRESS OFF) -if(MSVC) - if(NBL_SANITIZE_ADDRESS) - set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:ProgramDatabase>") - else() - set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<$:EditAndContinue>$<$:ProgramDatabase>") - endif() -endif() +set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT $<$:ProgramDatabase>) # ignored on non xMSVC-ABI targets if(NBL_STATIC_BUILD) message(STATUS "Static Nabla build enabled!") else() - if(MSVC) - if(NBL_DYNAMIC_MSVC_RUNTIME) - message(STATUS "Shared Nabla build enabled!") - else() - message(FATAL_ERROR 
"Turn NBL_DYNAMIC_MSVC_RUNTIME on! For dynamic Nabla builds dynamic MSVC runtime is mandatory!") - endif() - else() - message(FATAL_ERROR "Nabla can't be built with shared libraries! Please make sure you are targetting Windows OS and MSVC compiler!") + if(NOT NBL_COMPILER_DYNAMIC_RUNTIME) + message(FATAL_ERROR "Turn NBL_COMPILER_DYNAMIC_RUNTIME on! For dynamic Nabla builds dynamic runtime is mandatory!") endif() endif() diff --git a/CMakePresets.json b/CMakePresets.json index 8d0b62367a..ae56cf1739 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -2,15 +2,14 @@ "version": 6, "cmakeMinimumRequired": { "major": 3, - "minor": 29, - "patch": 2 + "minor": 31, + "patch": 0 }, "configurePresets": [ { "name": "ci-configure-base", "hidden": true, "cacheVariables": { - "NBL_CI_MODE": "ON", "NBL_UPDATE_GIT_SUBMODULE": "OFF", "NBL_COMPILE_WITH_CUDA": "OFF", "NBL_BUILD_OPTIX": "OFF", @@ -19,8 +18,10 @@ "_NBL_COMPILE_WITH_OPEN_EXR_": "ON", "NBL_EXPLICIT_MODULE_LOAD_LOG": "ON", "NBL_CPACK_NO_BUILD_DIRECTORY_MODULES": "ON", - "NBL_RUN_TESTS": "ON", - "NBL_CPACK_CI": "ON" + "NBL_CPACK_CI": "ON", + "GIT_FAIL_IF_NONZERO_EXIT": "OFF", + "NBL_DOCKER_DIND_BUILD": "ON", + "NBL_CE_PUBLISH_PORT": "10240" } }, { @@ -46,7 +47,7 @@ "hidden": true, "inherits": "ci-configure-static-base", "cacheVariables": { - "NBL_DYNAMIC_MSVC_RUNTIME": "OFF" + "NBL_COMPILER_DYNAMIC_RUNTIME": "OFF" }, "condition": { "type": "allOf", @@ -69,7 +70,7 @@ "hidden": true, "inherits": "ci-configure-dynamic-base", "cacheVariables": { - "NBL_DYNAMIC_MSVC_RUNTIME": "ON" + "NBL_COMPILER_DYNAMIC_RUNTIME": "ON" }, "condition": { "type": "allOf", @@ -90,37 +91,35 @@ { "name": "ci-configure-static-msvc", "inherits": "ci-configure-static-windows-base", - "displayName": "[CI]: Static library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Configure as static library with Visual Studio 17 2022 generator and MSVC v143 toolset", - "generator": "Visual Studio 17 2022", - "toolset": "v143" + "generator": "Ninja Multi-Config", + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake", + "NBL_ENABLE_DOCKER_INTEGRATION": "ON" + } }, { "name": "ci-configure-dynamic-msvc", "inherits": "ci-configure-dynamic-windows-base", - "displayName": "[CI]: Dynamic library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Configure as dynamic library with Visual Studio 17 2022 generator and MSVC v143 toolset", - "generator": "Visual Studio 17 2022", - "toolset": "v143" + "generator": "Ninja Multi-Config", + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-msvc-toolchain.cmake", + "NBL_ENABLE_DOCKER_INTEGRATION": "ON" + } }, { - "name": "ci-configure-static-ninja-multi", + "name": "ci-configure-static-clangcl", "inherits": "ci-configure-static-windows-base", - "displayName": "[CI]: Static library target, Ninja multi-config generator", - "description": "Configure as static library with Ninja multi-config generator", "generator": "Ninja Multi-Config", "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-clangcl-toolchain.cmake" } }, { - "name": "ci-configure-dynamic-ninja-multi", + "name": "ci-configure-dynamic-clangcl", "inherits": "ci-configure-dynamic-windows-base", - "displayName": "[CI]: Dynamic library target, Ninja multi-config generator", - "description": "Configure as dynamic library with Ninja multi-config generator", 
"generator": "Ninja Multi-Config", "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/docker/msvc-winsdk/cmake/winsdk-clangcl-toolchain.cmake" } }, { @@ -156,7 +155,7 @@ "hidden": true, "inherits": "user-configure-static-base", "cacheVariables": { - "NBL_DYNAMIC_MSVC_RUNTIME": "OFF" + "NBL_COMPILER_DYNAMIC_RUNTIME": "OFF" }, "condition": { "type": "equals", @@ -169,7 +168,7 @@ "hidden": true, "inherits": "user-configure-dynamic-base", "cacheVariables": { - "NBL_DYNAMIC_MSVC_RUNTIME": "ON" + "NBL_COMPILER_DYNAMIC_RUNTIME": "ON" }, "condition": { "type": "equals", @@ -193,6 +192,22 @@ "generator": "Visual Studio 17 2022", "toolset": "v143" }, + { + "name": "user-configure-static-clangcl", + "inherits": "user-configure-static-windows-base", + "displayName": "[USER]: Static library target, Visual Studio 17 2022 generator, ClangCL toolset", + "description": "Configure as static library with Visual Studio 17 2022 generator and ClangCL toolset", + "generator": "Visual Studio 17 2022", + "toolset": "ClangCL" + }, + { + "name": "user-configure-dynamic-clangcl", + "inherits": "user-configure-dynamic-windows-base", + "displayName": "[USER]: Dynamic library target, Visual Studio 17 2022 generator, ClangCL toolset", + "description": "Configure as dynamic library with Visual Studio 17 2022 generator and ClangCL toolset", + "generator": "Visual Studio 17 2022", + "toolset": "ClangCL" + }, { "name": "user-configure-static-ninja-multi", "inherits": "user-configure-static-windows-base", @@ -303,8 +318,6 @@ "configurePreset": "ci-configure-static-msvc", "inheritConfigureEnvironment": true, "inherits": "build-windows-base", - "displayName": "[CI]: Static library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Build Nabla as static library with Visual Studio 17 2022 generator and MSVC v143 toolset", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", @@ -316,21 +329,17 @@ "configurePreset": "ci-configure-dynamic-msvc", "inheritConfigureEnvironment": true, "inherits": "build-windows-base", - "displayName": "[CI]: Dynamic library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Build Nabla as dynamic library with Visual Studio 17 2022 generator and MSVC v143 toolset", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", "rhs": "ON" } }, - { - "name": "ci-build-static-ninja-multi", - "configurePreset": "ci-configure-static-ninja-multi", + { + "name": "ci-build-static-clangcl", + "configurePreset": "ci-configure-static-clangcl", "inheritConfigureEnvironment": true, "inherits": "build-windows-base", - "displayName": "[CI]: Static library target, Ninja multi-config generator", - "description": "Build Nabla as static library with Ninja multi-config generator", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", @@ -338,12 +347,10 @@ } }, { - "name": "ci-build-dynamic-ninja-multi", - "configurePreset": "ci-configure-dynamic-ninja-multi", + "name": "ci-build-dynamic-clangcl", + "configurePreset": "ci-configure-dynamic-clangcl", "inheritConfigureEnvironment": true, "inherits": "build-windows-base", - "displayName": "[CI]: Dynamic library target, Ninja multi-config generator", - "description": "Build Nabla as dynamic library with Ninja multi-config generator", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", @@ -450,8 +457,6 @@ "name": "ci-package-static-msvc", "inherits": "ci-package-windows-base", "configurePreset": "ci-configure-static-msvc", - "displayName": "[CI]: Static 
library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Package Nabla as static library compiled with Visual Studio 17 2022 generator and MSVC v143 toolset", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", @@ -462,32 +467,6 @@ "name": "ci-package-dynamic-msvc", "inherits": "ci-package-windows-base", "configurePreset": "ci-configure-dynamic-msvc", - "displayName": "[CI]: Dynamic library target, Visual Studio 17 2022 generator, MSVC v143 toolset", - "description": "Package Nabla as dynamic library compiled with Visual Studio 17 2022 generator and MSVC v143 toolset", - "condition": { - "type": "equals", - "lhs": "$env{NBL_CI_MODE}", - "rhs": "ON" - } - }, - { - "name": "ci-package-static-ninja-multi", - "inherits": "ci-package-windows-base", - "configurePreset": "ci-configure-static-ninja-multi", - "displayName": "[CI]: Static library target, Ninja multi-config generator", - "description": "Package Nabla as static library compiled with Ninja multi-config generator", - "condition": { - "type": "equals", - "lhs": "$env{NBL_CI_MODE}", - "rhs": "ON" - } - }, - { - "name": "ci-package-dynamic-ninja-multi", - "inherits": "ci-package-windows-base", - "configurePreset": "ci-configure-dynamic-ninja-multi", - "displayName": "[CI]: Dynamic library target, Ninja multi-config generator", - "description": "Package Nabla as dynamic library compiled with Ninja multi-config generator", "condition": { "type": "equals", "lhs": "$env{NBL_CI_MODE}", diff --git a/README.md b/README.md index 2b85c9c460..a696846b30 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,15 @@
 Click to see the source
- Click to see the source
+ Build Status
+ License: Apache 2.0
+ Join our Discord
(badge/anchor HTML markup in this README hunk is not recoverable from this view)
# Table of Contents diff --git a/cmake/adjust/flags.cmake b/cmake/adjust/flags.cmake index 8bf2a77893..1e67914ae0 100644 --- a/cmake/adjust/flags.cmake +++ b/cmake/adjust/flags.cmake @@ -12,45 +12,173 @@ define_property(TARGET PROPERTY NBL_CONFIGURATION_MAP BRIEF_DOCS "Stores configuration map for a target, it will evaluate to the configuration it's mapped to" ) -function(NBL_REQUEST_COMPILE_OPTION_SUPPORT _NBL_COMPILE_OPTION_) - set(NBL_COMPILE_OPTION "${_NBL_COMPILE_OPTION_}") +# https://github.com/Kitware/CMake/blob/05e77b8a27033e6fd086456bd6cef28338ff1474/Modules/Internal/CheckCompilerFlag.cmake#L26C7-L26C42 +# must be cached because parse utility clears locals in the CheckCompilerFlag module +set(CHECK_COMPILER_FLAG_OUTPUT_VARIABLE NBL_COMPILER_FLAG_OUTPUT CACHE INTERNAL "") - foreach(COMPILER IN ITEMS c cxx) +# Usage: NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG CONFIG COMPILE_OPTIONS LINK_OPTIONS ) +function(NBL_REQUEST_COMPILE_OPTION_SUPPORT) + cmake_parse_arguments(IMPL "REQUIRED" "REQUEST_VAR" "LANG;CONFIG;COMPILE_OPTIONS;LINK_OPTIONS" ${ARGN}) + + set(DEFAULT_COMPILERS c cxx) + set(REQUEST_ALL_OPTIONS_PRESENT True) + + if(NOT IMPL_LANG) + list(APPEND IMPL_LANG ${DEFAULT_COMPILERS}) + endif() + + foreach(COMPILER IN ITEMS ${IMPL_LANG}) string(TOUPPER "${COMPILER}" COMPILER_UPPER) - string(REGEX REPLACE "[-=:;/.]" "_" flag_signature "${NBL_COMPILE_OPTION}") - set(flag_var "__${COMPILER_UPPER}_Flag_${flag_signature}") - - if(COMPILER STREQUAL "c") - check_c_compiler_flag("${NBL_COMPILE_OPTION}" ${flag_var}) - elseif(COMPILER STREQUAL "cxx") - check_cxx_compiler_flag("${NBL_COMPILE_OPTION}" ${flag_var}) - endif() - - if(${flag_var}) - message(STATUS "Enabled \"${NBL_COMPILE_OPTION}\" ${COMPILER_UPPER} compile option for Nabla projects!") - set(NBL_${COMPILER_UPPER}_COMPILE_OPTIONS "${NBL_${COMPILER_UPPER}_COMPILE_OPTIONS};${NBL_COMPILE_OPTION}" PARENT_SCOPE) - else() - message(STATUS "Disabled \"${NBL_COMPILE_OPTION}\" ${COMPILER_UPPER} compile option for Nabla projects! 
(no support)") - endif() + foreach(WHAT_OPTIONS IN ITEMS IMPL_COMPILE_OPTIONS IMPL_LINK_OPTIONS) + if(NOT ${WHAT_OPTIONS}) + continue() + endif() + + set(IMPL_OPTIONS ${${WHAT_OPTIONS}}) + string(REPLACE IMPL_ "" WHAT_OPTIONS "${WHAT_OPTIONS}") + + foreach(COMPILE_OPTION ${IMPL_OPTIONS}) + if(IMPL_CONFIG) + foreach(CONFIG ${IMPL_CONFIG}) + # TODO: validate (${CONFIG} \in ${CMAKE_CONFIGURATION_TYPES}) + string(TOUPPER "${CONFIG}" CONFIG_UPPER) + set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_${WHAT_OPTIONS} "${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_${WHAT_OPTIONS}};${COMPILE_OPTION}") + endforeach() + else() + set(NBL_${COMPILER_UPPER}_${WHAT_OPTIONS} "${NBL_${COMPILER_UPPER}_${WHAT_OPTIONS}};${COMPILE_OPTION}") + endif() + endforeach() + + if(IMPL_CONFIG) + foreach(CONFIG ${IMPL_CONFIG}) + string(TOUPPER "${CONFIG}" CONFIG_UPPER) + set(NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_${WHAT_OPTIONS} ${NBL_${COMPILER_UPPER}_${CONFIG_UPPER}_${WHAT_OPTIONS}} PARENT_SCOPE) + endforeach() + else() + set(NBL_${COMPILER_UPPER}_${WHAT_OPTIONS} ${NBL_${COMPILER_UPPER}_${WHAT_OPTIONS}} PARENT_SCOPE) + endif() + endforeach() endforeach() endfunction() option(NBL_REQUEST_SSE_4_2 "Request compilation with SSE 4.2 instruction set enabled for Nabla projects" ON) -option(NBL_REQUEST_SSE_AXV2 "Request compilation with SSE Intel Advanced Vector Extensions 2 for Nabla projects" ON) +option(NBL_REQUEST_SSE_AVX2 "Request compilation with SSE Intel Advanced Vector Extensions 2 for Nabla projects" ON) # profiles -if(MSVC) - include("${CMAKE_CURRENT_LIST_DIR}/template/windows/msvc.cmake") -elseif(ANDROID) - include("${CMAKE_CURRENT_LIST_DIR}/template/unix/android.cmake") -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - include("${CMAKE_CURRENT_LIST_DIR}/template/unix/gnu.cmake") -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - include("${CMAKE_CURRENT_LIST_DIR}/template/unix/clang.cmake") -else() - message(WARNING "UNTESTED COMPILER DETECTED, EXPECT WRONG OPTIMIZATION FLAGS! SUBMIT ISSUE ON GITHUB https://github.com/Devsh-Graphics-Programming/Nabla/issues") -endif() +foreach(NBL_COMPILER_LANGUAGE IN ITEMS C CXX) + # all list of all known by CMake vendors: + # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html + set(NBL_COMPILER_VENDOR "${CMAKE_${NBL_COMPILER_LANGUAGE}_COMPILER_ID}") + set(NBL_PROFILE_NAME "${NBL_COMPILER_LANGUAGE}_${NBL_COMPILER_VENDOR}") # eg. "cxx_MSVC.cmake" + set(NBL_PROFILE_PATH "${CMAKE_CURRENT_LIST_DIR}/template/vendor/${NBL_PROFILE_NAME}.cmake") + + include("${NBL_PROFILE_PATH}" RESULT_VARIABLE _NBL_FOUND_) + + if(NOT _NBL_FOUND_) + message(WARNING "UNSUPPORTED \"${NBL_COMPILER_LANGUAGE}\" COMPILER LANGUAGE FOR \"${NBL_COMPILER_VENDOR}\" DETECTED, CMAKE CONFIGURATION OR BUILD MAY FAIL AND COMPILE OPTIONS FLAGS WILL NOT BE SET! 
SUBMIT ISSUE ON GITHUB https://github.com/Devsh-Graphics-Programming/Nabla/issues") + continue() + endif() + + # a profile MUST define + + # - "NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_${WHAT}_OPTIONS" (configuration dependent) + # - "NBL_${NBL_COMPILER_LANGUAGE}_${WHAT}_OPTIONS" (global) + + # a profile MUST NOT define + # - NBL_${WHAT}_OPTIONS + + # note: + # - use NBL_REQUEST_COMPILE_OPTION_SUPPORT in profile to creates those vars + # - include reset utility in profiles to init vars with empty lists + + # TODO: DEFINITIONS for WHAT to unify the API + + foreach(WHAT COMPILE LINK) + set(NBL_OPTIONS_VAR_NAME NBL_${NBL_COMPILER_LANGUAGE}_${WHAT}_OPTIONS) + set(NBL_OPTIONS_VAR_VALUE ${${NBL_OPTIONS_VAR_NAME}}) + + if(NOT DEFINED ${NBL_OPTIONS_VAR_NAME}) + message(FATAL_ERROR "\"${NBL_PROFILE_PATH}\" did not define \"${NBL_OPTIONS_VAR_NAME}\"!") + endif() + + # update map with configuration dependent compile options + foreach(CONFIGURATION IN ITEMS RELEASE RELWITHDEBINFO DEBUG) + set(NBL_CONFIGURATION_OPTIONS_VAR_NAME NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_${WHAT}_OPTIONS) + set(NBL_CONFIGURATION_OPTIONS_VAR_VALUE ${${NBL_CONFIGURATION_OPTIONS_VAR_NAME}}) + + if(NOT DEFINED ${NBL_CONFIGURATION_OPTIONS_VAR_NAME}) + message(FATAL_ERROR "\"${NBL_PROFILE_PATH}\" did not define \"${NBL_CONFIGURATION_OPTIONS_VAR_NAME}\"!") + endif() + + set(NBL_${CONFIGURATION}_${WHAT}_OPTIONS ${NBL_${CONFIGURATION}_${WHAT}_OPTIONS} + # note that "${NBL_CONFIGURATION_OPTIONS_VAR_VALUE}" MUST NOT contain ANY + # $<$> generator expression in order to support our configuration mapping features + $<$<${WHAT}_LANGUAGE:${NBL_COMPILER_LANGUAGE}>:${NBL_CONFIGURATION_OPTIONS_VAR_VALUE}> + ) + endforeach() + + # update map with global compile options + set(NBL_${WHAT}_OPTIONS ${NBL_${WHAT}_OPTIONS} + $<$<${WHAT}_LANGUAGE:${NBL_COMPILER_LANGUAGE}>:${NBL_${NBL_COMPILER_LANGUAGE}_${WHAT}_OPTIONS}> + ) + endforeach() + + block() + # validate build with a vendor profile, any warning diagnostic = error + # if you hit error it means the profile generates diagnostics due to: + # - an option (compile or link) which doesn't exist (typo? check vendor docs) + # - a set of options which invalidates an option (eg. 
MSVC's /INCREMENTAL with /LTCG:incremental is invalid, however linker will emit a warning by default + do a fall-back) + # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_FLAGS.html#variable:CMAKE_%3CLANG%3E_FLAGS + # https://cmake.org/cmake/help/latest/module/CheckCompilerFlag.html#command:check_compiler_flag + + set(CMAKE_${NBL_COMPILER_LANGUAGE}_FLAGS) + + foreach(CONFIGURATION IN ITEMS Release RelWithDebInfo Debug) + set(CMAKE_TRY_COMPILE_CONFIGURATION ${CONFIGURATION}) + string(TOUPPER "${CONFIGURATION}" CONFIGURATION) + + set(TEST_NAME "NBL_${NBL_COMPILER_LANGUAGE}_LANG_${CONFIGURATION}_BUILD_OPTIONS_SUPPORT") + set(CMAKE_${NBL_COMPILER_LANGUAGE}_FLAGS_${CONFIGURATION}) + + set(COMPILE_OPTIONS ${NBL_${NBL_COMPILER_LANGUAGE}_COMPILE_OPTIONS} ${NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_COMPILE_OPTIONS}) + set(LINK_OPTIONS ${NBL_${NBL_COMPILER_LANGUAGE}_${CONFIGURATION}_LINK_OPTIONS}) + set(COMBINED ${COMPILE_OPTIONS} ${LINK_OPTIONS}) + + set(NBL_OUTPUT_FILE "${CMAKE_BINARY_DIR}/.nbl/try-compile/${TEST_NAME}.output") # no hash in output diagnostic file, desired + + string(SHA1 OPTIONS_HASH "${COMBINED}") + string(APPEND TEST_NAME "_HASH_${OPTIONS_HASH}") + + set(FLAG_VAR ${TEST_NAME}) + set(CMAKE_REQUIRED_LINK_OPTIONS ${LINK_OPTIONS}) + string(REPLACE ";" " " CLI_COMPILE_OPTIONS "${COMPILE_OPTIONS}") + + if(NBL_COMPILER_LANGUAGE STREQUAL C) + check_c_compiler_flag("${CLI_COMPILE_OPTIONS}" "${FLAG_VAR}") + elseif(NBL_COMPILER_LANGUAGE STREQUAL CXX) + check_cxx_compiler_flag("${CLI_COMPILE_OPTIONS}" "${FLAG_VAR}") + endif() + + if(NOT ${FLAG_VAR}) + if(NOT "${NBL_COMPILER_FLAG_OUTPUT}" STREQUAL "") + file(WRITE "${NBL_OUTPUT_FILE}" "${NBL_COMPILER_FLAG_OUTPUT}") # lock into file, do not cache, must read from the file because of NBL_COMPILER_FLAG_OUTPUT availability (CMake module writes an output only once before a signature flag status is created) + endif() + + if(EXISTS "${NBL_OUTPUT_FILE}") + file(READ "${NBL_OUTPUT_FILE}" NBL_DIAGNOSTICS) + set(NBL_DIAGNOSTICS "Diagnostics:\n${NBL_DIAGNOSTICS}") + else() + set(NBL_DIAGNOSTICS) + endif() + + if(NOT DEFINED NBL_SKIP_BUILD_OPTIONS_VALIDATION) + message(FATAL_ERROR "${TEST_NAME} failed! To skip the validation define \"NBL_SKIP_BUILD_OPTIONS_VALIDATION\". 
${NBL_DIAGNOSTICS}") + endif() + endif() + endforeach() + endblock() +endforeach() function(NBL_EXT_P_APPEND_COMPILE_OPTIONS NBL_LIST_NAME MAP_RELEASE MAP_RELWITHDEBINFO MAP_DEBUG) macro(NBL_MAP_CONFIGURATION NBL_CONFIG_FROM NBL_CONFIG_TO) @@ -153,37 +281,34 @@ function(nbl_adjust_flags) # global compile options list(APPEND _D_NBL_COMPILE_OPTIONS_ ${NBL_COMPILE_OPTIONS}) - - # per configuration compile options with mapping - list(APPEND _D_NBL_COMPILE_OPTIONS_ $<$:${NBL_${NBL_MAP_DEBUG_ITEM_U}_COMPILE_OPTIONS}>) - list(APPEND _D_NBL_COMPILE_OPTIONS_ $<$:${NBL_${NBL_MAP_RELEASE_ITEM_U}_COMPILE_OPTIONS}>) - list(APPEND _D_NBL_COMPILE_OPTIONS_ $<$:${NBL_${NBL_MAP_RELWITHDEBINFO_ITEM_U}_COMPILE_OPTIONS}>) - - # configuration mapping properties - string(APPEND _D_NBL_CONFIGURATION_MAP_ $<$:${NBL_MAP_DEBUG_ITEM_U}>) - string(APPEND _D_NBL_CONFIGURATION_MAP_ $<$:${NBL_MAP_RELEASE_ITEM_U}>) - string(APPEND _D_NBL_CONFIGURATION_MAP_ $<$:${NBL_MAP_RELWITHDEBINFO_ITEM_U}>) + + foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES}) + string(TOUPPER "${CONFIG}" CONFIG_U) + + # per configuration options with mapping + foreach(WHAT COMPILE LINK) + list(APPEND _D_NBL_${WHAT}_OPTIONS_ $<$:${NBL_${NBL_MAP_${CONFIG_U}_ITEM_U}_${WHAT}_OPTIONS}>) + endforeach() + + # configuration mapping properties + string(APPEND _D_NBL_CONFIGURATION_MAP_ $<$:${NBL_MAP_${CONFIG_U}_ITEM_U}>) + endforeach() set_target_properties(${NBL_TARGET_ITEM} PROPERTIES NBL_CONFIGURATION_MAP "${_D_NBL_CONFIGURATION_MAP_}" COMPILE_OPTIONS "${_D_NBL_COMPILE_OPTIONS_}" + LINK_OPTIONS "${_D_NBL_LINK_OPTIONS_}" ) unset(_D_NBL_CONFIGURATION_MAP_) unset(_D_NBL_COMPILE_OPTIONS_) + unset(_D_NBL_LINK_OPTIONS_) set(MAPPED_CONFIG $>) - if(MSVC) - if(NBL_SANITIZE_ADDRESS) - set(NBL_TARGET_MSVC_DEBUG_INFORMATION_FORMAT "$<$,$>:ProgramDatabase>") - else() - set(NBL_TARGET_MSVC_DEBUG_INFORMATION_FORMAT "$<$:EditAndContinue>$<$:ProgramDatabase>") - endif() - endif() - set_target_properties(${NBL_TARGET_ITEM} PROPERTIES - MSVC_DEBUG_INFORMATION_FORMAT "${NBL_TARGET_MSVC_DEBUG_INFORMATION_FORMAT}" - ) + MSVC_DEBUG_INFORMATION_FORMAT $<$,$>:ProgramDatabase> # ignored on non xMSVC-ABI targets + ) + math(EXPR _NBL_ARG_I_ "${_NBL_ARG_I_} + 1") endwhile() else() # DIRECTORY mode diff --git a/cmake/adjust/template/vendor/CXX_Clang.cmake b/cmake/adjust/template/vendor/CXX_Clang.cmake new file mode 100644 index 0000000000..2cc877c028 --- /dev/null +++ b/cmake/adjust/template/vendor/CXX_Clang.cmake @@ -0,0 +1,5 @@ +include_guard(GLOBAL) + +set(LANG CXX) +include("${CMAKE_CURRENT_LIST_DIR}/impl/Clang.cmake") +# append unique CXX options here \ No newline at end of file diff --git a/cmake/adjust/template/vendor/CXX_MSVC.cmake b/cmake/adjust/template/vendor/CXX_MSVC.cmake new file mode 100644 index 0000000000..59f4e59cdd --- /dev/null +++ b/cmake/adjust/template/vendor/CXX_MSVC.cmake @@ -0,0 +1,5 @@ +include_guard(GLOBAL) + +set(LANG CXX) +include("${CMAKE_CURRENT_LIST_DIR}/impl/MSVC.cmake") +# append unique CXX options here \ No newline at end of file diff --git a/cmake/adjust/template/vendor/C_Clang.cmake b/cmake/adjust/template/vendor/C_Clang.cmake new file mode 100644 index 0000000000..046ccaa902 --- /dev/null +++ b/cmake/adjust/template/vendor/C_Clang.cmake @@ -0,0 +1,5 @@ +include_guard(GLOBAL) + +set(LANG C) +include("${CMAKE_CURRENT_LIST_DIR}/impl/Clang.cmake") +# append unique C options here \ No newline at end of file diff --git a/cmake/adjust/template/vendor/C_MSVC.cmake b/cmake/adjust/template/vendor/C_MSVC.cmake new file mode 100644 index 0000000000..f9aca4a5b7 --- 
/dev/null +++ b/cmake/adjust/template/vendor/C_MSVC.cmake @@ -0,0 +1,5 @@ +include_guard(GLOBAL) + +set(LANG C) +include("${CMAKE_CURRENT_LIST_DIR}/impl/MSVC.cmake") +# append unique C options here \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/Clang.cmake b/cmake/adjust/template/vendor/impl/Clang.cmake new file mode 100644 index 0000000000..0b00294411 --- /dev/null +++ b/cmake/adjust/template/vendor/impl/Clang.cmake @@ -0,0 +1,109 @@ +include("${CMAKE_CURRENT_LIST_DIR}/reset.cmake") + +# vendor template with options fitting for both C and CXX LANGs + +if(NOT DEFINED LANG) + message(FATAL_ERROR "LANG must be defined!") +endif() + +if(NBL_WITH_COMPILER_CRASH_DIAGNOSTICS) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + # use it to make a repro and attach to an issue if you Clang crashes + # - it outputs preprocessed cpp files with sh script for compilation + -fcrash-diagnostics=compiler + -fcrash-diagnostics-dir=${NBL_ROOT_PATH_BINARY}/.crash-report + ) +endif() + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + -Xclang=-fconstexpr-backtrace-limit=696969 + -Xclang=-fconstexpr-depth=696969 + -Xclang=-fconstexpr-steps=696969 + -Xclang=-ftemplate-backtrace-limit=0 # no limit + -Xclang=-ftemplate-depth=696969 + -Xclang=-fmacro-backtrace-limit=0 # no limit + -Xclang=-fspell-checking-limit=0 # no limit + -Xclang=-fcaret-diagnostics-max-lines=0 # no limit + + # latest Clang(CL) 19.1.1 shipped with VS seems to require explicitly features to be listed (simdjson) + # TODO: Yas, we should first do independent check if host has the flags, if the request fail then + # do not promote simdjson to build with HASWELL implementation because those flags + avx2 compose + # subset it wants in this case + + ################ + # TODO: (****) -> + -mbmi # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mbmi + -mlzcnt # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mlzcnt + -mpclmul # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mpclmul + ################ <- + + -Wextra # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning + -maes # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-maes + -mfpmath=sse # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mfpmath + + # TODO: Yas, eliminate all below + -fno-strict-aliasing + -Wno-sequence-point + -Wno-c++98-compat + -Wno-c++98-compat-pedantic + -Wno-padded + -Wno-unsafe-buffer-usage + -Wno-switch-enum + -Wno-error=ignored-attributes + -Wno-unused-parameter + -Wno-unused-but-set-parameter + -Wno-error=unused-function + -Wno-error=unused-variable + -Wno-error=unused-parameter + -Wno-error=ignored-attributes + -Wno-error=non-pod-varargs +) + +if(NBL_REQUEST_SSE_4_2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + -msse4.2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang1-msse4.2 +) # TODO: (****) optional but then adjust 3rdparty options on fail +endif() + +if(CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES MSVC) + # ClangCL with MSVC frontend (most of the options are compatible but eg /arch:SSE4.2 seems to be not) + include("${CMAKE_CURRENT_LIST_DIR}/frontend/MSVC.cmake") + return() +else() + if(NBL_REQUEST_SSE_AVX2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + -mavx2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mavx2 + ) # TODO: (****) + 
endif() + + if(NBL_SANITIZE_ADDRESS) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS -fsanitize=address) + endif() + + if(NBL_SANITIZE_THREAD) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS -fsanitize=thread) + endif() + + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS + -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g + -mincremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + -Wall # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-W-warning + -gline-tables-only # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-gline-tables-only + -Xclang=-fno-inline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + ) + + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE COMPILE_OPTIONS + -O2 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg + -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + -DNDEBUG + ) + + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OPTIONS + -g # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-g + -O1 # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-O-arg + -Xclang=-finline-functions # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-finline-functions + -mno-incremental-linker-compatible # https://clang.llvm.org/docs/ClangCommandLineReference.html#cmdoption-clang-mincremental-linker-compatible + -DNDEBUG + ) +endif() \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/MSVC.cmake b/cmake/adjust/template/vendor/impl/MSVC.cmake new file mode 100644 index 0000000000..803adb1754 --- /dev/null +++ b/cmake/adjust/template/vendor/impl/MSVC.cmake @@ -0,0 +1,10 @@ +include("${CMAKE_CURRENT_LIST_DIR}/reset.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/frontend/MSVC.cmake") + +# vendor template with options fitting for both C and CXX LANGs + +if(NBL_REQUEST_SSE_4_2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + /arch:SSE4.2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 +) # TODO: (****) should be (?) optional but then adjust 3rdparty options on fail +endif() \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/frontend/MSVC.cmake b/cmake/adjust/template/vendor/impl/frontend/MSVC.cmake new file mode 100644 index 0000000000..06ab606104 --- /dev/null +++ b/cmake/adjust/template/vendor/impl/frontend/MSVC.cmake @@ -0,0 +1,68 @@ +# https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_FRONTEND_VARIANT.html#variable:CMAKE_%3CLANG%3E_COMPILER_FRONTEND_VARIANT +# vendor frontend template with options fitting for both C and CXX LANGs + +if(NOT DEFINED LANG) + message(FATAL_ERROR "LANG must be defined!") +endif() + +if(NBL_REQUEST_SSE_AVX2) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + /arch:AVX2 # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 +) # TODO: (****) should be (?) 
optional but then adjust 3rdparty options on fail +endif() + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + /Zc:preprocessor # https://learn.microsoft.com/en-us/cpp/build/reference/zc-preprocessor?view=msvc-170 + /Zc:__cplusplus # https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus?view=msvc-170 + /Zc:wchar_t # https://learn.microsoft.com/en-us/cpp/build/reference/zc-wchar-t-wchar-t-is-native-type?view=msvc-170 + /fp:fast # https://learn.microsoft.com/en-us/cpp/build/reference/fp-specify-floating-point-behavior?view=msvc-170 + /MP${_NBL_JOBS_AMOUNT_} # https://learn.microsoft.com/en-us/cpp/build/reference/mp-build-with-multiple-processes?view=msvc-170 +) + +if(NBL_SANITIZE_ADDRESS) + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} COMPILE_OPTIONS + /fsanitize=address # https://learn.microsoft.com/en-us/cpp/build/reference/fsanitize?view=msvc-170 + ) + + NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS + /RTC1 # https://learn.microsoft.com/en-us/cpp/build/reference/rtc-run-time-error-checks?view=msvc-170 + ) +endif() + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG DEBUG COMPILE_OPTIONS + /Ob0 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /Od # https://learn.microsoft.com/en-us/cpp/build/reference/od-disable-debug?view=msvc-170 + /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 +) + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELEASE COMPILE_OPTIONS + /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 + /Ob2 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 + /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 + /Gy- # https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 + /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 + /GF # https://learn.microsoft.com/en-us/cpp/build/reference/gf-eliminate-duplicate-strings?view=msvc-170 + /GS- # https://learn.microsoft.com/en-us/cpp/build/reference/gs-buffer-security-check?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 + /LTCG # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 (note: /GL implies fallback with LTCG) +) + +NBL_REQUEST_COMPILE_OPTION_SUPPORT(LANG ${LANG} CONFIG RELWITHDEBINFO COMPILE_OPTIONS + /O2 # https://learn.microsoft.com/en-us/cpp/build/reference/o1-o2-minimize-size-maximize-speed?view=msvc-170 + /Ob1 # https://learn.microsoft.com/en-us/cpp/build/reference/ob-inline-function-expansion?view=msvc-170 + /Oy- # https://learn.microsoft.com/en-us/cpp/build/reference/oy-frame-pointer-omission?view=msvc-170 + /DNDEBUG # https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/assert-macro-assert-wassert?view=msvc-170 + /GL # https://learn.microsoft.com/en-us/cpp/build/reference/gl-whole-program-optimization?view=msvc-170 + /Gy # 
https://learn.microsoft.com/en-us/cpp/build/reference/gy-enable-function-level-linking?view=msvc-170 + /sdl- # https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170 + + LINK_OPTIONS + /INCREMENTAL:NO # https://learn.microsoft.com/en-us/cpp/build/reference/incremental-link-incrementally?view=msvc-170 (note: cannot use /INCREMENTAL with /LTCG:incremental, would cause fallback) + /LTCG:incremental # https://learn.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=msvc-170 +) \ No newline at end of file diff --git a/cmake/adjust/template/vendor/impl/reset.cmake b/cmake/adjust/template/vendor/impl/reset.cmake new file mode 100644 index 0000000000..fc1230f326 --- /dev/null +++ b/cmake/adjust/template/vendor/impl/reset.cmake @@ -0,0 +1,10 @@ +# init profiles vars by resetting required lists + +foreach(LANG CXX C) + foreach(WHAT COMPILE LINK DEFINITIONS) + set(NBL_${LANG}_${WHAT}_OPTIONS "") + foreach(CONFIG RELEASE RELWITHDEBINFO DEBUG) + set(NBL_${LANG}_${CONFIG}_${WHAT}_OPTIONS "") + endforeach() + endforeach() +endforeach() \ No newline at end of file diff --git a/cmake/adjust/template/windows/msvc.cmake b/cmake/adjust/template/windows/msvc.cmake deleted file mode 100644 index 0f9fe365ee..0000000000 --- a/cmake/adjust/template/windows/msvc.cmake +++ /dev/null @@ -1,75 +0,0 @@ -include_guard(GLOBAL) - -# https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170 - -# The default instruction set is SSE2 if no /arch option is specified. -if(NBL_REQUEST_SSE_4_2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT("/arch:SSE4.2") -endif() - -# Enables Intel Advanced Vector Extensions 2. -if(NBL_REQUEST_SSE_AXV2) - NBL_REQUEST_COMPILE_OPTION_SUPPORT("/arch:AVX2") -endif() - -NBL_REQUEST_COMPILE_OPTION_SUPPORT(/Zc:preprocessor) - -# Debug -set(NBL_C_DEBUG_COMPILE_OPTIONS - /Ob0 /Od /MP${_NBL_JOBS_AMOUNT_} /fp:fast /Zc:wchar_t /INCREMENTAL -) - -if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_C_DEBUG_COMPILE_OPTIONS /RTC1) -endif() - -set(NBL_CXX_DEBUG_COMPILE_OPTIONS - /Zc:__cplusplus ${NBL_C_DEBUG_COMPILE_OPTIONS} -) - -set(NBL_DEBUG_COMPILE_OPTIONS - $<$:${NBL_CXX_DEBUG_COMPILE_OPTIONS}> - $<$:${NBL_C_DEBUG_COMPILE_OPTIONS}> -) - -# Release -set(NBL_C_RELEASE_COMPILE_OPTIONS - /O2 /Ob2 /DNDEBUG /GL /MP${_NBL_JOBS_AMOUNT_} /Gy- /Zc:wchar_t /sdl- /GF /GS- /fp:fast -) -set(NBL_CXX_RELEASE_COMPILE_OPTIONS - /Zc:__cplusplus ${NBL_C_RELEASE_COMPILE_OPTIONS} -) - -set(NBL_RELEASE_COMPILE_OPTIONS - $<$:${NBL_CXX_RELEASE_COMPILE_OPTIONS}> - $<$:${NBL_C_RELEASE_COMPILE_OPTIONS}> -) - -# RelWithDebInfo -set(NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS - /O2 /Ob1 /DNDEBUG /GL /Zc:wchar_t /MP${_NBL_JOBS_AMOUNT_} /Gy /sdl- /Oy- /fp:fast -) -set(NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS - /Zc:__cplusplus ${NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS} -) - -set(NBL_RELWITHDEBINFO_COMPILE_OPTIONS - $<$:${NBL_CXX_RELWITHDEBINFO_COMPILE_OPTIONS}> - $<$:${NBL_C_RELWITHDEBINFO_COMPILE_OPTIONS}> -) - -if(NBL_SANITIZE_ADDRESS) - list(APPEND NBL_C_COMPILE_OPTIONS /fsanitize=address) - list(APPEND NBL_CXX_COMPILE_OPTIONS ${NBL_C_COMPILE_OPTIONS}) -endif() - -set(NBL_COMPILE_OPTIONS - $<$:${NBL_CXX_COMPILE_OPTIONS}> - $<$:${NBL_C_COMPILE_OPTIONS}> -) - -# this should also be not part of profile, pasting from old flags-set function temporary -# TODO: use profile - -#reason for INCREMENTAL:NO: https://docs.microsoft.com/en-us/cpp/build/reference/ltcg-link-time-code-generation?view=vs-2019 /LTCG is not valid for use with /INCREMENTAL. 
-set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO} /INCREMENTAL:NO /LTCG:incremental") \ No newline at end of file diff --git a/cmake/common.cmake b/cmake/common.cmake index f0d3e27f36..69a0a5b980 100755 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -25,7 +25,7 @@ function(nbl_handle_dll_definitions _TARGET_ _SCOPE_) message(FATAL_ERROR "Internal error, requsted \"${_TARGET_}\" is not defined!") endif() - if(NBL_DYNAMIC_MSVC_RUNTIME) + if(NBL_COMPILER_DYNAMIC_RUNTIME) set(_NABLA_OUTPUT_DIR_ "${NBL_ROOT_PATH_BINARY}/src/nbl/$/devshgraphicsprogramming.nabla") target_compile_definitions(${_TARGET_} ${_SCOPE_} @@ -43,11 +43,7 @@ function(nbl_handle_runtime_lib_properties _TARGET_) message(FATAL_ERROR "Internal error, requsted \"${_TARGET_}\" is not defined!") endif() - if(NBL_DYNAMIC_MSVC_RUNTIME) - set_target_properties(${_TARGET_} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") - else() - set_target_properties(${_TARGET_} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") - endif() + set_target_properties(${_TARGET_} PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>$<$:DLL>") endfunction() # Macro creating project for an executable @@ -73,14 +69,6 @@ macro(nbl_create_executable_project _EXTRA_SOURCES _EXTRA_OPTIONS _EXTRA_INCLUDE add_executable(${EXECUTABLE_NAME} ${NBL_EXECUTABLE_SOURCES}) nbl_handle_runtime_lib_properties(${EXECUTABLE_NAME}) - - if(WIN32 AND MSVC) - if(NBL_DYNAMIC_MSVC_RUNTIME) - target_link_options(${EXECUTABLE_NAME} PUBLIC "/DELAYLOAD:$") - endif() - - target_link_options(${EXECUTABLE_NAME} PUBLIC "/DELAYLOAD:dxcompiler.dll") - endif() endif() nbl_handle_dll_definitions(${EXECUTABLE_NAME} PUBLIC) diff --git a/cmake/submodules/update.cmake b/cmake/submodules/update.cmake index 76e3603980..412cdf04e0 100644 --- a/cmake/submodules/update.cmake +++ b/cmake/submodules/update.cmake @@ -1,208 +1,91 @@ -include(ProcessorCount) find_package(Git REQUIRED) -option(NBL_UPDATE_GIT_SUBMODULE "Turn this ON to let CMake update all public submodules for you" ON) -option(NBL_FORCE_ON_UPDATE_GIT_SUBMODULE "Submodules will be updated with --force flag if NBL_FORCE_UPDATE_GIT_SUBMODULE is turned ON, use with caution - if there are any uncommited files in submodules' working tree they will be removed!" OFF) -option(NBL_SYNC_ON_UPDATE_GIT_SUBMODULE "Sync initialized submodule paths if NBL_FORCE_UPDATE_GIT_SUBMODULE is turned ON, this is useful when any submodule remote path got modified and you want to apply this modification to your local repository. Turning NBL_FORCE_ON_UPDATE_GIT_SUBMODULE implies this option" OFF) -option(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE "Turn this ON to attempt to update private Nabla submodules" OFF) -option(NBL_UPDATE_GIT_SUBMODULE_NO_SEPARATE_SHELL "Turn this ON to prevent CMake from executing git submodules update or sync in a separate shell - be aware that the interaction with shell will be impossible in case of paraphrase prompt request of your key!" 
ON) -option(NBL_CI_GIT_SUBMODULES_SHALLOW "" OFF) +option(NBL_UPDATE_GIT_SUBMODULE "Turn ON to update submodules, only public by default" ON) +option(NBL_FORCE_ON_UPDATE_GIT_SUBMODULE "NBL_UPDATE_GIT_SUBMODULE logic with --force flag" OFF) +option(NBL_SYNC_ON_UPDATE_GIT_SUBMODULE "Sync submodule URLs" OFF) +option(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE "NBL_UPDATE_GIT_SUBMODULE logic but includes private submodules, for Nabla devs" OFF) +option(NBL_SUBMODULES_SHALLOW "NBL_UPDATE_GIT_SUBMODULE logic with --depth=1" OFF) -if(NOT DEFINED NBL_ROOT_PATH) +if(NBL_UPDATE_GIT_SUBMODULE) +block() get_filename_component(NBL_ROOT_PATH "${CMAKE_CURRENT_LIST_DIR}/../../" ABSOLUTE) -endif() - -if(NOT DEFINED THIRD_PARTY_SOURCE_DIR) set(THIRD_PARTY_SOURCE_DIR "${NBL_ROOT_PATH}/3rdparty") -endif() -if(NOT DEFINED NBL_ROOT_PATH_BINARY) - set(NBL_ROOT_PATH_BINARY "${NBL_ROOT_PATH}/build/.submodules") -endif() + if(NOT DEFINED NBL_ROOT_PATH_BINARY) + set(NBL_ROOT_PATH_BINARY "${NBL_ROOT_PATH}/build/.submodules") + endif() -if(NOT DEFINED NBL_BUILD_EXAMPLES) - set(NBL_BUILD_EXAMPLES ON) -endif() + if(NOT DEFINED NBL_BUILD_EXAMPLES) + set(NBL_BUILD_EXAMPLES ON) + endif() -function(NBL_UPDATE_SUBMODULES) - ProcessorCount(_GIT_SUBMODULES_JOBS_AMOUNT_) - - if(NBL_CI_GIT_SUBMODULES_SHALLOW) - set(NBL_SHALLOW "--depth=1") - else() - set(NBL_SHALLOW "") + # we force HTTPS traffic for all *public* submodules we update from CMake + # NOTE: it *doesn't* rewrite destination URLs after checkout, if you eg. + # clone with SSH you end up with it anyway, this way your private key + # is never involved during CMake configuration, unless you + # use NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE + + # Private refs (*), exclude from public update + list(APPEND NBL_CONFIG_SUBMODULE -c submodule.\"Ditt-Reference-Scenes\".update=none) + + unset(NBL_UPDATE_OPTIONS) + + if(NBL_SUBMODULES_SHALLOW) + list(APPEND NBL_UPDATE_OPTIONS --depth=1) endif() - + if(NBL_FORCE_ON_UPDATE_GIT_SUBMODULE) - set(NBL_FORCE "--force") - else() - set(NBL_FORCE "") + list(APPEND NBL_UPDATE_OPTIONS --force) + endif() + + if(NOT NBL_BUILD_EXAMPLES) + list(APPEND NBL_CONFIG_SUBMODULE -c submodule.\"examples_tests\".update=none) endif() - macro(NBL_WRAPPER_COMMAND_EXCLUSIVE GIT_RELATIVE_ENTRY GIT_SUBMODULE_PATH SHOULD_RECURSIVE EXCLUDE_SUBMODULE_PATHS) - set(EXCLUDE_SUBMODULE_PATHS ${EXCLUDE_SUBMODULE_PATHS}) - set(SHOULD_RECURSIVE ${SHOULD_RECURSIVE}) - - if("${EXCLUDE_SUBMODULE_PATHS}" STREQUAL "") - set(NBL_EXCLUDE "") - else() - foreach(EXCLUDE_SUBMODULE_PATH ${EXCLUDE_SUBMODULE_PATHS}) - string(APPEND NBL_EXCLUDE "-c submodule.\"${EXCLUDE_SUBMODULE_PATH}\".update=none ") - endforeach() - - string(STRIP "${NBL_EXCLUDE}" NBL_EXCLUDE) - endif() - - if(SHOULD_RECURSIVE) - set(_NBL_EXECUTE_COMMAND_ "\"${GIT_EXECUTABLE}\" -C \"${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}\" ${NBL_EXCLUDE} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} ${NBL_FORCE} --recursive ${NBL_SHALLOW} ${GIT_SUBMODULE_PATH}") - else() - set(_NBL_EXECUTE_COMMAND_ "\"${GIT_EXECUTABLE}\" -C \"${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}\" ${NBL_EXCLUDE} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} ${NBL_FORCE} ${NBL_SHALLOW} ${GIT_SUBMODULE_PATH}") - endif() - - string(APPEND _NBL_UPDATE_SUBMODULES_COMMANDS_ "${_NBL_EXECUTE_COMMAND_}\n") - - unset(NBL_EXCLUDE) + macro(NBL_GIT_COMMAND) + execute_process(COMMAND "${GIT_EXECUTABLE}" ${ARGV}) endmacro() - - set(_NBL_UPDATE_SUBMODULES_CMD_NAME_ "nbl-update-submodules") - set(_NBL_UPDATE_SUBMODULES_CMD_FILE_ 
"${NBL_ROOT_PATH_BINARY}/${_NBL_UPDATE_SUBMODULES_CMD_NAME_}.cmd") - get_filename_component(_NBL_UPDATE_IMPL_CMAKE_FILE_ "${NBL_ROOT_PATH_BINARY}/${_NBL_UPDATE_SUBMODULES_CMD_NAME_}.cmake" ABSOLUTE) - - # Proxy script for inclusive submodule updating - string(APPEND NBL_IMPL_SCRIPT "set(NBL_ROOT_PATH \"${NBL_ROOT_PATH}\")\nset(_GIT_SUBMODULES_JOBS_AMOUNT_ ${_GIT_SUBMODULES_JOBS_AMOUNT_})\nset(GIT_EXECUTABLE \"${GIT_EXECUTABLE}\")\nset(NBL_SHALLOW \"${NBL_SHALLOW}\")\nset(NBL_FORCE \"${NBL_FORCE}\")\n\n") - string(APPEND NBL_IMPL_SCRIPT -[=[ -if(NOT DEFINED GIT_RELATIVE_ENTRY) - message(FATAL_ERROR "GIT_RELATIVE_ENTRY must be defined to use this script!") -endif() -if(NOT DEFINED INCLUDE_SUBMODULE_PATHS) - message(FATAL_ERROR "INCLUDE_SUBMODULE_PATHS must be defined to use this script!") -endif() + if(NBL_SYNC_ON_UPDATE_GIT_SUBMODULE) + message(STATUS "Syncing Public submodules") + NBL_GIT_COMMAND(${NBL_CONFIG_SUBMODULE} submodule sync --recursive WORKING_DIRECTORY "${NBL_ROOT_PATH}") + endif() + + message(STATUS "Updating Public submodules") + NBL_GIT_COMMAND(-c fetch.parallel=0 -c url.https://github.com/.insteadOf=git@github.com: ${NBL_CONFIG_SUBMODULE} submodule update --init --recursive ${NBL_UPDATE_OPTIONS} WORKING_DIRECTORY "${NBL_ROOT_PATH}") -# update an inclusive submodule first -execute_process(COMMAND "${GIT_EXECUTABLE}" -C "${NBL_ROOT_PATH}" submodule update --init "${GIT_RELATIVE_ENTRY}") + if(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE) + # NOTE: your git must be installed with default Git Bash as shell + # otherwise it *may* fail, whether it works depends on your agent setup -if("${INCLUDE_SUBMODULE_PATHS}" STREQUAL "") - set(NBL_SUBMODULE_UPDATE_CONFIG_ENTRY "") -else() - execute_process(COMMAND "${GIT_EXECUTABLE}" -C "${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}" config --file .gitmodules --get-regexp path - OUTPUT_VARIABLE NBL_OUTPUT_VARIABLE - ) + find_package(GitBash REQUIRED) - string(REGEX REPLACE "\n" ";" NBL_SUBMODULE_CONFIG_LIST "${NBL_OUTPUT_VARIABLE}") - - foreach(NBL_SUBMODULE_NAME ${NBL_SUBMODULE_CONFIG_LIST}) - string(REGEX MATCH "submodule\\.(.*)\\.path" NBL_SUBMODULE_NAME "${NBL_SUBMODULE_NAME}") - list(APPEND NBL_ALL_SUBMODULES "${CMAKE_MATCH_1}") - endforeach() - - foreach(NBL_SUBMODULE_NAME ${NBL_ALL_SUBMODULES}) - list(FIND INCLUDE_SUBMODULE_PATHS "${NBL_SUBMODULE_NAME}" NBL_FOUND) - - if("${NBL_FOUND}" STREQUAL "-1") - list(APPEND NBL_CONFIG_SETUP_CMD "-c;submodule.${NBL_SUBMODULE_NAME}.update=none") # filter submodules - only those on the INCLUDE_SUBMODULE_PATHS list will be updated when recursive update is requested, all left will be skipped - endif() - endforeach() -endif() - -execute_process(COMMAND "${GIT_EXECUTABLE}" ${NBL_CONFIG_SETUP_CMD} submodule update --init -j ${_GIT_SUBMODULES_JOBS_AMOUNT_} --recursive ${NBL_SHALLOW} ${NBL_FORCE} - WORKING_DIRECTORY "${NBL_ROOT_PATH}/${GIT_RELATIVE_ENTRY}" -) -]=] -) - file(WRITE "${_NBL_UPDATE_IMPL_CMAKE_FILE_}" "${NBL_IMPL_SCRIPT}") - - macro(NBL_WRAPPER_COMMAND_INCLUSIVE GIT_RELATIVE_ENTRY INCLUDE_SUBMODULE_PATHS) - string(APPEND _NBL_UPDATE_SUBMODULES_COMMANDS_ "\"${CMAKE_COMMAND}\" \"-DGIT_RELATIVE_ENTRY=${GIT_RELATIVE_ENTRY}\" \"-DINCLUDE_SUBMODULE_PATHS=${INCLUDE_SUBMODULE_PATHS}\" -P \"${_NBL_UPDATE_IMPL_CMAKE_FILE_}\"\n") - endmacro() - - if(NBL_UPDATE_GIT_SUBMODULE) - execute_process(COMMAND ${CMAKE_COMMAND} -E echo "All submodules are about to get updated and initialized in repository because NBL_UPDATE_GIT_SUBMODULE is turned ON!") - - include("${THIRD_PARTY_SOURCE_DIR}/boost/dep/wave.cmake") - - 
macro(NBL_IMPL_INIT_COMMON_SUBMODULES) - # 3rdparty except boost & gltf - set(NBL_3RDPARTY_MODULES_TO_SKIP - 3rdparty/boost/superproject # a lot of submodules we don't use - 3rdparty/glTFSampleModels # more then 2GB waste of space (disk + .gitmodules data) - ) - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./3rdparty TRUE "${NBL_3RDPARTY_MODULES_TO_SKIP}") - - # boost's 3rdparties, special case - set(NBL_BOOST_LIBS_TO_INIT ${NBL_BOOST_LIBS} wave numeric_conversion) # wave and all of its deps, numeric_conversion is nested in conversion submodule (for some reason boostdep tool doesn't output it properly) - foreach(NBL_TARGET ${NBL_BOOST_LIBS_TO_INIT}) - list(APPEND NBL_BOOST_SUBMODULES_TO_INIT ${NBL_TARGET}) - endforeach() - NBL_WRAPPER_COMMAND_INCLUSIVE(3rdparty/boost/superproject "${NBL_BOOST_SUBMODULES_TO_INIT}") - - # tests - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./tests FALSE "") - - # docker - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./docker FALSE "") + macro(NBL_GIT_BASH_COMMAND) + execute_process(COMMAND "${GIT_BASH_EXECUTABLE}" "-c" ${ARGV}) endmacro() - - NBL_IMPL_INIT_COMMON_SUBMODULES() - - if(NBL_UPDATE_GIT_SUBMODULE_INCLUDE_PRIVATE) - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./examples_tests TRUE "") - else() - # NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./ci TRUE "") TODO: enable it once we merge Ditt, etc - - # examples and their media - if(NBL_BUILD_EXAMPLES) - NBL_WRAPPER_COMMAND_EXCLUSIVE("" ./examples_tests FALSE "") - NBL_WRAPPER_COMMAND_EXCLUSIVE(examples_tests ./media FALSE "") - endif() - endif() - - file(WRITE "${_NBL_UPDATE_SUBMODULES_CMD_FILE_}" "${_NBL_UPDATE_SUBMODULES_COMMANDS_}") - - if(WIN32) - if(NBL_UPDATE_GIT_SUBMODULE_NO_SEPARATE_SHELL) - set(UPDATE_COMMAND - nbl-update-submodules.cmd - ) - - execute_process(COMMAND ${UPDATE_COMMAND} - WORKING_DIRECTORY "${NBL_ROOT_PATH_BINARY}" - RESULT_VARIABLE _NBL_TMP_RET_CODE_ - ) - else() - find_package(GitBash REQUIRED) - - execute_process(COMMAND "${GIT_BASH_EXECUTABLE}" "-c" + + message(STATUS "Updating Private submodules") + string(REPLACE ";" " " NBL_UPDATE_OPTIONS "${NBL_UPDATE_OPTIONS}") + set(LOG_FILE "${NBL_ROOT_PATH_BINARY}/nbl-update-private-submodules.log") + set(BASH_CMD [=[ >&2 echo "" clear -./nbl-update-submodules.cmd 2>&1 | tee nbl-update-submodules.log -sleep 1 +{ + echo "=== $(date) :: Starting private submodule update ===" + git -c submodule.Ditt-Reference-Scenes.update=checkout -C @NBL_ROOT_PATH@/examples_tests/media submodule update --init Ditt-Reference-Scenes @NBL_UPDATE_OPTIONS@ + # more private submodule here + + echo "=== $(date) :: Created @LOG_FILE@ in your build directory. ===" + echo "=== $(date) :: Finished private submodule update ===" +} 2>&1 | tee @LOG_FILE@ clear -tput setaf 2; echo -e "Submodules have been updated! -Created nbl-update-submodules.log in your build directory." 
]=] - WORKING_DIRECTORY ${NBL_ROOT_PATH_BINARY} - OUTPUT_VARIABLE _NBL_TMP_OUTPUT_ - RESULT_VARIABLE _NBL_TMP_RET_CODE_ - OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_STRIP_TRAILING_WHITESPACE - ) - - unset(_NBL_TMP_OUTPUT_) - unset(_NBL_TMP_RET_CODE_) - - message(STATUS "Generated \"${NBL_ROOT_PATH_BINARY}/nbl-update-submodules.log\"") - endif() - - message(STATUS "Submodules have been updated!") - else() - execute_process(COMMAND "${_NBL_UPDATE_SUBMODULES_CMD_FILE_}") - endif() - else() - execute_process(COMMAND ${CMAKE_COMMAND} -E echo "NBL_UPDATE_GIT_SUBMODULE is turned OFF therefore submodules won't get updated.") + ) + string(CONFIGURE "${BASH_CMD}" BASH_CMD) + NBL_GIT_BASH_COMMAND("${BASH_CMD}" OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_STRIP_TRAILING_WHITESPACE RESULT_VARIABLE RES) + file(READ "${LOG_FILE}" LOG_CONTENT) + message(STATUS "${LOG_CONTENT}") endif() -endfunction() - -NBL_UPDATE_SUBMODULES() \ No newline at end of file +endblock() +endif() \ No newline at end of file diff --git a/compose.yml b/compose.yml new file mode 100644 index 0000000000..c80bdb4319 --- /dev/null +++ b/compose.yml @@ -0,0 +1,22 @@ +services: + nsc: + container_name: nsc-godbolt + image: ghcr.io/devsh-graphics-programming/nabla-shader-compiler-godbolt:latest + isolation: process + ports: + - "80:10240" + volumes: + - type: bind + source: C:\Windows\Globalization\ICU + target: C:\Windows\Globalization\ICU + read_only: true + - type: bind + source: C:\Windows\System32 + target: C:\mount\Windows\System32 + read_only: true + restart: always + +networks: + default: + external: true + name: docker_default diff --git a/docker/.env b/docker/.env deleted file mode 100644 index 623184f422..0000000000 --- a/docker/.env +++ /dev/null @@ -1,2 +0,0 @@ -THIS_PROJECT_WORKING_DIRECTORY=C:\docker -THIS_PROJECT_NABLA_DIRECTORY=C:/Users/ContainerAdministrator/Nabla/bind \ No newline at end of file diff --git a/docker/ci-windows.env b/docker/ci-windows.env new file mode 100644 index 0000000000..ea89ce43c7 --- /dev/null +++ b/docker/ci-windows.env @@ -0,0 +1,2 @@ +NBL_CI_MODE=ON +NBL_CI_BUILD_DIRECTORY=C:\mount\nabla\build-ct \ No newline at end of file diff --git a/docker/compiler-explorer b/docker/compiler-explorer index e7d3e6ce85..45866dfa87 160000 --- a/docker/compiler-explorer +++ b/docker/compiler-explorer @@ -1 +1 @@ -Subproject commit e7d3e6ce85d4b87bd9afadc5b2ba8c268ccbeb51 +Subproject commit 45866dfa8782404fc121f25ce15ad0626b474db0 diff --git a/docker/msvc-winsdk b/docker/msvc-winsdk new file mode 160000 index 0000000000..d91a96faed --- /dev/null +++ b/docker/msvc-winsdk @@ -0,0 +1 @@ +Subproject commit d91a96faede2933ec02a18b94141fbed549929c0 diff --git a/docker/ninja.env b/docker/ninja.env new file mode 100644 index 0000000000..6d52cbd701 --- /dev/null +++ b/docker/ninja.env @@ -0,0 +1 @@ +NINJA_STATUS=[%r jobs, %f/%t edges, %oe/s, elapsed %ws]: \ No newline at end of file diff --git a/examples_tests b/examples_tests index 8c76367c1c..95d8f78465 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8c76367c1c226cce3d66f1c60f540e29a501a1cb +Subproject commit 95d8f78465e100bb3a926cea412c21891c800b9d diff --git a/include/nbl/asset/IAccelerationStructure.h b/include/nbl/asset/IAccelerationStructure.h index d251dd3077..829d10bcd8 100644 --- a/include/nbl/asset/IAccelerationStructure.h +++ b/include/nbl/asset/IAccelerationStructure.h @@ -59,11 +59,11 @@ class IBottomLevelAccelerationStructure : public IAccelerationStructure // build flags, we don't expose flags that don't make sense for certain 
levels enum class BUILD_FLAGS : uint16_t { - ALLOW_UPDATE_BIT = base_build_flags_t::ALLOW_UPDATE_BIT, - ALLOW_COMPACTION_BIT = base_build_flags_t::ALLOW_COMPACTION_BIT, - PREFER_FAST_TRACE_BIT = base_build_flags_t::PREFER_FAST_TRACE_BIT, - PREFER_FAST_BUILD_BIT = base_build_flags_t::PREFER_FAST_BUILD_BIT, - LOW_MEMORY_BIT = base_build_flags_t::LOW_MEMORY_BIT, + ALLOW_UPDATE_BIT = static_cast(base_build_flags_t::ALLOW_UPDATE_BIT), + ALLOW_COMPACTION_BIT = static_cast(base_build_flags_t::ALLOW_COMPACTION_BIT), + PREFER_FAST_TRACE_BIT = static_cast(base_build_flags_t::PREFER_FAST_TRACE_BIT), + PREFER_FAST_BUILD_BIT = static_cast(base_build_flags_t::PREFER_FAST_BUILD_BIT), + LOW_MEMORY_BIT = static_cast(base_build_flags_t::LOW_MEMORY_BIT), // Synthetic flag we use to indicate that the build data are AABBs instead of triangles, we've taken away the per-geometry choice thanks to: // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureBuildGeometryInfoKHR-type-03792 GEOMETRY_TYPE_IS_AABB_BIT = 0x1u<<5u, @@ -88,42 +88,62 @@ class IBottomLevelAccelerationStructure : public IAccelerationStructure NO_DUPLICATE_ANY_HIT_INVOCATION_BIT = 0x1u<<1u, }; + enum class GeometryType : uint8_t + { + Triangles = 0, + AABBs = 1, + // Later: LSS and friends + Count = 2 + }; + // Note that in Vulkan strides are 64-bit value but restricted to be 32-bit in range - template requires std::is_base_of_v + template requires (!std::is_const_v && std::is_base_of_v) struct Triangles { - using buffer_t = std::remove_const_t; - constexpr static inline bool Host = std::is_same_v; - // we make our life easier by not taking pointers to single matrix values - using transform_t = std::conditional_t>; - - inline bool hasTransform() const - { - if constexpr (Host) - return !core::isnan(transform[0][0]); - else - return bool(transform.buffer); - } - - // optional, only useful for baking model transforms of multiple meshes into one BLAS - transform_t transform = {}; - // vertexData[1] are the vertex positions at time 1.0, and only used for AccelerationStructures created with `MOTION_BIT` - asset::SBufferBinding vertexData[2] = {{},{}}; - asset::SBufferBinding indexData = {}; - uint32_t maxVertex = 0u; - // type implicitly satisfies: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureGeometryTrianglesDataKHR-vertexStride-03819 - uint32_t vertexStride = sizeof(float); - E_FORMAT vertexFormat = EF_R32G32B32_SFLOAT; - E_INDEX_TYPE indexType = EIT_UNKNOWN; - core::bitflag geometryFlags = GEOMETRY_FLAGS::NONE; - // TODO: opacity and displacement micromap buffers and shizz + public: + using buffer_t = BufferType; + constexpr static inline GeometryType Type = GeometryType::Triangles; + + constexpr static inline bool HostTransform = std::is_same_v; + // we make our life easier by not taking pointers to single matrix values + using transform_t = std::conditional_t>; + + inline bool hasTransform() const + { + if constexpr (HostTransform) + return !core::isnan(transform[0][0]); + else + return bool(transform.buffer); + } + + // optional, only useful for baking model transforms of multiple meshes into one BLAS + transform_t transform = __transform_initializer(); + // vertexData[1] are the vertex positions at time 1.0, and only used for AccelerationStructures created with `MOTION_BIT` + asset::SBufferBinding vertexData[2] = {{},{}}; + asset::SBufferBinding indexData = {}; + uint32_t maxVertex = 0u; + // type implicitly satisfies: 
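// Illustrative aside, not part of the patch: the static_casts introduced above are required because
// enumerators of a scoped enum cannot be initialized directly from another enum class' values;
// the underlying integer has to be produced explicitly. A minimal standalone reproduction:
#include <cstdint>
enum class base_flags_t : uint16_t { ALLOW_UPDATE_BIT = 0x1u<<0u };
enum class derived_flags_t : uint16_t
{
    // ALLOW_UPDATE_BIT = base_flags_t::ALLOW_UPDATE_BIT,                     // ill-formed: no implicit conversion
    ALLOW_UPDATE_BIT = static_cast<uint16_t>(base_flags_t::ALLOW_UPDATE_BIT), // OK
};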
https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureGeometryTrianglesDataKHR-vertexStride-03819 + uint32_t vertexStride = sizeof(float); + E_FORMAT vertexFormat = EF_R32G32B32_SFLOAT; + E_INDEX_TYPE indexType = EIT_UNKNOWN; + core::bitflag geometryFlags = GEOMETRY_FLAGS::NONE; + // TODO: opacity and displacement micromap buffers and shizz + + private: + constexpr static transform_t __transform_initializer() + { + if constexpr (HostTransform) + return hlsl::float32_t3x4(std::numeric_limits::quiet_NaN()); + return {}; + } }; // - template requires std::is_base_of_v + template requires (!std::is_const_v && std::is_base_of_v) struct AABBs { - using buffer_t = std::remove_const_t; + using buffer_t = BufferType; + constexpr static inline GeometryType Type = GeometryType::AABBs; // for `MOTION_BIT` you don't get a second buffer for AABBs at different times because linear interpolation of AABBs doesn't work asset::SBufferBinding data = {}; diff --git a/include/nbl/asset/IAsset.h b/include/nbl/asset/IAsset.h index fdb41ed298..aae73fac2a 100644 --- a/include/nbl/asset/IAsset.h +++ b/include/nbl/asset/IAsset.h @@ -94,6 +94,7 @@ class IAsset : virtual public core::IReferenceCounted ET_COMPUTE_PIPELINE = 1ull<<20, //!< asset::ICPUComputePipeline ET_PIPELINE_CACHE = 1ull<<21, //!< asset::ICPUPipelineCache ET_SCENE = 1ull<<22, //!< reserved, to implement later + ET_RAYTRACING_PIPELINE = 1ull << 23, //!< asset::ICPURayTracingPipeline ET_IMPLEMENTATION_SPECIFIC_METADATA = 1ull<<31u, //!< lights, etc. //! Reserved special value used for things like terminating lists of this enum @@ -155,30 +156,37 @@ class IAsset : virtual public core::IReferenceCounted //! inline bool isMutable() const {return m_mutable;} - //! - virtual size_t getDependantCount() const = 0; - inline IAsset* getDependant(const size_t ix) - { - if (ix(this)->getDependant(ix); - return retval; - } + inline void visitDependents(std::function visit) const + { + visitDependents_impl([&visit](const IAsset* dep)->bool + { + if (dep) + return visit(dep); + return true; + }); + } + + inline void visitDependents(std::function visit) + { + assert(isMutable()); + visitDependents([&](const IAsset* dependent) -> bool + { + return visit(const_cast(dependent)); + }); + } + + virtual bool valid() const = 0; protected: inline IAsset() = default; //! 
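// Illustrative sketch, not part of the patch: filling a host-side triangle geometry. The exact
// instantiation (Triangles<ICPUBuffer>) and the float32_t3x4 transform type are assumptions made
// here, since the template arguments are elided in the hunk above.
#include <cassert>
#include <cstdint>
using HostTriangles = nbl::asset::ICPUBottomLevelAccelerationStructure::Triangles<nbl::asset::ICPUBuffer>;

inline HostTriangles makeHostTriangles(const uint32_t vertexCount)
{
    HostTriangles geom = {};
    assert(!geom.hasTransform()); // the host transform defaults to the NaN sentinel, so nothing gets baked in
    geom.maxVertex = vertexCount - 1u;
    geom.vertexStride = 3u*sizeof(float);
    geom.vertexFormat = nbl::asset::EF_R32G32B32_SFLOAT;
    geom.indexType = nbl::asset::EIT_32BIT; // assumes the usual E_INDEX_TYPE enumerator
    // assigning a real matrix to geom.transform replaces the NaN sentinel and makes hasTransform() return true
    return geom;
}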
Pure virtual destructor to ensure no instantiation NBL_API2 virtual ~IAsset() = 0; - virtual IAsset* getDependant_impl(const size_t ix) = 0; - private: friend IAssetManager; bool m_mutable = true; + + virtual void visitDependents_impl(std::function visit) const = 0; }; template diff --git a/include/nbl/asset/ICPUAccelerationStructure.h b/include/nbl/asset/ICPUAccelerationStructure.h index 9c9af32f7b..a6b148a891 100644 --- a/include/nbl/asset/ICPUAccelerationStructure.h +++ b/include/nbl/asset/ICPUAccelerationStructure.h @@ -135,12 +135,10 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo return cp; } - // Do not report anything as a dependant, we'll simply drop the data instead of discarding its contents - inline size_t getDependantCount() const override {return 0;} inline core::blake3_hash_t computeContentHash() const override { - if (!missingContent()) + if (missingContent()) return INVALID_HASH; const bool isAABB = m_buildFlags.hasFlags(BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT); core::blake3_hasher hasher; @@ -233,11 +231,36 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo return !m_geometryPrimitiveCount || !m_triangleGeoms && !m_AABBGeoms; } + inline bool valid() const override + { + if (!validBuildFlags(m_buildFlags)) return false; + + size_t geometryCount = 0; + if (m_buildFlags.hasFlags(BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + { + if (!m_AABBGeoms || m_triangleGeoms) return false; + geometryCount = m_AABBGeoms->size(); + } + else + { + if (!m_triangleGeoms || m_AABBGeoms) return false; + geometryCount = m_triangleGeoms->size(); + } + + // https://registry.khronos.org/vulkan/specs/latest/man/html/vkGetAccelerationStructureBuildSizesKHR.html#VUID-vkGetAccelerationStructureBuildSizesKHR-pBuildInfo-03619 + if (geometryCount == 0) { + if (m_geometryPrimitiveCount && m_geometryPrimitiveCount->size() > 0) return false; + } + else + { + if (!m_geometryPrimitiveCount || m_geometryPrimitiveCount->size() != geometryCount) return false; + } + return true; + } + protected: virtual ~ICPUBottomLevelAccelerationStructure() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} - inline void discardContent_impl() override { m_triangleGeoms = nullptr; @@ -251,6 +274,8 @@ class ICPUBottomLevelAccelerationStructure final : public IPreHashed, public IBo core::smart_refctd_dynamic_array> m_AABBGeoms = nullptr; core::smart_refctd_dynamic_array m_geometryPrimitiveCount = nullptr; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_TRACE_BIT; + + inline void visitDependents_impl(std::function visit) const override {} }; class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelAccelerationStructure @@ -263,9 +288,6 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA // ICPUTopLevelAccelerationStructure() = default; - // - inline size_t getDependantCount() const override {return m_instances->size();} - // inline auto& getBuildRangeInfo() { @@ -357,18 +379,32 @@ class ICPUTopLevelAccelerationStructure final : public IAsset, public ITopLevelA return cp; } - protected: - virtual ~ICPUTopLevelAccelerationStructure() = default; - - inline IAsset* getDependant_impl(const size_t ix) override + inline bool valid() const override { - return m_instances->operator[](ix).getBase().blas.get(); + if (!validBuildFlags(m_buildFlags)) return false; + if (!m_instances) return false; + for (const auto& instance : *m_instances) + if (!instance.getBase().blas->valid()) 
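// Illustrative sketch, not part of the patch: the old getDependantCount()/getDependant() walk is
// now expressed through the visitor; returning false from the callback stops the traversal early,
// and null dependents are filtered out before the callback is invoked.
#include <cstddef>
inline size_t countDependents(const nbl::asset::IAsset* asset)
{
    size_t count = 0;
    asset->visitDependents([&count](const nbl::asset::IAsset* dep) -> bool
    {
        count++;     // `dep` is never null here
        return true; // keep visiting
    });
    return count;
}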
return false; + if (m_buildRangeInfo.instanceCount != m_instances->size()) return false; + // https://registry.khronos.org/vulkan/specs/latest/man/html/VkAccelerationStructureBuildRangeInfoKHR.html#VUID-VkAccelerationStructureBuildRangeInfoKHR-primitiveOffset-03660 + if (m_buildRangeInfo.instanceByteOffset % 16 != 0) return false; + return true; } + protected: + virtual ~ICPUTopLevelAccelerationStructure() = default; + private: core::smart_refctd_dynamic_array m_instances = nullptr; hlsl::acceleration_structures::top_level::BuildRangeInfo m_buildRangeInfo; core::bitflag m_buildFlags = BUILD_FLAGS::PREFER_FAST_BUILD_BIT; + + inline void visitDependents_impl(std::function visit) const override + { + if (!m_instances) return; + for (const auto& instance : *m_instances) + if (!visit(instance.getBase().blas.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUAnimationLibrary.h b/include/nbl/asset/ICPUAnimationLibrary.h index 1b02787597..321cefa33b 100644 --- a/include/nbl/asset/ICPUAnimationLibrary.h +++ b/include/nbl/asset/ICPUAnimationLibrary.h @@ -95,23 +95,16 @@ class ICPUAnimationLibrary final : public IAnimationLibrary, public constexpr static inline auto AssetType = ET_ANIMATION_LIBRARY; inline E_TYPE getAssetType() const override { return AssetType; } + inline bool valid() const override { return true; } - inline size_t getDependantCount() const override {return 3;} + private: - protected: - inline IAsset* getDependant_impl(const size_t ix) override - { - switch (ix) - { - case 0: - return m_keyframeStorageBinding.buffer.get(); - case 1: - return m_timestampStorageBinding.buffer.get(); - default: - break; - } - return m_animationStorageRange.buffer.get(); - } + inline void visitDependents_impl(std::function visit) const override + { + if (!visit(m_keyframeStorageBinding.buffer.get())) return; + if (!visit(m_timestampStorageBinding.buffer.get())) return; + if (!visit(m_animationStorageRange.buffer.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUBuffer.h b/include/nbl/asset/ICPUBuffer.h index 5bb16bd0ac..46105b3c0e 100644 --- a/include/nbl/asset/ICPUBuffer.h +++ b/include/nbl/asset/ICPUBuffer.h @@ -75,8 +75,6 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed constexpr static inline auto AssetType = ET_BUFFER; inline IAsset::E_TYPE getAssetType() const override final { return AssetType; } - inline size_t getDependantCount() const override { return 0; } - inline core::blake3_hash_t computeContentHash() const override { core::blake3_hasher hasher; @@ -112,12 +110,15 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed return true; } -protected: - inline IAsset* getDependant_impl(const size_t ix) override - { - return nullptr; - } + inline bool valid() const override + { + if (!m_data) return false; + if (!m_mem_resource) return false; + // check if alignment is power of two + return (m_alignment > 0 && !(m_alignment & (m_alignment - 1))); + } +protected: inline void discardContent_impl() override { if (m_data) @@ -136,6 +137,8 @@ class ICPUBuffer final : public asset::IBuffer, public IPreHashed discardContent_impl(); } + inline void visitDependents_impl(std::function visit) const override {} + void* m_data; core::smart_refctd_ptr m_mem_resource; size_t m_alignment; diff --git a/include/nbl/asset/ICPUBufferView.h b/include/nbl/asset/ICPUBufferView.h index 3819136c98..8634fd8394 100644 --- a/include/nbl/asset/ICPUBufferView.h +++ b/include/nbl/asset/ICPUBufferView.h @@ -28,8 +28,6 @@ class ICPUBufferView : public IBufferView, public IAsset 
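// Illustrative aside, not part of the patch: the `m_alignment & (m_alignment - 1)` test above is
// the standard power-of-two check; a standalone equivalent with worked values:
#include <cstddef>
constexpr bool isPowerOfTwo(const size_t x) { return x > 0 && !(x & (x - 1)); }
static_assert(isPowerOfTwo(8));   // 0b1000 & 0b0111 == 0
static_assert(!isPowerOfTwo(12)); // 0b1100 & 0b1011 == 0b1000, not zero
static_assert(!isPowerOfTwo(0));  // zero alignment is rejected by the `> 0` term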
constexpr static inline auto AssetType = ET_BUFFER_VIEW; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - inline size_t getDependantCount() const override {return 1;} - ICPUBuffer* getUnderlyingBuffer() { assert(isMutable()); @@ -48,12 +46,24 @@ class ICPUBufferView : public IBufferView, public IAsset m_size = _size; } + inline bool valid() const override + { + if (!m_buffer->valid()) return false; + if (m_offset >= m_buffer->getSize()) return false; + if (m_size <= 0) return false; + if (m_offset >= m_buffer->getSize()) return false; + if (m_size > m_buffer->getSize() - m_offset) return false; + return true; + } + protected: virtual ~ICPUBufferView() = default; - inline IAsset* getDependant_impl(const size_t ix) override + private: + + inline void visitDependents_impl(std::function visit) const override { - return m_buffer.get(); + if (!visit(m_buffer.get())) return; } }; diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h index b9b707d9fc..ffcf78e908 100644 --- a/include/nbl/asset/ICPUComputePipeline.h +++ b/include/nbl/asset/ICPUComputePipeline.h @@ -6,62 +6,92 @@ #include "nbl/asset/ICPUPipeline.h" +#include "nbl/asset/IComputePipeline.h" namespace nbl::asset { //! CPU Version of Compute Pipeline -class ICPUComputePipeline : public ICPUPipeline,1> +class ICPUComputePipeline final : public ICPUPipeline> { - using base_t = ICPUPipeline,1>; + using pipeline_base_t = IComputePipeline; + using base_t = ICPUPipeline>; public: - struct SCreationParams final : IPipeline::SCreationParams - { - SShaderSpecInfo shader; - }; - static core::smart_refctd_ptr create(const SCreationParams& params) + + static core::smart_refctd_ptr create(ICPUPipelineLayout* layout) { - if (!params.layout) - return nullptr; - auto retval = new ICPUComputePipeline(core::smart_refctd_ptr(params.layout)); - if (!retval->setSpecInfo(params.shader)) - { - retval->drop(); - return nullptr; - } + auto retval = new ICPUComputePipeline(layout); return core::smart_refctd_ptr(retval,core::dont_grab); } constexpr static inline auto AssetType = ET_COMPUTE_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } - - //! 
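// Illustrative aside, not part of the patch: the view validity test above amounts to the usual
// "offset + size fits inside the buffer" check, written so it cannot overflow; a standalone
// equivalent with worked values:
#include <cstddef>
constexpr bool viewFits(const size_t bufferSize, const size_t offset, const size_t size)
{
    return size > 0 && offset < bufferSize && size <= bufferSize - offset;
}
static_assert(viewFits(256, 192, 64));  // reaches exactly the end of the buffer
static_assert(!viewFits(256, 192, 96)); // 192 + 96 > 256, so ICPUBufferView::valid() would return false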
- inline size_t getDependantCount() const override {return 2;} - // provide default arg - inline IPipelineBase::SShaderSpecInfo getSpecInfo() const {return base_t::getSpecInfo(hlsl::ShaderStage::ESS_COMPUTE);} + inline std::span getSpecInfos(const hlsl::ShaderStage stage) const override + { + if (stage==hlsl::ShaderStage::ESS_COMPUTE) + return {&m_specInfo,1}; + return {}; + } + + inline std::span getSpecInfos(const hlsl::ShaderStage stage) + { + return base_t::getSpecInfos(stage); + } + + inline SShaderSpecInfo& getSpecInfo() + { + return m_specInfo; + } + + inline const SShaderSpecInfo& getSpecInfo() const + { + return m_specInfo; + } + + inline const SCachedCreationParams& getCachedCreationParams() const + { + return pipeline_base_t::getCachedCreationParams(); + } + + inline SCachedCreationParams& getCachedCreationParams() + { + assert(isMutable()); + return m_params; + } + + inline bool valid() const override + { + if (!m_layout) return false; + if (!m_layout->valid()) return false; + return m_specInfo.valid(); + } protected: using base_t::base_t; virtual ~ICPUComputePipeline() = default; - base_t* clone_impl(core::smart_refctd_ptr&& layout) const override - { - return new ICPUComputePipeline(std::move(layout)); - } - - inline IAsset* getDependant_impl(const size_t ix) override + + private: + SShaderSpecInfo m_specInfo; + + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final { - if (ix!=0) - return m_stages[0].shader.get(); - return const_cast(m_layout.get()); + auto newPipeline = new ICPUComputePipeline(layout.get()); + newPipeline->m_specInfo = m_specInfo.clone(depth); + return core::smart_refctd_ptr(newPipeline, core::dont_grab); } - inline int8_t stageToIndex(const hlsl::ShaderStage stage) const override + explicit ICPUComputePipeline(ICPUPipelineLayout* layout): + base_t(layout, {}) + {} + + inline void visitDependents_impl(std::function visit) const override { - return stage!=hlsl::ShaderStage::ESS_COMPUTE ? 
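// Illustrative sketch, not part of the patch: the new creation path only takes the layout and the
// single compute stage is filled in afterwards through the mutable spec-info accessor. The shader
// pointer type is assumed from context since it is elided above.
#include <cassert>
#include <utility>
inline nbl::core::smart_refctd_ptr<nbl::asset::ICPUComputePipeline> createComputePipeline(
    nbl::asset::ICPUPipelineLayout* layout, nbl::core::smart_refctd_ptr<nbl::asset::IShader> shader)
{
    auto pipeline = nbl::asset::ICPUComputePipeline::create(layout);
    auto& specInfo = pipeline->getSpecInfo();
    specInfo.shader = std::move(shader);
    specInfo.entryPoint = "main";
    // valid() requires a valid layout and also checks the spec-info itself
    assert(pipeline->valid());
    return pipeline;
}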
(-1):0; + if (!visit(m_layout.get())) return; + if (!visit(m_specInfo.shader.get())) return; } }; diff --git a/include/nbl/asset/ICPUDescriptorSet.h b/include/nbl/asset/ICPUDescriptorSet.h index 826c54cc39..4247283c0e 100644 --- a/include/nbl/asset/ICPUDescriptorSet.h +++ b/include/nbl/asset/ICPUDescriptorSet.h @@ -47,8 +47,6 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSetgetTotalBindingCount()+1;} - // inline ICPUDescriptorSetLayout* getLayout() { @@ -79,14 +77,74 @@ class NBL_API2 ICPUDescriptorSet final : public IDescriptorSet clone(uint32_t _depth = ~0u) const override; + inline bool valid() const override { + if (!m_layout->valid()) return false; + for (auto type_i = 0u; type_i < static_cast(IDescriptor::E_TYPE::ET_COUNT); type_i++) + { + const auto descriptorType = static_cast(type_i); + const auto descriptorCategory = IDescriptor::GetTypeCategory(descriptorType); + const auto& descriptorRedirect = m_layout->getDescriptorRedirect(descriptorType); + const auto& descriptorInfoArr = m_descriptorInfos[type_i]; + + if (descriptorInfoArr->size() != descriptorRedirect.getTotalCount()) return false; + + auto offset = 0; + for (auto binding_i = 0; binding_i < descriptorRedirect.getBindingCount(); binding_i++) + { + const auto storageIndex = IDescriptorSetLayoutBase::CBindingRedirect::storage_range_index_t(binding_i); + const auto descriptorCount = descriptorRedirect.getCount(storageIndex); + const auto createFlags = descriptorRedirect.getCreateFlags(storageIndex); + const auto isPartiallyBound = !createFlags.hasFlags(IDescriptorSetLayoutBase::SBindingBase::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT); + for (auto descriptor_i = 0; descriptor_i < descriptorCount; descriptor_i++) + { + const auto& descriptorInfo = descriptorInfoArr->operator[](offset); + + // partiallyBound layout can have null descriptor, otherwise not + if (!isPartiallyBound && !descriptorInfo.desc) return false; + if (descriptorInfo.desc && descriptorInfo.desc->getTypeCategory() != descriptorCategory) return false; + } + } + } + + return true; + } + protected: virtual ~ICPUDescriptorSet() = default; - IAsset* getDependant_impl(size_t ix) override; private: core::smart_refctd_dynamic_array m_descriptorInfos[static_cast(IDescriptor::E_TYPE::ET_COUNT)]; + + inline void visitDependents_impl(std::function visit) const override + { + for (auto i = 0u; i < static_cast(IDescriptor::E_TYPE::ET_COUNT); i++) + { + if (!m_descriptorInfos[i]) continue; + const auto size = m_descriptorInfos[i]->size(); + for (auto desc_i = 0u; desc_i < size; desc_i++) + { + auto* desc = m_descriptorInfos[i]->operator[](desc_i).desc.get(); + if (!desc) continue; + switch (IDescriptor::GetTypeCategory(static_cast(i))) + { + case IDescriptor::EC_BUFFER: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_SAMPLER: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_IMAGE: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_BUFFER_VIEW: + if (!visit(static_cast(desc))) return; + case IDescriptor::EC_ACCELERATION_STRUCTURE: + if (!visit(static_cast(desc))) return; + default: + break; + } + } + } + } }; } diff --git a/include/nbl/asset/ICPUDescriptorSetLayout.h b/include/nbl/asset/ICPUDescriptorSetLayout.h index 8f45a789ea..a46bb55808 100644 --- a/include/nbl/asset/ICPUDescriptorSetLayout.h +++ b/include/nbl/asset/ICPUDescriptorSetLayout.h @@ -56,16 +56,23 @@ class ICPUDescriptorSetLayout : public IDescriptorSetLayout, public constexpr static inline auto AssetType = ET_DESCRIPTOR_SET_LAYOUT; inline E_TYPE 
getAssetType() const override { return AssetType; } - - inline size_t getDependantCount() const override {return m_immutableSamplers ? m_immutableSamplers->size():0;} + inline bool valid() const override + { + return true; // no modification is possible after creation + } protected: virtual ~ICPUDescriptorSetLayout() = default; - inline IAsset* getDependant_impl(const size_t ix) override - { - return m_immutableSamplers->operator[](ix).get(); - } + + private: + + inline void visitDependents_impl(std::function visit) const override + { + if (!m_immutableSamplers) return; + for (const auto& sampler : *m_immutableSamplers) + if (!visit(sampler.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h index 2643db7550..acc990f18c 100644 --- a/include/nbl/asset/ICPUGraphicsPipeline.h +++ b/include/nbl/asset/ICPUGraphicsPipeline.h @@ -13,91 +13,127 @@ namespace nbl::asset { -class ICPUGraphicsPipeline final : public ICPUPipeline,5u> +class ICPUGraphicsPipeline final : public ICPUPipeline> { - using pipeline_base_t = IGraphicsPipeline; - using base_t = ICPUPipeline; + using pipeline_base_t = IGraphicsPipeline; + using base_t = ICPUPipeline; public: - struct SCreationParams final : pipeline_base_t::SCreationParams - { - private: - friend class ICPUGraphicsPipeline; - template - inline bool impl_valid(ExtraLambda&& extra) const - { - return pipeline_base_t::SCreationParams::impl_valid(std::move(extra)); - } - }; - static core::smart_refctd_ptr create(const SCreationParams& params) - { - // we'll validate the specialization info later when attempting to set it - if (!params.impl_valid([](const IPipelineBase::SShaderSpecInfo& info)->bool{return true;})) - return nullptr; - auto retval = new ICPUGraphicsPipeline(params); - for (const auto spec : params.shaders) - if (spec.shader) - retval->setSpecInfo(spec); + + static core::smart_refctd_ptr create(ICPUPipelineLayout* layout, ICPURenderpass* renderpass = nullptr) + { + auto retval = new ICPUGraphicsPipeline(layout, renderpass); return core::smart_refctd_ptr(retval,core::dont_grab); - } - - constexpr static inline auto AssetType = ET_GRAPHICS_PIPELINE; - inline E_TYPE getAssetType() const override { return AssetType; } - - inline size_t getDependantCount() const override - { - auto stageCount = 2; // the layout and renderpass - for (const auto& stage : m_stages) - if (stage.shader) - stageCount++; - return stageCount; - } - - // extras for this class - inline const SCachedCreationParams& getCachedCreationParams() const {return base_t::getCachedCreationParams();} + } + + constexpr static inline auto AssetType = ET_GRAPHICS_PIPELINE; + inline E_TYPE getAssetType() const override { return AssetType; } + + inline const SCachedCreationParams& getCachedCreationParams() const + { + return pipeline_base_t::getCachedCreationParams(); + } + inline SCachedCreationParams& getCachedCreationParams() { assert(isMutable()); return m_params; } + inline std::span getSpecInfos(const hlsl::ShaderStage stage) const override final + { + const auto stageIndex = stageToIndex(stage); + if (stageIndex != -1) + return { &m_specInfos[stageIndex], 1 }; + return {}; + } + + inline std::span getSpecInfos(const hlsl::ShaderStage stage) + { + return base_t::getSpecInfos(stage); + } + + SShaderSpecInfo* getSpecInfo(const hlsl::ShaderStage stage) + { + if (!isMutable()) return nullptr; + const auto stageIndex = stageToIndex(stage); + if (stageIndex != -1) + return &m_specInfos[stageIndex]; + return nullptr; + } + + const 
SShaderSpecInfo* getSpecInfo(const hlsl::ShaderStage stage) const + { + const auto stageIndex = stageToIndex(stage); + if (stageIndex != -1) + return &m_specInfos[stageIndex]; + return nullptr; + } + + inline bool valid() const override + { + if (!m_layout) return false; + if (!m_layout->valid())return false; + + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-dynamicRendering-06576 + if (!m_renderpass || m_params.subpassIx >= m_renderpass->getSubpassCount()) return false; + + core::bitflag stagePresence = {}; + for (auto shader_i = 0u; shader_i < m_specInfos.size(); shader_i++) + { + const auto& info = m_specInfos[shader_i]; + if (info.shader) + stagePresence |= indexToStage(shader_i); + } + return hasRequiredStages(stagePresence, m_params.primitiveAssembly.primitiveType); + } + protected: - using base_t::base_t; - ~ICPUGraphicsPipeline() = default; - - base_t* clone_impl(core::smart_refctd_ptr&& layout) const override - { - std::array _shaders; - for (auto i=0; i(m_layout.get()); - if (ix==1) - return m_renderpass.get(); - size_t stageCount = 0; - for (auto& stage : m_stages) - if (stage.shader) - if ((stageCount++)==ix-2) - return stage.shader.get(); - return nullptr; - } - - inline int8_t stageToIndex(const hlsl::ShaderStage stage) const override - { - const auto stageIx = hlsl::findLSB(stage); - if (stageIx<0 || stageIx>=GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) - return -1; - return stageIx; - } + using base_t::base_t; + virtual ~ICPUGraphicsPipeline() override = default; + + std::array m_specInfos; + + private: + explicit ICPUGraphicsPipeline(ICPUPipelineLayout* layout, ICPURenderpass* renderpass) + : base_t(layout, {}, renderpass) + {} + + static inline int8_t stageToIndex(const hlsl::ShaderStage stage) + { + const auto stageIx = hlsl::findLSB(stage); + if (stageIx < 0 || stageIx >= GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) + return -1; + return stageIx; + } + + static inline hlsl::ShaderStage indexToStage(const int8_t index) + { + if (index < 0 || index > GRAPHICS_SHADER_STAGE_COUNT) + return hlsl::ShaderStage::ESS_UNKNOWN; + return static_cast(hlsl::ShaderStage::ESS_VERTEX + index); + } + + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + { + auto* newPipeline = new ICPUGraphicsPipeline(layout.get(), m_renderpass.get()); + newPipeline->m_params = m_params; + + for (auto specInfo_i = 0u; specInfo_i < m_specInfos.size(); specInfo_i++) + { + newPipeline->m_specInfos[specInfo_i] = m_specInfos[specInfo_i].clone(depth); + } + + return core::smart_refctd_ptr(newPipeline, core::dont_grab); + } + + inline void visitDependents_impl(std::function visit) const override + { + if (!visit(m_layout.get())) return; + if (!visit(m_renderpass.get())) return; + for (const auto& info : m_specInfos) + if (!visit(info.shader.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUImage.h b/include/nbl/asset/ICPUImage.h index c27cd21b86..fdbf640557 100644 --- a/include/nbl/asset/ICPUImage.h +++ b/include/nbl/asset/ICPUImage.h @@ -45,9 +45,6 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed constexpr static inline auto AssetType = ET_IMAGE; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - // Do not report buffer as dependant, as we will simply drop it instead of discarding its contents! 
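// Illustrative sketch, not part of the patch: per-stage shader infos are now owned by the pipeline
// and edited in place; stage enumerators and pointer types are assumed from context.
#include <utility>
inline nbl::core::smart_refctd_ptr<nbl::asset::ICPUGraphicsPipeline> createGraphicsPipeline(
    nbl::asset::ICPUPipelineLayout* layout, nbl::asset::ICPURenderpass* renderpass,
    nbl::core::smart_refctd_ptr<nbl::asset::IShader> vertexShader,
    nbl::core::smart_refctd_ptr<nbl::asset::IShader> fragmentShader)
{
    auto gfx = nbl::asset::ICPUGraphicsPipeline::create(layout, renderpass);
    gfx->getCachedCreationParams().subpassIx = 0u;
    if (auto* vs = gfx->getSpecInfo(nbl::hlsl::ShaderStage::ESS_VERTEX)) // nullptr on an immutable pipeline
    {
        vs->shader = std::move(vertexShader);
        vs->entryPoint = "main";
    }
    if (auto* fs = gfx->getSpecInfo(nbl::hlsl::ShaderStage::ESS_FRAGMENT))
    {
        fs->shader = std::move(fragmentShader);
        fs->entryPoint = "main";
    }
    // valid() checks the layout, that subpassIx is inside the renderpass, and that every stage
    // required by the primitive topology is present
    if (!gfx->valid())
        return nullptr;
    return gfx;
}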
- inline size_t getDependantCount() const override {return 0;} - core::blake3_hash_t computeContentHash() const override; // Having regions specififed to upload is optional! So to have content missing we must have regions but no buffer content @@ -198,12 +195,21 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed return true; } + inline bool valid() const override + { + if (!validateCreationParameters(m_creationParams)) return false; + if (info != m_creationParams.format) return false; + if (buffer && !buffer->valid()) return false; + if (regions) + for (const auto& region : *regions) + if (!region.isValid()) return false; + return true; + } + protected: inline ICPUImage(const SCreationParams& _params) : IImage(_params) {} virtual ~ICPUImage() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} - inline void discardContent_impl() override { buffer = nullptr; @@ -221,6 +227,10 @@ class NBL_API2 ICPUImage final : public IImage, public IPreHashed return _a.imageSubresource.mipLevel < _b.imageSubresource.mipLevel; } }; + + inline void visitDependents_impl(std::function visit) const override + { + } }; } // end namespace nbl::asset diff --git a/include/nbl/asset/ICPUImageView.h b/include/nbl/asset/ICPUImageView.h index 87df463021..85a0629cc3 100644 --- a/include/nbl/asset/ICPUImageView.h +++ b/include/nbl/asset/ICPUImageView.h @@ -49,9 +49,6 @@ class ICPUImageView final : public IImageView, public IAsset constexpr static inline auto AssetType = ET_IMAGE_VIEW; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } - //! - inline size_t getDependantCount() const override {return 1;} - //! const SComponentMapping& getComponents() const { return params.components; } SComponentMapping& getComponents() @@ -65,13 +62,26 @@ class ICPUImageView final : public IImageView, public IAsset params.subresourceRange.aspectMask = aspect.value; } + inline bool valid() const override + { + if (!validateCreationParameters(params)) return false; + + // image nullptr already checked in validateCreationParameters; + assert(params.image); + if (!params.image->valid()) return false; + + return true; + } + protected: virtual ~ICPUImageView() = default; - inline IAsset* getDependant_impl(const size_t ix) override - { - return params.image.get(); - } + private: + + inline void visitDependents_impl(std::function visit) const override + { + if (!visit(params.image.get())) return; + } }; } diff --git a/include/nbl/asset/ICPUMesh.h b/include/nbl/asset/ICPUMesh.h index a21f5f3f02..df647b14a4 100644 --- a/include/nbl/asset/ICPUMesh.h +++ b/include/nbl/asset/ICPUMesh.h @@ -81,14 +81,24 @@ class ICPUMesh final : public IMesh, public IAsset return cp; } - //! 
CLASS IS DEPRECATED ANYWAY - inline size_t getDependantCount() const override {return 0;} + inline bool valid() const override + { + for (const auto& meshBuffer : m_meshBuffers) + { + if (!meshBuffer) return false; + if (!meshBuffer->valid()) return false; + } + return true; + } protected: - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} private: core::vector> m_meshBuffers; + + inline void visitDependents_impl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUMeshBuffer.h b/include/nbl/asset/ICPUMeshBuffer.h index 532b622090..aa6cbc9429 100644 --- a/include/nbl/asset/ICPUMeshBuffer.h +++ b/include/nbl/asset/ICPUMeshBuffer.h @@ -610,12 +610,15 @@ class ICPUMeshBuffer final : public IMeshBuffer(const_cast(this)->getJointAABBs()); } + inline bool valid() const override + { + return true; + } - //! CLASS IS DEPRECATED ANYWAY - inline size_t getDependantCount() const override {return 0;} - - protected: - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} + private: + inline void visitDependents_impl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h index d1693f18eb..e9442e0b8c 100644 --- a/include/nbl/asset/ICPUPipeline.h +++ b/include/nbl/asset/ICPUPipeline.h @@ -13,38 +13,98 @@ namespace nbl::asset { -// Common Base class for pipelines -template -class ICPUPipeline : public IAsset, public PipelineNonAssetBase +class ICPUPipelineBase { - using this_t = ICPUPipeline; - public: - inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final + struct SShaderSpecInfo { - core::smart_refctd_ptr layout; - if (_depth>0u && PipelineNonAssetBase::m_layout) - layout = core::smart_refctd_ptr_static_cast(PipelineNonAssetBase::m_layout->clone(_depth-1u)); + //! Structure specifying a specialization map entry + /* + Note that if specialization constant ID is used + in a shader, \bsize\b and \boffset'b must match + to \isuch an ID\i accordingly. + + By design the API satisfies: + https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-offset-00773 + https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-pMapEntries-00774 + */ + //!< The ID of the specialization constant in SPIR-V. If it isn't used in the shader, the map entry does not affect the behavior of the pipeline. 
+ using spec_constant_id_t = uint32_t; + + using SSpecConstantValue = core::vector; + + inline SSpecConstantValue* getSpecializationByteValue(const spec_constant_id_t _specConstID) + { + const auto found = entries.find(_specConstID); + if (found != entries.end() && found->second.size()) return &found->second; + else return nullptr; + } + + static constexpr int32_t INVALID_SPEC_INFO = -1; + inline int32_t valid() const + { + if (!shader) return INVALID_SPEC_INFO; - auto cp = clone_impl(std::move(layout)); - for (auto i=0; i 0x7fffffff) return INVALID_SPEC_INFO; + return static_cast(specData); + } + + core::smart_refctd_ptr shader = nullptr; + std::string entryPoint = ""; + + IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize = IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement + + using spec_constant_map_t = core::unordered_map; + // Container choice implicitly satisfies: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 + spec_constant_map_t entries; + // By requiring Nabla Core Profile features we implicitly satisfy: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 + // Also because our API is sane, it satisfies the following by construction: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 + + SShaderSpecInfo clone(uint32_t depth) const { - const auto shader = m_stages[i].shader; - if (shader) + auto newSpecInfo = *this; + if (newSpecInfo.shader.get() != nullptr && depth > 0u) { - auto stageInfo = m_stages[i].info; - core::smart_refctd_ptr newShader; - if (_depth>0u) - { - newShader = core::smart_refctd_ptr_static_cast(shader->clone(_depth-1u)); - stageInfo.shader = newShader.get(); - } - cp->setSpecInfo(stageInfo); + newSpecInfo.shader = core::smart_refctd_ptr_static_cast(this->shader->clone(depth - 1u)); } + return newSpecInfo; } + }; - return core::smart_refctd_ptr(cp,core::dont_grab); - } + virtual std::span getSpecInfos(const hlsl::ShaderStage stage) const = 0; + +}; + +// Common Base class for pipelines +template + requires (std::is_base_of_v, PipelineNonAssetBase> && !std::is_base_of_v) +class ICPUPipeline : public IAsset, public PipelineNonAssetBase, public ICPUPipelineBase +{ + using this_t = ICPUPipeline; + + public: // extras for this class ICPUPipelineLayout* getLayout() @@ -60,82 +120,34 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase PipelineNonAssetBase::m_layout = std::move(_layout); } - // The getters are weird because the shader pointer, spec constant map and entry point needs patching - inline IShader* getShader(const hlsl::ShaderStage stage) - { - assert(isMutable()); - return const_cast(getSpecInfo(stage).shader); - } - inline std::string* getEntryPoint(const hlsl::ShaderStage stage) - { - const auto stageIx = stageToIndex(stage); - if (stageIx<0) - return {}; - return &m_stages[stageIx].entryPoint; - } - inline IPipelineBase::SShaderSpecInfo::spec_constant_map_t* getSpecConstantMap(const hlsl::ShaderStage stage) + inline core::smart_refctd_ptr clone(uint32_t _depth = ~0u) const override final { - assert(isMutable()); - return 
const_cast(getSpecInfo(stage).entries); + if (!getLayout()) return nullptr; + + core::smart_refctd_ptr layout; + if (_depth > 0u) + layout = core::smart_refctd_ptr_static_cast(getLayout()->clone(_depth - 1u)); + + return clone_impl(std::move(layout), _depth); } - // - inline IPipelineBase::SShaderSpecInfo getSpecInfo(const hlsl::ShaderStage stage) const - { - const auto stageIx = stageToIndex(stage); - if (stageIx<0) - return {}; - return m_stages[stageIx].info; - } - inline bool setSpecInfo(const IPipelineBase::SShaderSpecInfo& info) - { - assert(isMutable()); - const int64_t specSize = info.valid(); - if (specSize<0) - return false; - const auto stageIx = stageToIndex(info.stage); - if (stageIx<0) - return false; - auto& outStage = m_stages[stageIx]; - outStage.info = info; - outStage.entryPoint = info.entryPoint; - outStage.shader = core::smart_refctd_ptr(const_cast(info.shader)); - outStage.info.shader = outStage.shader.get(); - auto& outEntries = outStage.entries; - if (specSize>0) - { - outEntries = std::make_unique(); - outEntries->reserve(info.entries->size()); - std::copy(info.entries->begin(),info.entries->end(),std::insert_iterator(*outEntries,outEntries->begin())); - } - else - outEntries = nullptr; - outStage.info.entries = outEntries.get(); - return true; - } - inline bool clearStage(const hlsl::ShaderStage stage) + + // Note(kevinyu): For some reason overload resolution cannot find this function when I name it getSpecInfos. It always uses the const variant. Will check on it later. + inline std::span getSpecInfos(const hlsl::ShaderStage stage) { - assert(isMutable()); - const auto stageIx = stageToIndex(stage); - if (stageIx<0) - return false; - m_stages[stageIx] = {}; - return true; + if (!isMutable()) return {}; + const this_t* constPipeline = const_cast(this); + const ICPUPipelineBase* basePipeline = constPipeline; + const auto specInfo = basePipeline->getSpecInfos(stage); + return { const_cast(specInfo.data()), specInfo.size() }; } protected: + using PipelineNonAssetBase::PipelineNonAssetBase; virtual ~ICPUPipeline() = default; + + virtual core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const = 0; - virtual this_t* clone_impl(core::smart_refctd_ptr&& layout) const = 0; - virtual int8_t stageToIndex(const hlsl::ShaderStage stage) const = 0; - - struct ShaderStage - { - std::string entryPoint = {}; - core::smart_refctd_ptr shader = {}; - std::unique_ptr entries = {}; - IPipelineBase::SShaderSpecInfo info = {}; - } m_stages[MaxShaderStageCount] = {}; }; } diff --git a/include/nbl/asset/ICPUPipelineCache.h b/include/nbl/asset/ICPUPipelineCache.h index 0c1d8c17cf..c5511f39bb 100644 --- a/include/nbl/asset/ICPUPipelineCache.h +++ b/include/nbl/asset/ICPUPipelineCache.h @@ -60,8 +60,6 @@ class ICPUPipelineCache final : public IPreHashed return core::make_smart_refctd_ptr(std::move(cache_cp)); } - inline size_t getDependantCount() const override {return 0;} - // inline core::blake3_hash_t computeContentHash() const override { @@ -85,9 +83,12 @@ class ICPUPipelineCache final : public IPreHashed // const auto& getEntries() const {return m_cache;} - protected: - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} + inline bool valid() const override + { + return true; + } + protected: inline void discardContent_impl() override { for (auto& entry : m_cache) @@ -96,6 +97,10 @@ class ICPUPipelineCache final : public IPreHashed private: entries_map_t m_cache; + + inline void visitDependents_impl(std::function visit) const
override + { + } }; } diff --git a/include/nbl/asset/ICPUPipelineLayout.h b/include/nbl/asset/ICPUPipelineLayout.h index c4a76fdea9..b30ecc3e10 100644 --- a/include/nbl/asset/ICPUPipelineLayout.h +++ b/include/nbl/asset/ICPUPipelineLayout.h @@ -30,16 +30,6 @@ class ICPUPipelineLayout : public IAsset, public IPipelineLayout&& _layout2, core::smart_refctd_ptr&& _layout3 ) : IPipelineLayout(_pcRanges,std::move(_layout0),std::move(_layout1),std::move(_layout2),std::move(_layout3)) {} - // - inline size_t getDependantCount() const override - { - size_t count = 0; - for (auto i=0; ivalid()) return false; + } + return true; + } + protected: virtual ~ICPUPipelineLayout() = default; - inline IAsset* getDependant_impl(const size_t ix) override - { - size_t count = 0; - for (auto i=0; i visit) const override + { + for (auto i = 0; i < m_descSetLayouts.size(); i++) + { + if (!m_descSetLayouts[i]) continue; + if (!visit(m_descSetLayouts[i].get())) return; + } + } + }; } diff --git a/include/nbl/asset/ICPURayTracingPipeline.h b/include/nbl/asset/ICPURayTracingPipeline.h new file mode 100644 index 0000000000..17c53557e1 --- /dev/null +++ b/include/nbl/asset/ICPURayTracingPipeline.h @@ -0,0 +1,153 @@ + +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_ASSET_I_CPU_RAY_TRACING_PIPELINE_H_INCLUDED_ +#define _NBL_ASSET_I_CPU_RAY_TRACING_PIPELINE_H_INCLUDED_ + +#include "nbl/asset/IRayTracingPipeline.h" +#include "nbl/asset/ICPUPipeline.h" + + +namespace nbl::asset +{ + +//! CPU Version of RayTracing Pipeline +class ICPURayTracingPipeline final : public ICPUPipeline> +{ + using pipeline_base_t = IRayTracingPipeline; + using base_t = ICPUPipeline; + + public: + struct SHitGroupSpecInfos { + core::vector closestHits; + core::vector anyHits; + core::vector intersections; + }; + + static core::smart_refctd_ptr create(const ICPUPipelineLayout* layout) + { + auto retval = new ICPURayTracingPipeline(layout); + return core::smart_refctd_ptr(retval,core::dont_grab); + } + + + + constexpr static inline auto AssetType = ET_RAYTRACING_PIPELINE; + inline E_TYPE getAssetType() const override { return AssetType; } + + inline std::span getSpecInfos(const hlsl::ShaderStage stage) const override + { + switch (stage) + { + case hlsl::ShaderStage::ESS_RAYGEN: + return { &m_raygen, 1 }; + case hlsl::ShaderStage::ESS_MISS: + return m_misses; + case hlsl::ShaderStage::ESS_ANY_HIT: + return m_hitGroups.anyHits; + case hlsl::ShaderStage::ESS_CLOSEST_HIT: + return m_hitGroups.closestHits; + case hlsl::ShaderStage::ESS_INTERSECTION: + return m_hitGroups.intersections; + case hlsl::ShaderStage::ESS_CALLABLE: + return m_callables; + + } + return {}; + } + + inline std::span getSpecInfos(const hlsl::ShaderStage stage) + { + return base_t::getSpecInfos(stage); + } + + inline core::vector* getSpecInfoVector(const hlsl::ShaderStage stage) + { + if (!isMutable()) return nullptr; + switch (stage) + { + // raygen is not stored as vector so we can't return it here. 
Use getSpecInfo + case hlsl::ShaderStage::ESS_MISS: + return &m_misses; + case hlsl::ShaderStage::ESS_ANY_HIT: + return &m_hitGroups.anyHits; + case hlsl::ShaderStage::ESS_CLOSEST_HIT: + return &m_hitGroups.closestHits; + case hlsl::ShaderStage::ESS_INTERSECTION: + return &m_hitGroups.intersections; + case hlsl::ShaderStage::ESS_CALLABLE: + return &m_callables; + + } + return nullptr; + } + + + inline bool valid() const override final + { + if (!m_layout) return false; + if (!m_layout->valid()) return false; + if (m_raygen.valid() == SShaderSpecInfo::INVALID_SPEC_INFO) return false; + return true; + } + + inline const SCachedCreationParams& getCachedCreationParams() const + { + return pipeline_base_t::getCachedCreationParams(); + } + + inline SCachedCreationParams& getCachedCreationParams() { + assert(isMutable()); + return m_params; + } + + protected: + virtual ~ICPURayTracingPipeline() = default; + + private: + + SShaderSpecInfo m_raygen; + core::vector m_misses; + SHitGroupSpecInfos m_hitGroups; + core::vector m_callables; + + explicit ICPURayTracingPipeline(const ICPUPipelineLayout* layout) + : base_t(layout, {}) + {} + + inline void visitDependents_impl(std::function visit) const override + { + if (!visit(m_raygen.shader.get())) return; + for (const auto& missInfo : m_misses) if (!visit(missInfo.shader.get())) return; + for (const auto& anyHitInfo : m_hitGroups.anyHits) if (!visit(anyHitInfo.shader.get())) return; + for (const auto& closestHitInfo : m_hitGroups.closestHits) if (!visit(closestHitInfo.shader.get())) return; + for (const auto& intersectionInfo : m_hitGroups.intersections) if (!visit(intersectionInfo.shader.get())) return; + for (const auto& callableInfo : m_callables) if (!visit(callableInfo.shader.get())) return; + } + + inline core::smart_refctd_ptr clone_impl(core::smart_refctd_ptr&& layout, uint32_t depth) const override final + { + auto newPipeline = new ICPURayTracingPipeline(layout.get()); + newPipeline->m_raygen = m_raygen.clone(depth); + + auto cloneSpecInfos = [depth](const core::vector& specInfos) -> core::vector { + core::vector results; + results.resize(specInfos.size()); + for (auto specInfo_i = 0u; specInfo_i < specInfos.size(); specInfo_i++) + results[specInfo_i] = specInfos[specInfo_i].clone(depth); + return results; + }; + newPipeline->m_misses = cloneSpecInfos(m_misses); + newPipeline->m_hitGroups.anyHits = cloneSpecInfos(m_hitGroups.anyHits); + newPipeline->m_hitGroups.closestHits = cloneSpecInfos(m_hitGroups.closestHits); + newPipeline->m_hitGroups.intersections = cloneSpecInfos(m_hitGroups.intersections); + newPipeline->m_callables = cloneSpecInfos(m_callables); + + newPipeline->m_params = m_params; + return core::smart_refctd_ptr(newPipeline); + } +}; + +} +#endif diff --git a/include/nbl/asset/ICPURenderpass.h b/include/nbl/asset/ICPURenderpass.h index b9cf31d127..daaa5c62b0 100644 --- a/include/nbl/asset/ICPURenderpass.h +++ b/include/nbl/asset/ICPURenderpass.h @@ -38,13 +38,22 @@ class ICPURenderpass : public IRenderpass, public IAsset return ET_RENDERPASS; } - inline size_t getDependantCount() const override {return 0ull;} + inline bool valid() const override + { + // no modification is possible after creation.
parameter is validated when creating renderpass + return true; + } protected: inline ICPURenderpass(const SCreationParams& _params, const SCreationParamValidationResult& _validation) : IRenderpass(_params, _validation) {} inline ~ICPURenderpass() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} + private: + + inline void visitDependents_impl(std::function visit) const override + { + } + }; } diff --git a/include/nbl/asset/ICPURenderpassIndependentPipeline.h b/include/nbl/asset/ICPURenderpassIndependentPipeline.h index ed0171d11f..3d67af23d0 100644 --- a/include/nbl/asset/ICPURenderpassIndependentPipeline.h +++ b/include/nbl/asset/ICPURenderpassIndependentPipeline.h @@ -19,6 +19,12 @@ namespace nbl::asset class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, public IAsset { public: + struct SCreationParams + { + std::span shaders = {}; + SCachedCreationParams cached = {}; + }; + //(TODO) it is true however it causes DSs to not be cached when ECF_DONT_CACHE_TOP_LEVEL is set which isnt really intuitive constexpr static inline uint32_t DESC_SET_HIERARCHYLEVELS_BELOW = 0u; // TODO: @Crisspl HOW ON EARTH DOES THIS MAKE SENSE!? @@ -66,8 +72,6 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, _NBL_STATIC_INLINE_CONSTEXPR auto AssetType = ET_RENDERPASS_INDEPENDENT_PIPELINE; inline E_TYPE getAssetType() const override { return AssetType; } - inline size_t getDependantCount() const override {return 0;} - // inline const SCachedCreationParams& getCachedCreationParams() const {return IRenderpassIndependentPipeline::getCachedCreationParams();} inline SCachedCreationParams& getCachedCreationParams() @@ -89,9 +93,14 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, m_layout = std::move(_layout); } + inline bool valid() const override + { + return m_layout && m_layout->valid(); + } + #if 0 // The getters are weird because the shader pointer needs patching - inline IShader::SSpecInfo getSpecInfo(const hlsl::ShaderStage stage) + inline IShader::SSpecInfo getSpecInfos(const hlsl::ShaderStage stage) { assert(isMutable()); const auto stageIx = hlsl::findLSB(stage); @@ -99,7 +108,7 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, return {}; return m_infos[stageIx]; } - inline IShader::SSpecInfo getSpecInfo(const hlsl::ShaderStage stage) const + inline IShader::SSpecInfo getSpecInfos(const hlsl::ShaderStage stage) const { const auto stageIx = hlsl::findLSB(stage); if (stageIx<0 || stageIx>=GRAPHICS_SHADER_STAGE_COUNT || hlsl::bitCount(stage)!=1) @@ -137,14 +146,18 @@ class ICPURenderpassIndependentPipeline : public IRenderpassIndependentPipeline, : IRenderpassIndependentPipeline(params), m_layout(std::move(_layout)) {} virtual ~ICPURenderpassIndependentPipeline() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return nullptr;} - core::smart_refctd_ptr m_layout; #if 0 std::array,GRAPHICS_SHADER_STAGE_COUNT> m_shaders = {}; std::array,GRAPHICS_SHADER_STAGE_COUNT> m_entries = {}; std::array m_infos = {}; #endif + + private: + + inline void visitDependents_impl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUSampler.h b/include/nbl/asset/ICPUSampler.h index 27a918afaa..6b2bea5219 100644 --- a/include/nbl/asset/ICPUSampler.h +++ b/include/nbl/asset/ICPUSampler.h @@ -17,8 +17,6 @@ class ICPUSampler : public ISampler, public IAsset protected: virtual ~ICPUSampler() = default; - inline IAsset* 
getDependant_impl(const size_t ix) override {return nullptr;} - public: ICPUSampler(const SParams& _params) : ISampler(_params), IAsset() {} @@ -70,8 +68,13 @@ class ICPUSampler : public ISampler, public IAsset constexpr static inline auto AssetType = ET_SAMPLER; inline IAsset::E_TYPE getAssetType() const override { return AssetType; } + inline bool valid() const override { return true; } - inline size_t getDependantCount() const override {return 0;} + private: + + inline void visitDependents_impl(std::function visit) const override + { + } }; } diff --git a/include/nbl/asset/ICPUSkeleton.h b/include/nbl/asset/ICPUSkeleton.h index 6f1c576ed8..1049798268 100644 --- a/include/nbl/asset/ICPUSkeleton.h +++ b/include/nbl/asset/ICPUSkeleton.h @@ -78,14 +78,14 @@ class ICPUSkeleton final : public ISkeleton, public IAsset constexpr static inline auto AssetType = ET_SKELETON; inline E_TYPE getAssetType() const override { return AssetType; } + inline bool valid() const override { return true; } - //! - inline size_t getDependantCount() const override {return 2;} + private: - protected: - inline IAsset* getDependant_impl(const size_t ix) override + inline void visitDependents_impl(std::function visit) const override { - return (ix!=0 ? m_defaultTransforms:m_parentJointIDs).buffer.get(); + if (!visit(m_defaultTransforms.buffer.get())) return; + if (!visit(m_parentJointIDs.buffer.get())) return; } }; diff --git a/include/nbl/asset/IComputePipeline.h b/include/nbl/asset/IComputePipeline.h new file mode 100644 index 0000000000..ba4d245473 --- /dev/null +++ b/include/nbl/asset/IComputePipeline.h @@ -0,0 +1,40 @@ +#ifndef _NBL_ASSET_I_COMPUTE_PIPELINE_H_INCLUDED_ +#define _NBL_ASSET_I_COMPUTE_PIPELINE_H_INCLUDED_ + +#include "nbl/asset/IPipeline.h" + +namespace nbl::asset +{ + +class IComputePipelineBase : public virtual core::IReferenceCounted +{ + public: + + struct SCachedCreationParams final + { + uint8_t requireFullSubgroups = false; + }; +}; + +template +class IComputePipeline : public IPipeline, public IComputePipelineBase +{ + using base_creation_params_t = IPipeline; + + public: + + inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } + + protected: + explicit IComputePipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : + IPipeline(core::smart_refctd_ptr(layout)), + m_params(cachedParams) + {} + + SCachedCreationParams m_params; + +}; + +} + +#endif diff --git a/include/nbl/asset/IDescriptorSetLayout.h b/include/nbl/asset/IDescriptorSetLayout.h index 140b8d7485..48d8abab9e 100644 --- a/include/nbl/asset/IDescriptorSetLayout.h +++ b/include/nbl/asset/IDescriptorSetLayout.h @@ -340,7 +340,8 @@ class IDescriptorSetLayout : public IDescriptorSetLayoutBase bindings[i].binding = i; bindings[i].type = type; bindings[i].createFlags = SBinding::E_CREATE_FLAGS::ECF_NONE; - bindings[i].stageFlags = stageAccessFlags ? stageAccessFlags[i]:asset::IShader::ESS_ALL_OR_LIBRARY; + + bindings[i].stageFlags = stageAccessFlags ? stageAccessFlags[i]:hlsl::ShaderStage::ESS_ALL_OR_LIBRARY; bindings[i].count = counts ? counts[i]:1u; bindings[i].samplers = nullptr; } @@ -364,7 +365,7 @@ class IDescriptorSetLayout : public IDescriptorSetLayoutBase for (uint32_t b = 0u; b < bindingCnt; ++b) { auto bindingNumber = m_descriptorRedirects[t].m_bindingNumbers[b]; - CBindingRedirect::template binding_number_t otherBindingNumber(CBindingRedirect::Invalid); + CBindingRedirect::binding_number_t otherBindingNumber(CBindingRedirect::Invalid); // TODO: std::find instead? 
for (uint32_t ob = 0u; ob < otherBindingCnt; ++ob) { diff --git a/include/nbl/asset/IFramebuffer.h b/include/nbl/asset/IFramebuffer.h index 9c78fe1e42..4f4abb89da 100644 --- a/include/nbl/asset/IFramebuffer.h +++ b/include/nbl/asset/IFramebuffer.h @@ -121,7 +121,7 @@ class IFramebuffer return true; // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkFramebufferCreateInfo.html#VUID-VkFramebufferCreateInfo-pAttachments-00884 - if (viewParams.components!=ImageViewType::SComponentMapping()) + if (viewParams.components!=typename ImageViewType::SComponentMapping()) return true; // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkFramebufferCreateInfo.html#VUID-VkFramebufferCreateInfo-flags-04533 diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h index c59ad51ca9..5b445afae5 100644 --- a/include/nbl/asset/IGraphicsPipeline.h +++ b/include/nbl/asset/IGraphicsPipeline.h @@ -88,78 +88,34 @@ class IGraphicsPipeline : public IPipeline, public IGraphics using renderpass_t = RenderpassType; public: - struct SCreationParams : IPipeline::SCreationParams - { - protected: - using SpecInfo = IPipelineBase::SShaderSpecInfo; - template - inline bool impl_valid(ExtraLambda&& extra) const - { - if (!IPipeline::SCreationParams::layout) - return false; - - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-dynamicRendering-06576 - if (!renderpass || cached.subpassIx>=renderpass->getSubpassCount()) - return false; - - // TODO: check rasterization samples, etc. - //rp->getCreationParameters().subpasses[i] - - core::bitflag stagePresence = {}; - for (const auto info : shaders) - if (info.shader) - { - if (!extra(info)) - return false; - const auto stage = info.stage; - if (stage>hlsl::ShaderStage::ESS_FRAGMENT) - return false; - if (stagePresence.hasFlags(stage)) - return false; - stagePresence |= stage; - } - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-stage-02096 - if (!stagePresence.hasFlags(hlsl::ShaderStage::ESS_VERTEX)) - return false; - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00729 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00730 - if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_CONTROL)!=stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)) - return false; - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-08888 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-topology-08889 - if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)!=(cached.primitiveAssembly.primitiveType==EPT_PATCH_LIST)) - return false; - - return true; - } - - public: - inline bool valid() const - { - return impl_valid([](const SpecInfo& info)->bool - { - if (!info.valid()) - return false; - return false; - }); - } - - std::span shaders = {}; - SCachedCreationParams cached = {}; - renderpass_t* renderpass = nullptr; - }; - inline const SCachedCreationParams& getCachedCreationParams() const {return m_params;} - inline const renderpass_t* 
getRenderpass() const {return m_renderpass.get();} + + static inline bool hasRequiredStages(const core::bitflag& stagePresence, E_PRIMITIVE_TOPOLOGY primitiveType) + { + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-stage-02096 + if (!stagePresence.hasFlags(hlsl::ShaderStage::ESS_VERTEX)) + return false; + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00729 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-00730 + if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_CONTROL)!=stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)) + return false; + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-pStages-08888 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-topology-08889 + if (stagePresence.hasFlags(hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)!=(primitiveType==asset::EPT_PATCH_LIST)) + return false; + return true; + } + protected: - explicit IGraphicsPipeline(const SCreationParams& _params) : - IPipeline(core::smart_refctd_ptr(_params.layout)), - m_params(_params.cached), m_renderpass(core::smart_refctd_ptr(_params.renderpass)) {} + explicit IGraphicsPipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams, renderpass_t* renderpass) : + IPipeline(core::smart_refctd_ptr(layout)), + m_params(cachedParams), m_renderpass(core::smart_refctd_ptr(renderpass)) + {} - SCachedCreationParams m_params; - core::smart_refctd_ptr m_renderpass; + SCachedCreationParams m_params = {}; + core::smart_refctd_ptr m_renderpass = nullptr; }; } diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h index 036a684729..eb54542403 100644 --- a/include/nbl/asset/IPipeline.h +++ b/include/nbl/asset/IPipeline.h @@ -27,249 +27,113 @@ namespace nbl::asset */ class IPipelineBase { - public: - struct SCreationParams - { - protected: - // This is not public to make sure that different pipelines only get the enums they support - enum class FLAGS : uint64_t - { - NONE = 0, // disallowed in maintanance5 - DISABLE_OPTIMIZATIONS = 1<<0, - ALLOW_DERIVATIVES = 1<<1, - - // I can just derive this - //DERIVATIVE = 1<<2, + public: + enum class CreationFlags : uint64_t + { + NONE = 0, // disallowed in maintanance5 + DISABLE_OPTIMIZATIONS = 1 << 0, + ALLOW_DERIVATIVES = 1 << 1, + + // I can just derive this + //DERIVATIVE = 1<<2, + + // Graphics Pipelines only + //VIEW_INDEX_FROM_DEVICE_INDEX = 1<<3, + + // Compute Pipelines only + //DISPATCH_BASE = 1<<4, + + // This is for NV-raytracing extension. Now this is done via IDeferredOperation + //DEFER_COMPILE_NV = 1<<5, + + // We use Renderdoc to take care of this for us, + // we won't be parsing the statistics and internal representation ourselves. 
+ //CAPTURE_STATISTICS = 1<<6, + //CAPTURE_INTERNAL_REPRESENTATIONS = 1<<7, + + // Will soon be deprecated due to + // https://github.com/Devsh-Graphics-Programming/Nabla/issues/854 + FAIL_ON_PIPELINE_COMPILE_REQUIRED = 1 << 8, + EARLY_RETURN_ON_FAILURE = 1 << 9, + + // Will be exposed later with the IPipelineLibrary asset implementation + // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 + //LINK_TIME_OPTIMIZATION = 1<<10, + + // Won't be exposed because we'll introduce Libraries as a separate object/asset-type + // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 + //CREATE_LIBRARY = 1<<11, + + // Ray Tracing Pipelines only + //SKIP_BUILT_IN_PRIMITIVES = 1<<12, + //SKIP_AABBS = 1<<13, + //NO_NULL_ANY_HIT_SHADERS = 1<<14, + //NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, + //NO_NULL_MISS_SHADERS = 1<<16, + //NO_NULL_INTERSECTION_SHADERS = 1<<17, + + // There is a new Device Generated Commands extension with its own flag that will deprecate this + //INDIRECT_BINDABLE_NV = 1<<18, + + // Ray Tracing Pipelines only + // For debug tools + //RAY_TRACING_SHADER_GROUP_HANDLE_CAPTURE_REPLAY_BIT_KHR = 1<<19, + + // Ray Tracing Pipelines only + //ALLOW_MOTION = 1<<20, + + // Graphics Pipelineonly (we don't support subpass shading) + //RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR = 1<<21, + //RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT = 1<<22, + + // Will be exposed later with the IPipelineLibrary asset implementation + // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 + //RETAIN_LINK_TIME_OPTIMIZATION_INFO = 1<<23, + + // Ray Tracing Pipelines only + //RAY_TRACING_OPACITY_MICROMAP_BIT_EXT = 1<<24, + + // Not supported yet, and we will move to dynamic rendering, so this might never be supported + //COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT = 1<<25, + //DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT = 1<<26, + + // Not Supported Yet + //NO_PROTECTED_ACCESS=1<<27, + //RAY_TRACING_DISPLACEMENT_MICROMAP_BIT_NV = 1<<28, + //DESCRIPTOR_VUFFER_BIT=1<<29, + //PROTECTED_ACCESS_ONLY=1<<30, + }; + using FLAGS = CreationFlags; + + // Nabla requires device's reported subgroup size to be between 4 and 128 + enum class SUBGROUP_SIZE : uint8_t + { + // No constraint but probably means `gl_SubgroupSize` is Dynamically Uniform + UNKNOWN = 0, + // Allows the Subgroup Uniform `gl_SubgroupSize` to be non-Dynamically Uniform and vary between Device's min and max + VARYING = 1, + // The rest we encode as log2(x) of the required value + REQUIRE_4 = 2, + REQUIRE_8 = 3, + REQUIRE_16 = 4, + REQUIRE_32 = 5, + REQUIRE_64 = 6, + REQUIRE_128 = 7 + }; - // Graphics Pipelines only - //VIEW_INDEX_FROM_DEVICE_INDEX = 1<<3, - - // Compute Pipelines only - //DISPATCH_BASE = 1<<4, - - // This is for NV-raytracing extension. Now this is done via IDeferredOperation - //DEFER_COMPILE_NV = 1<<5, - - // We use Renderdoc to take care of this for us, - // we won't be parsing the statistics and internal representation ourselves. 
- //CAPTURE_STATISTICS = 1<<6, - //CAPTURE_INTERNAL_REPRESENTATIONS = 1<<7, - - // Will soon be deprecated due to - // https://github.com/Devsh-Graphics-Programming/Nabla/issues/854 - FAIL_ON_PIPELINE_COMPILE_REQUIRED = 1<<8, - EARLY_RETURN_ON_FAILURE = 1<<9, - - // Will be exposed later with the IPipelineLibrary asset implementation - // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 - //LINK_TIME_OPTIMIZATION = 1<<10, - - // Won't be exposed because we'll introduce Libraries as a separate object/asset-type - // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 - //CREATE_LIBRARY = 1<<11, - - // Ray Tracing Pipelines only - //SKIP_BUILT_IN_PRIMITIVES = 1<<12, - //SKIP_AABBS = 1<<13, - //NO_NULL_ANY_HIT_SHADERS = 1<<14, - //NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, - //NO_NULL_MISS_SHADERS = 1<<16, - //NO_NULL_INTERSECTION_SHADERS = 1<<17, - - // There is a new Device Generated Commands extension with its own flag that will deprecate this - //INDIRECT_BINDABLE_NV = 1<<18, - - // Ray Tracing Pipelines only - // For debug tools - //RAY_TRACING_SHADER_GROUP_HANDLE_CAPTURE_REPLAY_BIT_KHR = 1<<19, - - // Ray Tracing Pipelines only - //ALLOW_MOTION = 1<<20, - - // Graphics Pipelineonly (we don't support subpass shading) - //RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR = 1<<21, - //RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT = 1<<22, - - // Will be exposed later with the IPipelineLibrary asset implementation - // https://github.com/Devsh-Graphics-Programming/Nabla/issues/853 - //RETAIN_LINK_TIME_OPTIMIZATION_INFO = 1<<23, - - // Ray Tracing Pipelines only - //RAY_TRACING_OPACITY_MICROMAP_BIT_EXT = 1<<24, - - // Not supported yet, and we will move to dynamic rendering, so this might never be supported - //COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT = 1<<25, - //DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT = 1<<26, - - // Not Supported Yet - //NO_PROTECTED_ACCESS=1<<27, - //RAY_TRACING_DISPLACEMENT_MICROMAP_BIT_NV = 1<<28, - //DESCRIPTOR_VUFFER_BIT=1<<29, - //PROTECTED_ACCESS_ONLY=1<<30, - }; - }; - - /* - Specialization info contains things such as entry point to a shader, - specialization map entry, required subgroup size, etc. for a blob of SPIR-V - - It also handles Specialization Constants. - - In Vulkan, all shaders get halfway-compiled into SPIR-V and - then then lowered (compiled) into the HW ISA by the Vulkan driver. - Normally, the half-way compile folds all constant values - and optimizes the code that uses them. - - But, it would be nice every so often to have your Vulkan - program sneak into the halfway-compiled SPIR-V binary and - manipulate some constants at runtime. This is what - Specialization Constants are for. - - So A Specialization Constant is a way of injecting an integer - constant into a halfway-compiled version of a shader right - before the lowering and linking when creating a pipeline. - - Without Specialization Constants, you would have to commit - to a final value before the SPIR-V compilation - */ - struct SShaderSpecInfo final - { - //! Structure specifying a specialization map entry - /* - Note that if specialization constant ID is used - in a shader, \bsize\b and \boffset'b must match - to \isuch an ID\i accordingly. 
- - By design the API satisfies: - https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-offset-00773 - https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-pMapEntries-00774 - */ - //!< The ID of the specialization constant in SPIR-V. If it isn't used in the shader, the map entry does not affect the behavior of the pipeline. - using spec_constant_id_t = uint32_t; - struct SSpecConstantValue - { - const void* data = nullptr; - //!< The byte size of the specialization constant value within the supplied data buffer. - uint32_t size = 0; - - inline operator bool() const {return data&&size;} - - auto operator<=>(const SSpecConstantValue&) const = default; - }; - inline SSpecConstantValue getSpecializationByteValue(const spec_constant_id_t _specConstID) const - { - if (!entries) - return { nullptr,0u }; - - const auto found = entries->find(_specConstID); - if (found != entries->end() && bool(found->second)) - return found->second; - else - return { nullptr,0u }; - } - - // Nabla requires device's reported subgroup size to be between 4 and 128 - enum class SUBGROUP_SIZE : uint8_t - { - // No constraint but probably means `gl_SubgroupSize` is Dynamically Uniform - UNKNOWN = 0, - // Allows the Subgroup Uniform `gl_SubgroupSize` to be non-Dynamically Uniform and vary between Device's min and max - VARYING = 1, - // The rest we encode as log2(x) of the required value - REQUIRE_4 = 2, - REQUIRE_8 = 3, - REQUIRE_16 = 4, - REQUIRE_32 = 5, - REQUIRE_64 = 6, - REQUIRE_128 = 7 - }; - - // - static constexpr int32_t INVALID_SPEC_INFO = -1; - // Returns negative on failure, otherwise the size of the buffer required to reserve for the spec constant data - inline int32_t valid() const - { - if (!shader || hlsl::bitCount(stage)!=1) - return INVALID_SPEC_INFO; - - // Impossible to check: https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pName-00707 - if (entryPoint.empty()) - return INVALID_SPEC_INFO; - - // Shader stages already checked for validity w.r.t. 
features enabled, during unspec shader creation, only check: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-08988 - if (requireFullSubgroups) - switch (stage) - { - case hlsl::ShaderStage::ESS_COMPUTE: [[fallthrough]]; - case hlsl::ShaderStage::ESS_TASK: [[fallthrough]]; - case hlsl::ShaderStage::ESS_MESH: - break; - default: - return INVALID_SPEC_INFO; - break; - } - // Impossible to efficiently check anything from: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-maxClipDistances-00708 - // to: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-06686 - // and from: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02756 - // to: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-module-08987 - - int64_t specData = 0; - if (entries) - for (const auto& entry : *entries) - { - if (!entry.second) - return INVALID_SPEC_INFO; - specData += entry.second.size; - } - if (specData>0x7fffffff) - return INVALID_SPEC_INFO; - return static_cast(specData); - } - - using spec_constant_map_t = core::unordered_map; - - const IShader* shader = nullptr; - // A name of the function where the entry point of an shader executable begins. It's often "main" function. - std::string_view entryPoint = {}; - // stage must be set - hlsl::ShaderStage stage = hlsl::ShaderStage::ESS_UNKNOWN; - // there's some padding here - SUBGROUP_SIZE requiredSubgroupSize : 3 = SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement - // Valid only for Compute, Mesh and Task shaders - uint8_t requireFullSubgroups : 1 = false; - // Container choice implicitly satisfies: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 - const spec_constant_map_t* entries = nullptr; - // By requiring Nabla Core Profile features we implicitly satisfy: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 - // Also because our API is sane, it satisfies the following by construction: - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 - }; }; template class IPipeline : public IPipelineBase { - public: - // For now, due to API design we implicitly satisfy a bunch of VUIDs - struct SCreationParams : protected IPipelineBase::SCreationParams - { - public: - const PipelineLayout* layout = nullptr; - }; + public: + inline const PipelineLayout* getLayout() const {return m_layout.get();} - inline const PipelineLayout* getLayout() const {return m_layout.get();} + protected: - protected: - inline IPipeline(core::smart_refctd_ptr&& _layout) - : m_layout(std::move(_layout)) {} + inline IPipeline(core::smart_refctd_ptr&& _layout) + : m_layout(std::move(_layout)) {} - core::smart_refctd_ptr m_layout; + core::smart_refctd_ptr 
m_layout; }; } diff --git a/include/nbl/asset/IPreHashed.h b/include/nbl/asset/IPreHashed.h index 4bc5ca5dcd..f7252211e1 100644 --- a/include/nbl/asset/IPreHashed.h +++ b/include/nbl/asset/IPreHashed.h @@ -39,84 +39,61 @@ class IPreHashed : public IAsset discardContent_impl(); } - static inline void discardDependantsContents(const std::span roots) - { - struct stack_entry_t - { - IAsset* asset; - size_t childCount = 0; - size_t childrenVisited = 0; - }; - core::stack stack; - core::unordered_set alreadyVisited; - auto push = [&stack,&alreadyVisited](IAsset* node) -> void - { - if (!node) - return; - const auto [dummy,inserted] = alreadyVisited.insert(node); - if (inserted) - stack.push({.asset=node,.childCount=node->getDependantCount()}); - }; - for (const auto& root : roots) - push(root); - while (!stack.empty()) - { - auto& entry = stack.top(); - if (entry.childrenVisitedgetDependant(entry.childrenVisited++); - push(dep); - } - else - { - // post order traversal does discard - auto* isPrehashed = dynamic_cast(entry.asset); - if (isPrehashed) - isPrehashed->discardContent(); - stack.pop(); - } - } - } - static inline bool anyDependantDiscardedContents(const IAsset* root) - { - struct stack_entry_t - { - const IAsset* asset; - size_t childCount = 0; - size_t childrenVisited = 0; - }; - core::stack stack; - core::unordered_set alreadyVisited; - auto push = [&stack,&alreadyVisited](const IAsset* node) -> bool - { - if (!node) - return false; - const auto [dummy,inserted] = alreadyVisited.insert(node); - if (inserted) - { - auto* isPrehashed = dynamic_cast(node); - if (isPrehashed && isPrehashed->missingContent()) - return true; - stack.push({.asset=node,.childCount=node->getDependantCount()}); - } - return false; - }; - if (push(root)) - return true; - while (!stack.empty()) - { - auto& entry = stack.top(); - if (entry.childrenVisitedgetDependant(entry.childrenVisited++); - if (push(dep)) - return true; - } - else - stack.pop(); - } - return false; - } + static inline void discardDependantsContents(const std::span roots) + { + core::vector stack; + core::unordered_set alreadyVisited; // whether we have push the node to the stack + auto push = [&stack,&alreadyVisited](IAsset* node) -> bool + { + const auto [dummy,inserted] = alreadyVisited.insert(node); + if (inserted) + stack.push_back(node); + return true; + }; + for (const auto& root : roots) + push(root); + while (!stack.empty()) + { + auto* entry = stack.back(); + stack.pop_back(); + entry->visitDependents(push); + // pre order traversal does discard + auto* isPrehashed = dynamic_cast(entry); + if (isPrehashed) + isPrehashed->discardContent(); + } + } + static inline bool anyDependantDiscardedContents(const IAsset* root) + { + core::vector stack; + core::unordered_set alreadyVisited; // whether we have push the node to the stack + bool result = false; + auto push = [&stack,&alreadyVisited,&result](const IAsset* node) -> bool + { + const auto [dummy,inserted] = alreadyVisited.insert(node); + if (inserted) + { + auto* isPrehashed = dynamic_cast(node); + if (isPrehashed && isPrehashed->missingContent()) + { + stack.clear(); + result = true; + return false; + } + stack.push_back(node); + } + return true; + }; + if (!push(root)) + return true; + while (!stack.empty()) + { + auto* entry = stack.back(); + stack.pop_back(); + entry->visitDependents(push); + } + return result; + } protected: inline IPreHashed() = default; diff --git a/include/nbl/asset/IRayTracingPipeline.h b/include/nbl/asset/IRayTracingPipeline.h index 0bc2d68653..b97d8d7002 
100644 --- a/include/nbl/asset/IRayTracingPipeline.h +++ b/include/nbl/asset/IRayTracingPipeline.h @@ -14,35 +14,6 @@ namespace nbl::asset class IRayTracingPipelineBase : public virtual core::IReferenceCounted { public: - struct SShaderGroupsParams - { - struct SIndex - { - constexpr static inline uint32_t Unused = 0xffFFffFFu; - uint32_t index = Unused; - }; - - struct SHitGroup - { - uint32_t closestHit = SIndex::Unused; - uint32_t anyHit = SIndex::Unused; - uint32_t intersection = SIndex::Unused; - }; - - SIndex raygen; - std::span misses; - std::span hits; - std::span callables; - - inline uint32_t getShaderGroupCount() const - { - return 1 + hits.size() + misses.size() + callables.size(); - } - - }; - using SGeneralShaderGroup = SShaderGroupsParams::SIndex; - using SHitShaderGroup = SShaderGroupsParams::SHitGroup; - struct SCachedCreationParams final { uint32_t maxRecursionDepth : 6 = 0; @@ -53,152 +24,36 @@ class IRayTracingPipelineBase : public virtual core::IReferenceCounted template class IRayTracingPipeline : public IPipeline, public IRayTracingPipelineBase { - using base_creation_params_t = IPipeline::SCreationParams; public: - using SGeneralShaderGroupContainer = core::smart_refctd_dynamic_array; - using SHitShaderGroupContainer = core::smart_refctd_dynamic_array; - - struct SCreationParams : base_creation_params_t + #define base_flag(F) static_cast(IPipelineBase::FLAGS::F) + enum class CreationFlags : uint64_t { - public: - #define base_flag(F) static_cast(base_creation_params_t::FLAGS::F) - enum class FLAGS : uint64_t - { - NONE = base_flag(NONE), - DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), - ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), - FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), - EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), - SKIP_BUILT_IN_PRIMITIVES = 1<<12, - SKIP_AABBS = 1<<13, - NO_NULL_ANY_HIT_SHADERS = 1<<14, - NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, - NO_NULL_MISS_SHADERS = 1<<16, - NO_NULL_INTERSECTION_SHADERS = 1<<17, - ALLOW_MOTION = 1<<20, - }; - #undef base_flag - - protected: - using SpecInfo = IPipelineBase::SShaderSpecInfo; - template - inline bool impl_valid(ExtraLambda&& extra) const - { - if (!IPipeline::SCreationParams::layout) - return false; - - for (const auto info : shaders) - { - if (info.shader) - { - if (!extra(info)) - return false; - const auto stage = info.stage; - if ((stage & ~IShader::E_SHADER_STAGE::ESS_ALL_RAY_TRACING) != 0) - return false; - if (!std::has_single_bit>(stage)) - return false; - } - else - { - // every shader must not be null. use SIndex::Unused to represent unused shader. 
- return false; - } - } - - auto getShaderStage = [this](size_t index) -> IShader::E_SHADER_STAGE - { - return shaders[index].stage; - }; - - auto isValidShaderIndex = [this, getShaderStage](size_t index, IShader::E_SHADER_STAGE expectedStage, bool is_unused_shader_forbidden) -> bool - { - if (index == SShaderGroupsParams::SIndex::Unused) - return !is_unused_shader_forbidden; - if (index >= shaders.size()) - return false; - if (getShaderStage(index) != expectedStage) - return false; - return true; - }; - - if (!isValidShaderIndex(shaderGroups.raygen.index, IShader::E_SHADER_STAGE::ESS_RAYGEN, true)) - { - return false; - } - - for (const auto& shaderGroup : shaderGroups.hits) - { - // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03470 - if (!isValidShaderIndex(shaderGroup.anyHit, - IShader::E_SHADER_STAGE::ESS_ANY_HIT, - bool(flags & FLAGS::NO_NULL_ANY_HIT_SHADERS))) - return false; - - // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03471 - if (!isValidShaderIndex(shaderGroup.closestHit, - IShader::E_SHADER_STAGE::ESS_CLOSEST_HIT, - bool(flags & FLAGS::NO_NULL_CLOSEST_HIT_SHADERS))) - return false; - - if (!isValidShaderIndex(shaderGroup.intersection, - IShader::E_SHADER_STAGE::ESS_INTERSECTION, - false)) - return false; - } - - for (const auto& shaderGroup : shaderGroups.misses) - { - if (!isValidShaderIndex(shaderGroup.index, - IShader::E_SHADER_STAGE::ESS_MISS, - false)) - return false; - } - - for (const auto& shaderGroup : shaderGroups.callables) - { - if (!isValidShaderIndex(shaderGroup.index, IShader::E_SHADER_STAGE::ESS_CALLABLE, false)) - return false; - } - return true; - } - - public: - inline bool valid() const - { - return impl_valid([](const SpecInfo& info)->bool - { - if (!info.valid()) - return false; - return false; - }); - } - - std::span shaders = {}; - SShaderGroupsParams shaderGroups; - SCachedCreationParams cached = {}; - // TODO: Could guess the required flags from SPIR-V introspection of declared caps - core::bitflag flags = FLAGS::NONE; + NONE = base_flag(NONE), + DISABLE_OPTIMIZATIONS = base_flag(DISABLE_OPTIMIZATIONS), + ALLOW_DERIVATIVES = base_flag(ALLOW_DERIVATIVES), + FAIL_ON_PIPELINE_COMPILE_REQUIRED = base_flag(FAIL_ON_PIPELINE_COMPILE_REQUIRED), + EARLY_RETURN_ON_FAILURE = base_flag(EARLY_RETURN_ON_FAILURE), + SKIP_BUILT_IN_PRIMITIVES = 1<<12, + SKIP_AABBS = 1<<13, + NO_NULL_ANY_HIT_SHADERS = 1<<14, + NO_NULL_CLOSEST_HIT_SHADERS = 1<<15, + NO_NULL_MISS_SHADERS = 1<<16, + NO_NULL_INTERSECTION_SHADERS = 1<<17, + ALLOW_MOTION = 1<<20, }; + #undef base_flag + using FLAGS = CreationFlags; inline const SCachedCreationParams& getCachedCreationParams() const { return m_params; } protected: - explicit IRayTracingPipeline(const SCreationParams& _params) : - IPipeline(core::smart_refctd_ptr(_params.layout)), - m_params(_params.cached), - m_raygenShaderGroup(_params.shaderGroups.raygen), - m_missShaderGroups(core::make_refctd_dynamic_array(_params.shaderGroups.misses)), - m_hitShaderGroups(core::make_refctd_dynamic_array(_params.shaderGroups.hits)), - m_callableShaderGroups(core::make_refctd_dynamic_array(_params.shaderGroups.callables)) + explicit IRayTracingPipeline(PipelineLayoutType* layout, const SCachedCreationParams& cachedParams) : + IPipeline(core::smart_refctd_ptr(layout)), + m_params(cachedParams) {} SCachedCreationParams m_params; - SGeneralShaderGroup m_raygenShaderGroup; - SGeneralShaderGroupContainer m_missShaderGroups; - 
SHitShaderGroupContainer m_hitShaderGroups; - SGeneralShaderGroupContainer m_callableShaderGroups; }; diff --git a/include/nbl/asset/IRenderpass.h b/include/nbl/asset/IRenderpass.h index 30be5c99e7..ce41e35573 100644 --- a/include/nbl/asset/IRenderpass.h +++ b/include/nbl/asset/IRenderpass.h @@ -81,11 +81,12 @@ class NBL_API2 IRenderpass { bool valid() const; }; + // The arrays pointed to by this array must be terminated by `DepthStencilAttachmentsEnd` value, which implicitly satisfies a few VUIDs - constexpr static inline SDepthStencilAttachmentDescription DepthStencilAttachmentsEnd = {}; + static const SDepthStencilAttachmentDescription DepthStencilAttachmentsEnd; // have to initialize out of line because of https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88165 const SDepthStencilAttachmentDescription* depthStencilAttachments = &DepthStencilAttachmentsEnd; // The arrays pointed to by this array must be terminated by `ColorAttachmentsEnd` value, which implicitly satisfies a few VUIDs - constexpr static inline SColorAttachmentDescription ColorAttachmentsEnd = {}; + static const SColorAttachmentDescription ColorAttachmentsEnd; // have to initialize out of line because of https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88165 const SColorAttachmentDescription* colorAttachments = &ColorAttachmentsEnd; struct SSubpassDescription final @@ -199,7 +200,7 @@ class NBL_API2 IRenderpass SColorAttachmentsRef colorAttachments[MaxColorAttachments] = {}; // The arrays pointed to by this array must be terminated by `InputAttachmentsEnd` value - constexpr static inline SInputAttachmentRef InputAttachmentsEnd = {}; + static const SInputAttachmentRef InputAttachmentsEnd; const SInputAttachmentRef* inputAttachments = &InputAttachmentsEnd; struct SPreserveAttachmentRef @@ -232,7 +233,7 @@ class NBL_API2 IRenderpass // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSubpassDescription2.html#VUID-VkSubpassDescription2-pipelineBindPoint-04953 //E_PIPELINE_BIND_POINT pipelineBindPoint : 2 = EPBP_GRAPHICS; }; - constexpr static inline SSubpassDescription SubpassesEnd = {}; + static const SSubpassDescription SubpassesEnd; const SSubpassDescription* subpasses = &SubpassesEnd; struct SSubpassDependency final @@ -258,7 +259,7 @@ class NBL_API2 IRenderpass bool valid() const; }; // The arrays pointed to by this array must be terminated by `DependenciesEnd` value - constexpr static inline SSubpassDependency DependenciesEnd = {}; + static const SSubpassDependency DependenciesEnd; const SSubpassDependency* dependencies = &DependenciesEnd; @@ -379,6 +380,12 @@ class NBL_API2 IRenderpass uint32_t m_loadOpColorAttachmentEnd = ~0u; }; +constexpr inline IRenderpass::SCreationParams::SDepthStencilAttachmentDescription IRenderpass::SCreationParams::DepthStencilAttachmentsEnd = {}; +constexpr inline IRenderpass::SCreationParams::SColorAttachmentDescription IRenderpass::SCreationParams::ColorAttachmentsEnd = {}; +constexpr inline IRenderpass::SCreationParams::SSubpassDescription::SInputAttachmentRef IRenderpass::SCreationParams::SSubpassDescription::InputAttachmentsEnd = {}; +constexpr inline IRenderpass::SCreationParams::SSubpassDescription IRenderpass::SCreationParams::SubpassesEnd = {}; +constexpr inline IRenderpass::SCreationParams::SSubpassDependency IRenderpass::SCreationParams::DependenciesEnd = {}; + inline bool IRenderpass::compatible(const IRenderpass* other) const { // If you find yourself spending a lot of time here in your profile, go ahead and implement a precomputed hash and store it in the renderpass 
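// --- Hedged usage sketch (editorial addition, not part of the PR's diff hunks) ---
// The hunks above turn the in-class `constexpr static inline` terminator sentinels of
// IRenderpass::SCreationParams into `static const` declarations with out-of-line
// `constexpr inline` definitions, working around GCC bug 88165. Their role is unchanged:
// every attachment/subpass/dependency array handed to SCreationParams must still end with
// the matching sentinel. A minimal illustration, assuming only the names visible in the
// hunks (`subpasses`, `SubpassesEnd`); the surrounding setup is hypothetical:
//
//   nbl::asset::IRenderpass::SCreationParams params = {};
//   nbl::asset::IRenderpass::SCreationParams::SSubpassDescription subpasses[2] = {};
//   // ... fill subpasses[0] with the real subpass description ...
//   subpasses[1] = nbl::asset::IRenderpass::SCreationParams::SubpassesEnd; // terminator sentinel
//   params.subpasses = subpasses;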
@@ -707,7 +714,7 @@ inline bool IRenderpass::SCreationParams::SSubpassDescription::SDepthStencilAtta template inline bool IRenderpass::SCreationParams::SSubpassDescription::SRenderAttachmentsRef::valid(const typename attachment_ref_t::description_t* descs, const uint32_t attachmentCount) const { - if (!render.valid(descs,attachmentCount) || !resolve.valid(descs,attachmentCount)) + if (!render.template valid(descs,attachmentCount) || !resolve.template valid(descs,attachmentCount)) return false; const bool renderUsed = render.used(); if (resolve.used()) diff --git a/include/nbl/asset/IRenderpassIndependentPipeline.h b/include/nbl/asset/IRenderpassIndependentPipeline.h index 7f33b6abc4..feeaff7c99 100644 --- a/include/nbl/asset/IRenderpassIndependentPipeline.h +++ b/include/nbl/asset/IRenderpassIndependentPipeline.h @@ -28,11 +28,6 @@ class IRenderpassIndependentPipeline SRasterizationParams rasterization = {}; SBlendParams blend = {}; }; - struct SCreationParams - { - std::span shaders = {}; - SCachedCreationParams cached = {}; - }; inline const SCachedCreationParams& getCachedCreationParams() const {return m_cachedParams;} diff --git a/include/nbl/asset/IShader.h b/include/nbl/asset/IShader.h index a6dab09b54..96ff73f3f0 100644 --- a/include/nbl/asset/IShader.h +++ b/include/nbl/asset/IShader.h @@ -27,7 +27,7 @@ namespace nbl::asset The purpose for the class is for storing raw HLSL code to be compiled or already compiled (but unspecialized) SPIR-V code. */ -class IShader : public IAsset +class IShader final : public IAsset { public: enum class E_CONTENT_TYPE : uint8_t @@ -50,9 +50,6 @@ class IShader : public IAsset constexpr static inline auto AssetType = ET_SHADER; inline E_TYPE getAssetType() const override { return AssetType; } - // - inline size_t getDependantCount() const override { return 1; } - // inline core::smart_refctd_ptr clone(uint32_t _depth=~0u) const override { @@ -90,17 +87,29 @@ class IShader : public IAsset // TODO: `void setContent(core::smart_refctd_ptr&&,const E_CONTENT_TYPE)` + inline bool valid() const override + { + if (!m_code) return false; + if (m_contentType == E_CONTENT_TYPE::ECT_UNKNOWN) return false; + return true; + } + // alias for legacy reasons using E_SHADER_STAGE = hlsl::ShaderStage; protected: virtual ~IShader() = default; - inline IAsset* getDependant_impl(const size_t ix) override {return m_code.get();} - std::string m_filepathHint; core::smart_refctd_ptr m_code; E_CONTENT_TYPE m_contentType; + + private: + + inline void visitDependents_impl(std::function visit) const override + { + if (!visit(m_code.get())) return; + } }; } diff --git a/include/nbl/asset/filters/CBlitImageFilter.h b/include/nbl/asset/filters/CBlitImageFilter.h index 1dbc7809ba..f228fea325 100644 --- a/include/nbl/asset/filters/CBlitImageFilter.h +++ b/include/nbl/asset/filters/CBlitImageFilter.h @@ -464,7 +464,7 @@ class CBlitImageFilter : auto phaseCount = IBlitUtilities::getPhaseCount(inExtentLayerCount.xyz, outExtentLayerCount.xyz, inImageType); phaseCount = hlsl::max(phaseCount,hlsl::uint32_t3(1,1,1)); - const auto axisOffsets = blit_utils_t::template getScaledKernelPhasedLUTAxisOffsets(phaseCount,real_window_size); + const auto axisOffsets = blit_utils_t::getScaledKernelPhasedLUTAxisOffsets(phaseCount,real_window_size); constexpr auto MaxAxisCount = 3; lut_value_t* scaledKernelPhasedLUTPixel[MaxAxisCount]; for (auto i = 0; i < MaxAxisCount; ++i) diff --git a/include/nbl/asset/filters/kernels/WeightFunctions.h b/include/nbl/asset/filters/kernels/WeightFunctions.h index 
bb0b8fb9b4..af2782dfac 100644 --- a/include/nbl/asset/filters/kernels/WeightFunctions.h +++ b/include/nbl/asset/filters/kernels/WeightFunctions.h @@ -337,12 +337,12 @@ class CWeightFunction1D final : public impl::IWeightFunction1Dscale(base_t::value_t(1)/stretchFactor); + this->scale(typename base_t::value_t(1)/stretchFactor); } inline base_t::value_t weight(const float x) const { - return static_cast(this->getTotalScale()*function_t::weight(x*this->getInvStretch())); + return static_cast(this->getTotalScale()*function_t::template weight(x*this->getInvStretch())); } // Integral of `weight(x) dx` from -INF to +INF diff --git a/include/nbl/asset/utils/CSPIRVIntrospector.h b/include/nbl/asset/utils/CSPIRVIntrospector.h index 3d6455e020..0d7d678549 100644 --- a/include/nbl/asset/utils/CSPIRVIntrospector.h +++ b/include/nbl/asset/utils/CSPIRVIntrospector.h @@ -208,7 +208,13 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable // `memberStrides[i]` only relevant if `memberTypes[i]->isArray()` inline ptr_t memberStrides() const {return memberOffsets()+memberCount;} using member_matrix_info_t = MatrixInfo; - inline ptr_t memberMatrixInfos() const {return reinterpret_cast&>(memberStrides()+memberCount); } + inline ptr_t memberMatrixInfos() const + { + auto t = memberStrides() + memberCount; + + return reinterpret_cast&>(t); + + } constexpr static inline size_t StoragePerMember = sizeof(member_type_t)+sizeof(member_name_t)+sizeof(member_size_t)+sizeof(member_offset_t)+sizeof(member_stride_t)+sizeof(member_matrix_info_t); @@ -326,8 +332,8 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable template inline std::enable_if_t isLastMemberRuntimeSized() const { - if (type->memberCount) - return type->memberTypes()[type->memberCount-1].count.front().isRuntimeSized(); + if (this->type->memberCount) + return this->type->memberTypes()[this->type->memberCount-1].count.front().isRuntimeSized(); return false; } template @@ -335,9 +341,9 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable { if (isLastMemberRuntimeSized()) { - const auto& lastMember = type->memberTypes()[type->memberCount-1]; + const auto& lastMember = this->type->memberTypes()[this->type->memberCount-1]; assert(!lastMember.count.front().isSpecConstantID); - return sizeWithoutLastMember+lastMemberElementCount*type->memberStrides()[type->memberCount-1]; + return sizeWithoutLastMember+lastMemberElementCount* this->type->memberStrides()[this->type->memberCount-1]; } return sizeWithoutLastMember; } @@ -582,7 +588,7 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable } // returns true if successfully added all the info to self, false if incompatible with what's already in our pipeline or incomplete (e.g. missing spec constants) - bool merge(const CStageIntrospectionData* stageData, const IPipelineBase::SShaderSpecInfo::spec_constant_map_t* specConstants=nullptr); + bool merge(const CStageIntrospectionData* stageData, const ICPUPipelineBase::SShaderSpecInfo::spec_constant_map_t* specConstants=nullptr); // core::smart_refctd_dynamic_array createPushConstantRangesFromIntrospection(core::smart_refctd_ptr& introspection); @@ -643,7 +649,7 @@ class NBL_API2 CSPIRVIntrospector : public core::Uncopyable } //! 
creates pipeline for a single IShader - core::smart_refctd_ptr createApproximateComputePipelineFromIntrospection(const IPipelineBase::SShaderSpecInfo& info, core::smart_refctd_ptr&& layout=nullptr); + core::smart_refctd_ptr createApproximateComputePipelineFromIntrospection(const ICPUPipelineBase::SShaderSpecInfo& info, core::smart_refctd_ptr&& layout=nullptr); #if 0 // wait until Renderpass Indep completely gone and Graphics Pipeline is used in a new way && Graphics Pipeline Libraries struct CShaderStages diff --git a/include/nbl/asset/utils/ISPIRVDebloater.h b/include/nbl/asset/utils/ISPIRVEntryPointTrimmer.h similarity index 72% rename from include/nbl/asset/utils/ISPIRVDebloater.h rename to include/nbl/asset/utils/ISPIRVEntryPointTrimmer.h index f5f87956be..a2e24dabab 100644 --- a/include/nbl/asset/utils/ISPIRVDebloater.h +++ b/include/nbl/asset/utils/ISPIRVEntryPointTrimmer.h @@ -1,5 +1,5 @@ -#ifndef _NBL_ASSET_I_SPIRV_DEBLOATER_H_INCLUDED_ -#define _NBL_ASSET_I_SPIRV_DEBLOATER_H_INCLUDED_ +#ifndef _NBL_ASSET_I_SPIRV_ENTRY_POINT_TRIMMER_H_INCLUDED_ +#define _NBL_ASSET_I_SPIRV_ENTRY_POINT_TRIMMER_H_INCLUDED_ #include "nbl/core/declarations.h" @@ -10,14 +10,14 @@ namespace nbl::asset { -class ISPIRVDebloater final : public core::IReferenceCounted +class ISPIRVEntryPointTrimmer final : public core::IReferenceCounted { public: - ISPIRVDebloater(); + ISPIRVEntryPointTrimmer(); struct Result { - core::smart_refctd_ptr spirv; // nullptr if there is some entry point not found or spirv does not need to be debloated + core::smart_refctd_ptr spirv; // nullptr if there is some entry point not found or spirv does not need to be trimmed bool isSuccess; inline operator bool() const @@ -45,9 +45,9 @@ class ISPIRVDebloater final : public core::IReferenceCounted } }; - Result debloat(const ICPUBuffer* spirvBuffer, const core::set& entryPoints, system::logger_opt_ptr logger = nullptr) const; + Result trim(const ICPUBuffer* spirvBuffer, const core::set& entryPoints, system::logger_opt_ptr logger = nullptr) const; - inline core::smart_refctd_ptr debloat(const IShader* shader, const core::set& entryPoints, system::logger_opt_ptr logger = nullptr) const + inline core::smart_refctd_ptr trim(const IShader* shader, const core::set& entryPoints, system::logger_opt_ptr logger = nullptr) const { if (shader->getContentType() != IShader::E_CONTENT_TYPE::ECT_SPIRV) { @@ -55,10 +55,10 @@ class ISPIRVDebloater final : public core::IReferenceCounted return nullptr; } const auto buffer = shader->getContent(); - const auto result = debloat(buffer, entryPoints, logger); + const auto result = trim(buffer, entryPoints, logger); if (result && result.spirv.get() == nullptr) { - // when debloat does not happen return original shader + // when trim does not happen return original shader return core::smart_refctd_ptr(shader); } diff --git a/include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl index 262cb3c0c7..9088b0c7b4 100644 --- a/include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl +++ b/include/nbl/builtin/hlsl/concepts/accessors/fft.hlsl @@ -1,7 +1,7 @@ #ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_FFT_INCLUDED_ #define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_FFT_INCLUDED_ -#include "nbl/builtin/hlsl/concepts.hlsl" +#include "nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl" #include "nbl/builtin/hlsl/fft/common.hlsl" namespace nbl @@ -17,49 +17,15 @@ namespace fft // * void set(uint32_t index, in uint32_t value); // * void workgroupExecutionAndMemoryBarrier(); 
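Note: the bespoke FFT accessor concepts below are replaced by the shared `GenericSharedMemoryAccessor` / `GenericDataAccessor` concepts, which only require templated `get`/`set` by index plus a barrier method. A CPU-side illustration of a type that would satisfy that shape (hypothetical, plain C++ rather than the HLSL concept macro machinery, and assuming word-sized values):

#include <cstdint>
#include <cstring>

struct ScratchAccessor
{
    uint32_t words[256] = {};

    // read `value` out of shared memory at `index`
    template<typename T, typename I>
    void get(const I index, T& value) { std::memcpy(&value, words + index, sizeof(T)); }

    // write `value` into shared memory at `index`
    template<typename T, typename I>
    void set(const I index, const T& value) { std::memcpy(words + index, &value, sizeof(T)); }

    // on the GPU this would be a workgroup barrier; a no-op suffices for the sketch
    void workgroupExecutionAndMemoryBarrier() {}
};

int main()
{
    ScratchAccessor accessor;
    uint32_t in = 42u, out = 0u;
    accessor.set(uint32_t(3), in);
    accessor.workgroupExecutionAndMemoryBarrier();
    accessor.get(uint32_t(3), out);
    return out == in ? 0 : 1;
}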
-#define NBL_CONCEPT_NAME FFTSharedMemoryAccessor -#define NBL_CONCEPT_TPLT_PRM_KINDS (typename) -#define NBL_CONCEPT_TPLT_PRM_NAMES (T) -#define NBL_CONCEPT_PARAM_0 (accessor, T) -#define NBL_CONCEPT_PARAM_1 (index, uint32_t) -#define NBL_CONCEPT_PARAM_2 (val, uint32_t) -NBL_CONCEPT_BEGIN(3) -#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 -NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) -); -#undef val -#undef index -#undef accessor -#include - +template +NBL_BOOL_CONCEPT FFTSharedMemoryAccessor = concepts::accessors::GenericSharedMemoryAccessor; // The Accessor (for a small FFT) MUST provide the following methods: // * void get(uint32_t index, NBL_REF_ARG(complex_t) value); // * void set(uint32_t index, in complex_t value); -#define NBL_CONCEPT_NAME FFTAccessor -#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename) -#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(Scalar) -#define NBL_CONCEPT_PARAM_0 (accessor, T) -#define NBL_CONCEPT_PARAM_1 (index, uint32_t) -#define NBL_CONCEPT_PARAM_2 (val, complex_t) -NBL_CONCEPT_BEGIN(3) -#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 -#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 -#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 -NBL_CONCEPT_END( - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set >(index, val)), is_same_v, void)) - ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get >(index, val)), is_same_v, void)) -); -#undef val -#undef index -#undef accessor -#include +template +NBL_BOOL_CONCEPT FFTAccessor = concepts::accessors::GenericDataAccessor,I>; } } diff --git a/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl new file mode 100644 index 0000000000..cc22595444 --- /dev/null +++ b/include/nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl @@ -0,0 +1,79 @@ +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_GENERIC_SHARED_DATA_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_GENERIC_SHARED_DATA_INCLUDED_ + +#include "nbl/builtin/hlsl/concepts.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace concepts +{ +namespace accessors +{ + +#define NBL_CONCEPT_NAME GenericSharedMemoryAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V)(I) +#define NBL_CONCEPT_PARAM_0 (accessor, T) +#define NBL_CONCEPT_PARAM_1 (val, V) +#define NBL_CONCEPT_PARAM_2 (index, I) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.workgroupExecutionAndMemoryBarrier()), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +#define NBL_CONCEPT_NAME GenericReadAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V)(I) +#define NBL_CONCEPT_PARAM_0 (accessor, T) 
+#define NBL_CONCEPT_PARAM_1 (val, V) +#define NBL_CONCEPT_PARAM_2 (index, I) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template get(index, val)), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +#define NBL_CONCEPT_NAME GenericWriteAccessor +#define NBL_CONCEPT_TPLT_PRM_KINDS (typename)(typename)(typename) +#define NBL_CONCEPT_TPLT_PRM_NAMES (T)(V)(I) +#define NBL_CONCEPT_PARAM_0 (accessor, T) +#define NBL_CONCEPT_PARAM_1 (val, V) +#define NBL_CONCEPT_PARAM_2 (index, I) +NBL_CONCEPT_BEGIN(3) +#define accessor NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_0 +#define val NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_1 +#define index NBL_CONCEPT_PARAM_T NBL_CONCEPT_PARAM_2 +NBL_CONCEPT_END( + ((NBL_CONCEPT_REQ_EXPR_RET_TYPE)((accessor.template set(index, val)), is_same_v, void)) +); +#undef val +#undef index +#undef accessor +#include + +template +NBL_BOOL_CONCEPT GenericDataAccessor = GenericWriteAccessor && GenericWriteAccessor; + +} +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl new file mode 100644 index 0000000000..267342634f --- /dev/null +++ b/include/nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl @@ -0,0 +1,26 @@ +#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ +#define _NBL_BUILTIN_HLSL_CONCEPTS_ACCESSORS_WORKGROUP_ARITHMETIC_INCLUDED_ + +#include "nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +template +NBL_BOOL_CONCEPT ArithmeticSharedMemoryAccessor = concepts::accessors::GenericSharedMemoryAccessor; + +template +NBL_BOOL_CONCEPT ArithmeticReadOnlyDataAccessor = concepts::accessors::GenericReadAccessor; + +template +NBL_BOOL_CONCEPT ArithmeticDataAccessor = concepts::accessors::GenericDataAccessor; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl index 679fecb697..431ea625bf 100644 --- a/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl +++ b/include/nbl/builtin/hlsl/cpp_compat/impl/intrinsics_impl.hlsl @@ -602,7 +602,7 @@ struct nClamp_helper using return_t = T; static inline return_t __call(const T x, const T _min, const T _max) { - return nMin_helper::_call(nMax_helper::_call(x, _min), _max); + return nMin_helper::_call(nMax_helper::_call(x, _min), _max); } }; diff --git a/include/nbl/builtin/hlsl/memory_accessor.hlsl b/include/nbl/builtin/hlsl/memory_accessor.hlsl index 99ec0736a4..2194b1e917 100644 --- a/include/nbl/builtin/hlsl/memory_accessor.hlsl +++ b/include/nbl/builtin/hlsl/memory_accessor.hlsl @@ -112,8 +112,8 @@ struct StructureOfArrays : impl::StructureOfArraysBase - enable_if_t get(const index_t ix, NBL_REF_ARG(T) value) + template + enable_if_t get(const I ix, NBL_REF_ARG(T) value) { NBL_CONSTEXPR uint64_t SubElementCount = sizeof(T)/sizeof(access_t); // `vector` for now, we'll use `array` later when `bit_cast` gets fixed @@ -123,8 +123,8 @@ struct StructureOfArrays : impl::StructureOfArraysBase >(aux); } - template - enable_if_t set(const index_t ix, NBL_CONST_REF_ARG(T) value) + template + enable_if_t set(const I ix, NBL_CONST_REF_ARG(T) value) { NBL_CONSTEXPR uint64_t 
SubElementCount = sizeof(T)/sizeof(access_t); // `vector` for now, we'll use `array` later when `bit_cast` gets fixed @@ -209,11 +209,11 @@ struct Offset : impl::OffsetBase BaseAccessor accessor; - template - void set(index_t idx, T value) {accessor.set(idx+base_t::offset,value); } + template + void set(I idx, T value) {accessor.set(idx+base_t::offset,value); } - template - void get(index_t idx, NBL_REF_ARG(T) value) {accessor.get(idx+base_t::offset,value);} + template + void get(I idx, NBL_REF_ARG(T) value) {accessor.get(idx+base_t::offset,value);} template enable_if_t< diff --git a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl index 724887b995..3b511126b4 100644 --- a/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl +++ b/include/nbl/builtin/hlsl/subgroup2/ballot.hlsl @@ -4,6 +4,8 @@ #ifndef _NBL_BUILTIN_HLSL_SUBGROUP2_BALLOT_INCLUDED_ #define _NBL_BUILTIN_HLSL_SUBGROUP2_BALLOT_INCLUDED_ +#include "nbl/builtin/hlsl/glsl_compat/subgroup_ballot.hlsl" + namespace nbl { namespace hlsl @@ -11,6 +13,20 @@ namespace hlsl namespace subgroup2 { +template +uint32_t LastSubgroupInvocation() +{ + if (AssumeAllActive) + return glsl::gl_SubgroupSize()-1; + else + return glsl::subgroupBallotFindMSB(glsl::subgroupBallot(true)); +} + +bool ElectLast() +{ + return glsl::gl_SubgroupInvocationID()==LastSubgroupInvocation(); +} + template struct Configuration { diff --git a/include/nbl/builtin/hlsl/tuple.hlsl b/include/nbl/builtin/hlsl/tuple.hlsl new file mode 100644 index 0000000000..a9c26090ea --- /dev/null +++ b/include/nbl/builtin/hlsl/tuple.hlsl @@ -0,0 +1,61 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_TUPLE_INCLUDED_ +#define _NBL_BUILTIN_HLSL_TUPLE_INCLUDED_ + +#include "nbl/builtin/hlsl/type_traits.hlsl" + +namespace nbl +{ +namespace hlsl +{ + +template // TODO: in the future use BOOST_PP to make this +struct tuple +{ + T0 t0; + T1 t1; + T2 t2; +}; + +template +struct tuple_element; + +template +struct tuple +{ + T0 t0; +}; + +template +struct tuple +{ + T0 t0; + T1 t1; +}; +// specializations for less and less void elements + +// base case +template +struct tuple_element<0,tuple > +{ + using type = Head; +}; + +template +struct tuple_element<1,tuple > +{ + using type = Head; +}; + +template +struct tuple_element<2,tuple > +{ + using type = Head; +}; + +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl b/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl index 9aefc3b3d8..652cabd7c7 100644 --- a/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl +++ b/include/nbl/builtin/hlsl/vector_utils/vector_traits.hlsl @@ -28,6 +28,7 @@ struct vector_traits >\ NBL_CONSTEXPR_STATIC_INLINE bool IsVector = true;\ };\ +DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(1) DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(2) DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(3) DEFINE_VECTOR_TRAITS_TEMPLATE_SPECIALIZATION(4) diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl new file mode 100644 index 0000000000..62a9fb7bef --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic.hlsl @@ -0,0 +1,63 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_INCLUDED_ + + +#include "nbl/builtin/hlsl/functional.hlsl" +#include "nbl/builtin/hlsl/concepts/accessors/workgroup_arithmetic.hlsl" +#include "nbl/builtin/hlsl/workgroup2/shared_scan.hlsl" + + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +template) +struct reduction +{ + using scalar_t = typename BinOp::type_t; + + template && ArithmeticSharedMemoryAccessor) + static scalar_t __call(NBL_REF_ARG(ReadOnlyDataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + impl::reduce fn; + return fn.template __call(dataAccessor, scratchAccessor); + } +}; + +template) +struct inclusive_scan +{ + using scalar_t = typename BinOp::type_t; + + template && ArithmeticSharedMemoryAccessor) + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + impl::scan fn; + fn.template __call(dataAccessor, scratchAccessor); + } +}; + +template) +struct exclusive_scan +{ + using scalar_t = typename BinOp::type_t; + + template && ArithmeticSharedMemoryAccessor) + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + impl::scan fn; + fn.template __call(dataAccessor, scratchAccessor); + } +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl new file mode 100644 index 0000000000..9a211899cb --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl @@ -0,0 +1,225 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_CONFIG_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_ARITHMETIC_CONFIG_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/tuple.hlsl" +#include "nbl/builtin/hlsl/mpl.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +namespace impl +{ +template +struct virtual_wg_size_log2 +{ + #define DEFINE_ASSIGN(TYPE,ID,...) NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; + #define MAX(TYPE,ARG1,ARG2) mpl::max_v + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #include "impl/virtual_wg_size_def.hlsl" + #undef SELECT + #undef MAX + #undef DEFINE_ASSIGN + + // must have at least enough level 0 outputs to feed a single subgroup + static_assert(WorkgroupSizeLog2>=SubgroupSizeLog2, "WorkgroupSize cannot be smaller than SubgroupSize"); + static_assert(WorkgroupSizeLog2<=SubgroupSizeLog2*3+4, "WorkgroupSize cannot be larger than (SubgroupSize^3)*16"); +}; + +template +struct items_per_invocation +{ + #define DEFINE_ASSIGN(TYPE,ID,...) 
NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; + #define VIRTUAL_WG_SIZE VirtualWorkgroup:: + #define MIN(TYPE,ARG1,ARG2) mpl::min_v + #define MAX(TYPE,ARG1,ARG2) mpl::max_v + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #include "impl/items_per_invoc_def.hlsl" + #undef SELECT + #undef MAX + #undef MIN + #undef VIRTUAL_WG_SIZE + #undef DEFINE_ASSIGN + + using ItemsPerInvocation = tuple,integral_constant,integral_constant >; +}; +} + +template +struct ArithmeticConfiguration +{ + using virtual_wg_t = impl::virtual_wg_size_log2<_WorkgroupSizeLog2, _SubgroupSizeLog2>; + using items_per_invoc_t = impl::items_per_invocation; + using ItemsPerInvocation = typename items_per_invoc_t::ItemsPerInvocation; + + #define DEFINE_ASSIGN(TYPE,ID,...) NBL_CONSTEXPR_STATIC_INLINE TYPE ID = __VA_ARGS__; + #define VIRTUAL_WG_SIZE virtual_wg_t:: + #define ITEMS_PER_INVOC items_per_invoc_t:: + #define MAX(TYPE,ARG1,ARG2) mpl::max_v + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) conditional_value::value + #include "impl/arithmetic_config_def.hlsl" + #undef SELECT + #undef MAX + #undef ITEMS_PER_INVOC + #undef VIRTUAL_WG_SIZE + #undef DEFINE_ASSIGN + + using ChannelStride = tuple,integral_constant,integral_constant >; // we don't use stride 0 + + static_assert(VirtualWorkgroupSize<=WorkgroupSize*SubgroupSize); + static_assert(ItemsPerInvocation_2<=4, "4 level scan would have been needed with this config!"); + +#ifdef __HLSL_VERSION + static bool electLast() + { + return glsl::gl_SubgroupInvocationID()==SubgroupSize-1; + } +#endif + + // gets a subgroupID as if each workgroup has (VirtualWorkgroupSize/SubgroupSize) subgroups + // each subgroup does work (VirtualWorkgroupSize/WorkgroupSize) times, the index denoted by workgroupInVirtualIndex + static uint16_t virtualSubgroupID(const uint16_t subgroupID, const uint16_t workgroupInVirtualIndex) + { + return workgroupInVirtualIndex * (WorkgroupSize >> SubgroupSizeLog2) + subgroupID; + } + + // get a coalesced index to store for the next level in shared mem, e.g. level 0 -> level 1 + // specify the next level to store values for in template param + // at level==LevelCount-1, it is guaranteed to have SubgroupSize elements + template0 && level::type::value; + const uint16_t outChannel = virtualSubgroupID & (ItemsPerNextInvocation-uint16_t(1u)); + const uint16_t outInvocation = virtualSubgroupID / ItemsPerNextInvocation; + const uint16_t localOffset = outChannel * tuple_element::type::value + outInvocation; + + if (level==2) + { + const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize - uint16_t(1u)) * ItemsPerInvocation_1; + return baseOffset + localOffset; + } + else + { + const uint16_t paddingOffset = virtualSubgroupID / (SubgroupSize * ItemsPerInvocation_1); + return localOffset + paddingOffset; + } + } + + template0 && level(virtualID); + } + + // get the coalesced index in shared mem at the current level + template0 && level::type::value + invocationIndex; + const uint16_t paddingOffset = invocationIndex / SubgroupSize; + + if (level==2) + { + const uint16_t baseOffset = LevelInputCount_1 + (SubgroupSize - uint16_t(1u)) * ItemsPerInvocation_1; + return baseOffset + localOffset + paddingOffset; + } + else + return localOffset + paddingOffset; + } +}; + +#ifndef __HLSL_VERSION +namespace impl +{ +struct SVirtualWGSizeLog2 +{ + void init(const uint16_t _WorkgroupSizeLog2, const uint16_t _SubgroupSizeLog2) + { + #define DEFINE_ASSIGN(TYPE,ID,...) 
ID = __VA_ARGS__; + #define MAX(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + #include "impl/virtual_wg_size_def.hlsl" + #undef SELECT + #undef MAX + #undef DEFINE_ASSIGN + } + + #define DEFINE_ASSIGN(TYPE,ID,...) TYPE ID; + #include "impl/virtual_wg_size_def.hlsl" + #undef DEFINE_ASSIGN +}; + +struct SItemsPerInvoc +{ + void init(const SVirtualWGSizeLog2 virtualWgSizeLog2, const uint16_t BaseItemsPerInvocation) + { + #define DEFINE_ASSIGN(TYPE,ID,...) ID = __VA_ARGS__; + #define VIRTUAL_WG_SIZE virtualWgSizeLog2. + #define MIN(TYPE,ARG1,ARG2) hlsl::min(ARG1, ARG2) + #define MAX(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + #include "impl/items_per_invoc_def.hlsl" + #undef SELECT + #undef MAX + #undef MIN + #undef VIRTUAL_WG_SIZE + #undef DEFINE_ASSIGN + } + + #define DEFINE_ASSIGN(TYPE,ID,...) TYPE ID; + #include "impl/items_per_invoc_def.hlsl" + #undef DEFINE_ASSIGN +}; +} + +struct SArithmeticConfiguration +{ + void init(const uint16_t _WorkgroupSizeLog2, const uint16_t _SubgroupSizeLog2, const uint16_t _ItemsPerInvocation) + { + impl::SVirtualWGSizeLog2 virtualWgSizeLog2; + virtualWgSizeLog2.init(_WorkgroupSizeLog2, _SubgroupSizeLog2); + impl::SItemsPerInvoc itemsPerInvoc; + itemsPerInvoc.init(virtualWgSizeLog2, _ItemsPerInvocation); + + #define DEFINE_ASSIGN(TYPE,ID,...) ID = __VA_ARGS__; + #define VIRTUAL_WG_SIZE virtualWgSizeLog2. + #define ITEMS_PER_INVOC itemsPerInvoc. + #define MAX(TYPE,ARG1,ARG2) hlsl::max(ARG1, ARG2) + #define SELECT(TYPE,COND,TRUE_VAL,FALSE_VAL) (COND ? TRUE_VAL : FALSE_VAL) + #include "impl/arithmetic_config_def.hlsl" + #undef SELECT + #undef MAX + #undef ITEMS_PER_INVOC + #undef VIRTUAL_WG_SIZE + #undef DEFINE_ASSIGN + } + + #define DEFINE_ASSIGN(TYPE,ID,...) TYPE ID; + #include "impl/arithmetic_config_def.hlsl" + #undef DEFINE_ASSIGN +}; +#endif + +template +struct is_configuration : bool_constant {}; + +template +struct is_configuration > : bool_constant {}; + +template +NBL_CONSTEXPR bool is_configuration_v = is_configuration::value; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl new file mode 100644 index 0000000000..94f54409db --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/impl/arithmetic_config_def.hlsl @@ -0,0 +1,34 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +DEFINE_ASSIGN(uint16_t, WorkgroupSizeLog2, _WorkgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, WorkgroupSize, uint16_t(0x1u) << WorkgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, SubgroupSizeLog2, _SubgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, SubgroupSize, uint16_t(0x1u) << SubgroupSizeLog2) + +DEFINE_ASSIGN(uint16_t, LevelCount, VIRTUAL_WG_SIZE levels) +DEFINE_ASSIGN(uint16_t, VirtualWorkgroupSize, uint16_t(0x1u) << VIRTUAL_WG_SIZE value) + +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_0, ITEMS_PER_INVOC value0) +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_1, ITEMS_PER_INVOC value1) +DEFINE_ASSIGN(uint16_t, ItemsPerInvocation_2, ITEMS_PER_INVOC value2) + +DEFINE_ASSIGN(uint16_t, LevelInputCount_1, SELECT(uint16_t,(LevelCount==3), + MAX(uint16_t, (VirtualWorkgroupSize>>SubgroupSizeLog2), SubgroupSize), + SubgroupSize*ItemsPerInvocation_1)) +DEFINE_ASSIGN(uint16_t, LevelInputCount_2, SELECT(uint16_t,(LevelCount==3),SubgroupSize*ItemsPerInvocation_2,0)) +DEFINE_ASSIGN(uint16_t, VirtualInvocationsAtLevel1, LevelInputCount_1 / ItemsPerInvocation_1) + +DEFINE_ASSIGN(uint16_t, __padding, SELECT(uint16_t,(LevelCount==3),SubgroupSize-1,0)) +DEFINE_ASSIGN(uint16_t, __channelStride_1, SELECT(uint16_t,(LevelCount==3),VirtualInvocationsAtLevel1,SubgroupSize) + __padding) +DEFINE_ASSIGN(uint16_t, __channelStride_2, SELECT(uint16_t,(LevelCount==3),SubgroupSize,0)) + +// user specified the shared mem size of Scalars +DEFINE_ASSIGN(uint32_t, SharedScratchElementCount, SELECT(uint16_t,(LevelCount==1), + 0, + SELECT(uint16_t,(LevelCount==3), + LevelInputCount_2+(SubgroupSize*ItemsPerInvocation_1)-1, + 0 + ) + LevelInputCount_1 + )) diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl new file mode 100644 index 0000000000..c32d7ef8bd --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/impl/items_per_invoc_def.hlsl @@ -0,0 +1,8 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +DEFINE_ASSIGN(uint16_t, ItemsPerInvocationProductLog2, MAX(int16_t,VIRTUAL_WG_SIZE WorkgroupSizeLog2-VIRTUAL_WG_SIZE SubgroupSizeLog2*VIRTUAL_WG_SIZE levels,0)) +DEFINE_ASSIGN(uint16_t, value0, BaseItemsPerInvocation) +DEFINE_ASSIGN(uint16_t, value1, uint16_t(0x1u) << SELECT(uint16_t,(VIRTUAL_WG_SIZE levels==3),MIN(uint16_t,ItemsPerInvocationProductLog2,2),ItemsPerInvocationProductLog2)) +DEFINE_ASSIGN(uint16_t, value2, uint16_t(0x1u) << MAX(int16_t,ItemsPerInvocationProductLog2-2,0)) \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl b/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl new file mode 100644 index 0000000000..e4c4047f1d --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/impl/virtual_wg_size_def.hlsl @@ -0,0 +1,8 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +DEFINE_ASSIGN(uint16_t, WorkgroupSizeLog2, _WorkgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, SubgroupSizeLog2, _SubgroupSizeLog2) +DEFINE_ASSIGN(uint16_t, levels, SELECT(uint16_t,(_WorkgroupSizeLog2>_SubgroupSizeLog2),SELECT(uint16_t,(_WorkgroupSizeLog2>_SubgroupSizeLog2*2+2),3,2),1)) +DEFINE_ASSIGN(uint16_t, value, MAX(uint16_t, _SubgroupSizeLog2*levels, _WorkgroupSizeLog2)) diff --git a/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl new file mode 100644 index 0000000000..5b19c55fbd --- /dev/null +++ b/include/nbl/builtin/hlsl/workgroup2/shared_scan.hlsl @@ -0,0 +1,411 @@ +// Copyright (C) 2025 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_WORKGROUP2_SHARED_SCAN_INCLUDED_ +#define _NBL_BUILTIN_HLSL_WORKGROUP2_SHARED_SCAN_INCLUDED_ + +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/subgroup2/ballot.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/mpl.hlsl" +#include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace workgroup2 +{ + +namespace impl +{ + +template +struct reduce; + +template +struct scan; + +// 1-level scans +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_t = vector; // data accessor needs to be this type + // doesn't use scratch smem, should be NOOP accessor + + template + scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + + subgroup2::reduction reduction; + vector_t value; + dataAccessor.template get(uint16_t(glsl::gl_SubgroupInvocationID()), value); + return reduction(value); + } +}; + +template +struct scan +{ + using scalar_t = typename BinOp::type_t; + using vector_t = vector; // data accessor needs to be this type + // doesn't use scratch smem, should be NOOP accessor + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + + vector_t value; + dataAccessor.template get(uint16_t(glsl::gl_SubgroupInvocationID()), value); + if (Exclusive) + { + subgroup2::exclusive_scan excl_scan; + value = excl_scan(value); + } + else + { + subgroup2::inclusive_scan incl_scan; + value = incl_scan(value); + } + dataAccessor.template set(uint16_t(glsl::gl_SubgroupInvocationID()), value); + } +}; + +// do level 0 scans for 2- and 3-level scans (same code) +template +struct reduce_level0 +{ + using scalar_t = typename BinOp::type_t; + using vector_t = vector; // data accessor needs to be this type + + template + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + // level 0 scan + subgroup2::reduction reduction0; + [unroll] + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + 
vector_t scan_local; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, scan_local); + scan_local = reduction0(scan_local); + if (Config::electLast()) + { + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()), idx); + scratchAccessor.template set(bankedIndex, scan_local[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + }; +}; + +template +struct scan_level0 +{ + using scalar_t = typename BinOp::type_t; + using vector_t = vector; // data accessor needs to be this type + + template + static void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_t = subgroup2::ArithmeticParams; + + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + subgroup2::inclusive_scan inclusiveScan0; + // level 0 scan + [unroll] + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + vector_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + value = inclusiveScan0(value); + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + if (Config::electLast()) + { + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()), idx); + scratchAccessor.template set(bankedIndex, value[Config::ItemsPerInvocation_0-1]); // set last element of subgroup scan (reduction) to level 1 scan + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + } +}; + +// 2-level scans +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; + + template + scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + + reduce_level0::template __call(dataAccessor, scratchAccessor); + + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + // level 1 scan + subgroup2::reduction reduction1; + if (glsl::gl_SubgroupID() == 0) + { + vector_lv1_t lv1_val; + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + lv1_val = reduction1(lv1_val); + + if (Config::electLast()) + scratchAccessor.template set(0, lv1_val[Config::ItemsPerInvocation_1-1]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + scalar_t reduce_val; + scratchAccessor.template get(0,reduce_val); + return reduce_val; + } +}; + +template +struct scan +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv1_t = subgroup2::ArithmeticParams; + BinOp binop; + + scan_level0::template __call(dataAccessor, scratchAccessor); + + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + // level 1 scan + subgroup2::inclusive_scan inclusiveScan1; + if 
(glsl::gl_SubgroupID() == 0) + { + vector_lv1_t lv1_val; + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + lv1_val = inclusiveScan1(lv1_val); + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // combine with level 0 + [unroll] + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + vector_lv0_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()-1u), idx); + scalar_t left = BinOp::identity; + if (idx != 0 || glsl::gl_SubgroupID() != 0) + scratchAccessor.template get(bankedIndex,left); + if (Exclusive) + { + scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); + [unroll] + for (uint16_t i = Config::ItemsPerInvocation_0-1; i > 0; i--) + value[i] = binop(left, value[i-1]); + value[0] = binop(left, left_last_elem); + } + else + { + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++) + value[i] = binop(left, value[i]); + } + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + } + } +}; + +// 3-level scans +template +struct reduce +{ + using scalar_t = typename BinOp::type_t; + using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; + using vector_lv2_t = vector; + + template + scalar_t __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv1_t = subgroup2::ArithmeticParams; + using params_lv2_t = subgroup2::ArithmeticParams; + BinOp binop; + + reduce_level0::template __call(dataAccessor, scratchAccessor); + + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + // level 1 scan + subgroup2::reduction reduction1; + if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) + { + vector_lv1_t lv1_val; + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + lv1_val = reduction1(lv1_val); + if (Config::electLast()) + { + const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID())); + scratchAccessor.template set(bankedIndex, lv1_val[Config::ItemsPerInvocation_1-1]); + } + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 2 scan + subgroup2::reduction reduction2; + if (glsl::gl_SubgroupID() == 0) + { + vector_lv2_t lv2_val; + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + lv2_val = reduction2(lv2_val); + if (Config::electLast()) + scratchAccessor.template set(0, lv2_val[Config::ItemsPerInvocation_2-1]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + scalar_t reduce_val; + scratchAccessor.template get(0,reduce_val); + return reduce_val; + } +}; + +template +struct scan +{ + using scalar_t = typename BinOp::type_t; + 
using vector_lv0_t = vector; // data accessor needs to be this type + using vector_lv1_t = vector; + using vector_lv2_t = vector; + + template + void __call(NBL_REF_ARG(DataAccessor) dataAccessor, NBL_REF_ARG(ScratchAccessor) scratchAccessor) + { + using config_t = subgroup2::Configuration; + using params_lv1_t = subgroup2::ArithmeticParams; + using params_lv2_t = subgroup2::ArithmeticParams; + BinOp binop; + + scan_level0::template __call(dataAccessor, scratchAccessor); + + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + // level 1 scan + subgroup2::inclusive_scan inclusiveScan1; + if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) + { + vector_lv1_t lv1_val; + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + lv1_val = inclusiveScan1(lv1_val); + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i),lv1_val[i]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // level 2 scan + subgroup2::inclusive_scan inclusiveScan2; + if (glsl::gl_SubgroupID() == 0) + { + const uint16_t lastChannel = Config::ItemsPerInvocation_1 - uint16_t(1u); + vector_lv2_t lv2_val; + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) + { + const uint16_t inputSubgroupID = invocationIndex * Config::ItemsPerInvocation_2 + i; + const uint16_t inputSubgroupLastInvocation = inputSubgroupID * Config::SubgroupSize + (Config::SubgroupSize - uint16_t(1u)); + scratchAccessor.template get(Config::template sharedLoadIndex<1>(inputSubgroupLastInvocation, lastChannel),lv2_val[i]); + } + lv2_val = inclusiveScan2(lv2_val); + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_2; i++) + scratchAccessor.template set(Config::template sharedLoadIndex<2>(invocationIndex, i),lv2_val[i]); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // combine with level 1 + if (glsl::gl_SubgroupID() < Config::LevelInputCount_2) + { + vector_lv1_t lv1_val; + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template get(Config::template sharedLoadIndex<1>(invocationIndex, i), lv1_val[i]); + + scalar_t lv2_scan = BinOp::identity; + const uint16_t bankedIndex = Config::template sharedStoreIndex<2>(uint16_t(glsl::gl_SubgroupID()-1u)); + if (glsl::gl_SubgroupID() != 0) + scratchAccessor.template get(bankedIndex, lv2_scan); + + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_1; i++) + scratchAccessor.template set(Config::template sharedLoadIndex<1>(invocationIndex, i), binop(lv1_val[i],lv2_scan)); + } + scratchAccessor.workgroupExecutionAndMemoryBarrier(); + + // combine with level 0 + [unroll] + for (uint16_t idx = 0, virtualInvocationIndex = invocationIndex; idx < Config::VirtualWorkgroupSize / Config::WorkgroupSize; idx++) + { + vector_lv0_t value; + dataAccessor.template get(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + + const uint16_t bankedIndex = Config::template sharedStoreIndexFromVirtualIndex<1>(uint16_t(glsl::gl_SubgroupID()-1u), idx); + scalar_t left = BinOp::identity; + if (idx != 0 || glsl::gl_SubgroupID() != 0) + scratchAccessor.template get(bankedIndex,left); + if (Exclusive) + { + scalar_t left_last_elem = hlsl::mix(BinOp::identity, glsl::subgroupShuffleUp(value[Config::ItemsPerInvocation_0-1],1), bool(glsl::gl_SubgroupInvocationID())); + [unroll] + for 
(uint16_t i = Config::ItemsPerInvocation_0-1; i > 0; i--) + value[i] = binop(left, value[i-1]); + value[0] = binop(left, left_last_elem); + } + else + { + [unroll] + for (uint16_t i = 0; i < Config::ItemsPerInvocation_0; i++) + value[i] = binop(left, value[i]); + } + dataAccessor.template set(idx * Config::WorkgroupSize + virtualInvocationIndex, value); + } + } +}; + +} + +} +} +} + +#endif diff --git a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h index 4e7147c904..1abebf23ea 100644 --- a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h +++ b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h @@ -40,7 +40,7 @@ struct ProtoPipeline final inline operator bool() const {return m_vxShader.get();} inline core::smart_refctd_ptr createPipeline( - const asset::IPipelineBase::SShaderSpecInfo& fragShader, + const video::IGPUPipelineBase::SShaderSpecInfo& fragShader, video::IGPUPipelineLayout* layout, video::IGPURenderpass* renderpass, const uint32_t subpassIx=0, @@ -58,17 +58,13 @@ struct ProtoPipeline final { const auto orientationAsUint32 = static_cast(swapchainTransform); - asset::IPipelineBase::SShaderSpecInfo::spec_constant_map_t specConstants; - specConstants[0] = {.data=&orientationAsUint32,.size=sizeof(orientationAsUint32)}; - - const asset::IPipelineBase::SShaderSpecInfo shaders[2] = { - {.shader=m_vxShader.get(), .entryPoint = "main" ,.stage = hlsl::ESS_VERTEX,.entries=&specConstants}, - fragShader - }; + IGPUPipelineBase::SShaderEntryMap specConstants; + specConstants[0] = std::span{ reinterpret_cast(&orientationAsUint32), sizeof(orientationAsUint32)}; IGPUGraphicsPipeline::SCreationParams params[1]; params[0].layout = layout; - params[0].shaders = shaders; + params[0].vertexShader = { .shader = m_vxShader.get(), .entryPoint = "main", .entries = &specConstants }; + params[0].fragmentShader = fragShader; params[0].cached = { .vertexInput = {}, // The Full Screen Triangle doesn't use any HW vertex input state .primitiveAssembly = {}, diff --git a/include/nbl/macros.h b/include/nbl/macros.h index 4927f21899..fe93201a11 100644 --- a/include/nbl/macros.h +++ b/include/nbl/macros.h @@ -81,7 +81,7 @@ //! 
Workarounds for compiler specific bugs // MSVC 2019 is a special snowflake -#if defined(_MSC_VER) && _MSC_VER>=1920 +#if defined(_MSC_VER) && !defined(__clang__) && _MSC_VER>=1920 #define NBL_TYPENAME_4_STTC_MBR typename #else #define NBL_TYPENAME_4_STTC_MBR diff --git a/include/nbl/system/demote_promote_writer_readers_lock.h b/include/nbl/system/demote_promote_writer_readers_lock.h index 6823c26c27..5447e65f3e 100644 --- a/include/nbl/system/demote_promote_writer_readers_lock.h +++ b/include/nbl/system/demote_promote_writer_readers_lock.h @@ -271,7 +271,7 @@ class demote_promote_writer_readers_lock_debug struct DefaultPreemptionCheck { - bool operator()(state_lock_value_t oldState) + bool operator()(const state_lock_value_t oldState) { return false; } @@ -361,13 +361,13 @@ class dpwr_lock_guard_base /** * @brief Checks whether this guard is currently locking the lock `lk` */ - bool hasLocked(dpwr_lock_t& lk) const + bool hasLocked(const dpwr_lock_t& lk) const { return m_lock == &lk; } protected: - dpwr_lock_guard_base(dpwr_lock_t& lk) noexcept : m_lock(&lk) {} + dpwr_lock_guard_base(const dpwr_lock_t& lk) noexcept : m_lock(&lk) {} dpwr_lock_t* m_lock; }; @@ -385,7 +385,7 @@ class dpwr_read_lock_guard_debug : public impl::dpwr_lock_guard_base; using dpwr_write_lock_guard_debug_t = dpwr_write_lock_guard_debug; - dpwr_read_lock_guard_debug(dpwr_lock_t& lk, std::adopt_lock_t) : base_t(lk) {} + dpwr_read_lock_guard_debug(const dpwr_lock_t& lk, std::adopt_lock_t) : base_t(lk) {} explicit dpwr_read_lock_guard_debug(dpwr_lock_t& lk) : dpwr_read_lock_guard_debug(lk, std::adopt_lock_t()) { this->m_lock->read_lock(); @@ -406,7 +406,7 @@ class dpwr_write_lock_guard_debug : public impl::dpwr_lock_guard_base; using dpwr_read_lock_guard_debug_t = dpwr_read_lock_guard_debug; - dpwr_write_lock_guard_debug(dpwr_lock_t& lk, std::adopt_lock_t) : base_t(lk) {} + dpwr_write_lock_guard_debug(const dpwr_lock_t& lk, std::adopt_lock_t) : base_t(lk) {} explicit dpwr_write_lock_guard_debug(dpwr_lock_t& lk) : dpwr_write_lock_guard_debug(lk, std::adopt_lock_t()) { this->m_lock->write_lock(); diff --git a/include/nbl/video/CVulkanDeviceMemoryBacked.h b/include/nbl/video/CVulkanDeviceMemoryBacked.h index c996000e04..e6d17ddf3e 100644 --- a/include/nbl/video/CVulkanDeviceMemoryBacked.h +++ b/include/nbl/video/CVulkanDeviceMemoryBacked.h @@ -47,8 +47,8 @@ class CVulkanDeviceMemoryBacked : public Interface }; #ifndef _NBL_VIDEO_C_VULKAN_DEVICE_MEMORY_BACKED_CPP_ -extern template CVulkanDeviceMemoryBacked; -extern template CVulkanDeviceMemoryBacked; +extern template class CVulkanDeviceMemoryBacked; +extern template class CVulkanDeviceMemoryBacked; #endif } // end namespace nbl::video diff --git a/include/nbl/video/CVulkanRayTracingPipeline.h b/include/nbl/video/CVulkanRayTracingPipeline.h index 82d8c777b6..a9bc476f43 100644 --- a/include/nbl/video/CVulkanRayTracingPipeline.h +++ b/include/nbl/video/CVulkanRayTracingPipeline.h @@ -41,10 +41,13 @@ class CVulkanRayTracingPipeline final : public IGPURayTracingPipeline const VkPipeline m_vkPipeline; ShaderGroupHandleContainer m_shaderGroupHandles; - uint16_t m_raygenStackSize; core::smart_refctd_dynamic_array m_missStackSizes; core::smart_refctd_dynamic_array m_hitGroupStackSizes; core::smart_refctd_dynamic_array m_callableStackSizes; + uint32_t m_missGroupCount; + uint32_t m_hitGroupCount; + uint32_t m_callableGroupCount; + uint16_t m_raygenStackSize; uint32_t getRaygenIndex() const; uint32_t getMissBaseIndex() const; diff --git a/include/nbl/video/IGPUAccelerationStructure.h 
b/include/nbl/video/IGPUAccelerationStructure.h index 5d8f0ca29b..1bb4fb0c66 100644 --- a/include/nbl/video/IGPUAccelerationStructure.h +++ b/include/nbl/video/IGPUAccelerationStructure.h @@ -45,7 +45,7 @@ class IGPUAccelerationStructure : public IBackendObject #endif //! builds - template + template requires (!std::is_const_v && std::is_base_of_v) struct BuildInfo { public: @@ -98,39 +98,6 @@ class IGPUAccelerationStructure : public IBackendObject } }; - // copies - enum class COPY_MODE : uint8_t - { - CLONE = 0, - COMPACT = 1, - SERIALIZE = 2, - DESERIALIZE = 3, - }; - struct CopyInfo - { - const IGPUAccelerationStructure* src = nullptr; - IGPUAccelerationStructure* dst = nullptr; - COPY_MODE mode = COPY_MODE::CLONE; - }; - template - struct CopyToMemoryInfo - { - const IGPUAccelerationStructure* src = nullptr; - asset::SBufferBinding dst = nullptr; - COPY_MODE mode = COPY_MODE::SERIALIZE; - }; - using DeviceCopyToMemoryInfo = CopyToMemoryInfo; - using HostCopyToMemoryInfo = CopyToMemoryInfo; - template - struct CopyFromMemoryInfo - { - asset::SBufferBinding src = nullptr; - IGPUAccelerationStructure* dst = nullptr; - COPY_MODE mode = COPY_MODE::DESERIALIZE; - }; - using DeviceCopyFromMemoryInfo = CopyFromMemoryInfo; - using HostCopyFromMemoryInfo = CopyFromMemoryInfo; - // this will return false also if your deferred operation is not ready yet, so please use in combination with `isPending()` virtual bool wasCopySuccessful(const IDeferredOperation* const deferredOp) = 0; @@ -176,12 +143,36 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat inline bool usesMotion() const override {return m_params.flags.hasFlags(SCreationParams::FLAGS::MOTION_BIT);} + // copies + struct CopyInfo + { + const IGPUBottomLevelAccelerationStructure* src = nullptr; + IGPUAccelerationStructure* dst = nullptr; + bool compact = false; + }; + template requires (!std::is_const_v && std::is_base_of_v) + struct CopyToMemoryInfo + { + const IGPUBottomLevelAccelerationStructure* src = nullptr; + asset::SBufferBinding dst = nullptr; + }; + using DeviceCopyToMemoryInfo = CopyToMemoryInfo; + using HostCopyToMemoryInfo = CopyToMemoryInfo; + template requires (!std::is_const_v && std::is_base_of_v) + struct CopyFromMemoryInfo + { + asset::SBufferBinding src = nullptr; + IGPUBottomLevelAccelerationStructure* dst = nullptr; + }; + using DeviceCopyFromMemoryInfo = CopyFromMemoryInfo; + using HostCopyFromMemoryInfo = CopyFromMemoryInfo; + // read the comments in the .hlsl file, AABB builds ignore certain fields - using BuildRangeInfo = hlsl::acceleration_structures::bottom_level::BuildRangeInfo; + using BuildRangeInfo = hlsl::acceleration_structures::bottom_level::BuildRangeInfo; // TODO: rename to GeometryRangeInfo, and make `BuildRangeInfo = const GeometryRangeInfo*` using DirectBuildRangeRangeInfos = const BuildRangeInfo* const*; using MaxInputCounts = const uint32_t* const; - template + template requires (!std::is_const_v && std::is_base_of_v) struct BuildInfo final : IGPUAccelerationStructure::BuildInfo { private: @@ -203,7 +194,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat NBL_API2 uint32_t valid(const T* const buildRangeInfosOrMaxPrimitiveCounts) const; // really expensive to call, `valid` only calls it when `_NBL_DEBUG` is defined - inline bool validGeometry(size_t& totalPrims, const AABBs& geometry, const BuildRangeInfo& buildRangeInfo) const + inline bool validGeometry(size_t& totalPrims, const AABBs& geometry, const BuildRangeInfo& buildRangeInfo) 
const { constexpr size_t AABBalignment = 8ull; // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkAccelerationStructureBuildRangeInfoKHR-primitiveOffset-03659 @@ -222,7 +213,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat totalPrims += buildRangeInfo.primitiveCount; return true; } - inline bool validGeometry(size_t& totalPrims, const Triangles& geometry, const BuildRangeInfo& buildRangeInfo) const + inline bool validGeometry(size_t& totalPrims, const Triangles& geometry, const BuildRangeInfo& buildRangeInfo) const { // if (!dstAS->validVertexFormat(geometry.vertexFormat)) @@ -306,7 +297,7 @@ class IGPUBottomLevelAccelerationStructure : public asset::IBottomLevelAccelerat *(oit++) = core::smart_refctd_ptr(srcAS); *(oit++) = core::smart_refctd_ptr(dstAS); - if (buildFlags.hasFlags(IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + if (buildFlags.hasFlags(asset::IBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) { for (auto i=0u; i* triangles = nullptr; - const AABBs* aabbs; + const Triangles* triangles = nullptr; + const AABBs* aabbs; }; }; using DeviceBuildInfo = BuildInfo; @@ -388,12 +379,43 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // inline uint32_t getMaxInstanceCount() const {return m_maxInstanceCount;} + // + using blas_smart_ptr_t = core::smart_refctd_ptr; + + // copies + struct CopyInfo + { + const IGPUTopLevelAccelerationStructure* src = nullptr; + IGPUTopLevelAccelerationStructure* dst = nullptr; + bool compact = false; + }; + template requires (!std::is_const_v && std::is_base_of_v) + struct CopyToMemoryInfo + { + const IGPUTopLevelAccelerationStructure* src = nullptr; + asset::SBufferBinding dst = nullptr; + // [optional] Query the tracked BLASes + core::smart_refctd_dynamic_array trackedBLASes = nullptr; + }; + using DeviceCopyToMemoryInfo = CopyToMemoryInfo; + using HostCopyToMemoryInfo = CopyToMemoryInfo; + template requires (!std::is_const_v && std::is_base_of_v) + struct CopyFromMemoryInfo + { + asset::SBufferBinding src = nullptr; + IGPUTopLevelAccelerationStructure* dst = nullptr; + // [optional] Provide info about what BLAS references to hold onto after the copy. For performance make sure the list is compact (without repeated elements). 
+ std::span trackedBLASes = {}; + }; + using DeviceCopyFromMemoryInfo = CopyFromMemoryInfo; + using HostCopyFromMemoryInfo = CopyFromMemoryInfo; + // read the comments in the .hlsl file using BuildRangeInfo = hlsl::acceleration_structures::top_level::BuildRangeInfo; using DirectBuildRangeRangeInfos = const BuildRangeInfo*; using MaxInputCounts = const uint32_t; - template + template requires (!std::is_const_v && std::is_base_of_v) struct BuildInfo final : IGPUAccelerationStructure::BuildInfo { private: @@ -638,6 +660,9 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // I don't do an actual union because the preceeding members don't play nicely with alignment of `core::matrix3x4SIMD` and Vulkan requires this struct to be packed SRTMotionInstance largestUnionMember = {}; static_assert(alignof(SRTMotionInstance)==8ull); + + public: + constexpr static inline size_t LargestUnionMemberSize = sizeof(largestUnionMember); }; using DevicePolymorphicInstance = PolymorphicInstance; using HostPolymorphicInstance = PolymorphicInstance; @@ -664,69 +689,108 @@ class IGPUTopLevelAccelerationStructure : public asset::ITopLevelAccelerationStr // using build_ver_t = uint32_t; + // + inline build_ver_t getPendingBuildVer() const {return m_pendingBuildVer;} // this gets called when execution is sure to happen 100%, e.g. not during command recording but during submission inline build_ver_t registerNextBuildVer() { - return m_pendingBuildVer++; + return ++m_pendingBuildVer; } - // - using blas_smart_ptr_t = core::smart_refctd_ptr; // returns number of tracked BLASes if `tracked==nullptr` otherwise writes `*count` tracked BLASes from `first` into `*tracked` - inline build_ver_t getTrackedBLASes(uint32_t* count, blas_smart_ptr_t* tracked, const uint32_t first=0) const + inline void getPendingBuildTrackedBLASes(uint32_t* count, blas_smart_ptr_t* tracked, const build_ver_t buildVer) const { if (!count) - return 0; + return; // stop multiple threads messing with us std::lock_guard lk(m_trackingLock); - const uint32_t toWrite = std::min(std::max(m_trackedBLASes.size(),first)-first,tracked ? 
(*count):0xffFFffFFu); - *count = toWrite; - if (tracked && toWrite) - { - auto it = m_trackedBLASes.begin(); - // cmon its an unordered map, iterator should have operator += - for (auto i=0; isize():0; + if (!tracked || !pBLASes) + return; + auto it = pBLASes->begin(); + for (auto i = 0; i - inline bool setTrackedBLASes(const Iterator begin, const Iterator end, const build_ver_t buildVer) + inline void insertTrackedBLASes(const Iterator begin, const Iterator end, const build_ver_t buildVer) { + if (buildVer==0) + return; // stop multiple threads messing with us std::lock_guard lk(m_trackingLock); - // stop out of order callbacks - if (buildVer<=m_completedBuildVer) - return false; - m_completedBuildVer = buildVer; - // release already tracked BLASes - m_trackedBLASes.clear(); - // sanity check, TODO: this should be an atomic_max on the `m_pendingBuildVer` - if (m_completedBuildVer>m_pendingBuildVer) - m_pendingBuildVer = m_completedBuildVer; + // insert in the right order + auto prev = m_pendingBuilds.before_begin(); + for (auto it=std::next(prev); it!=m_pendingBuilds.end()&&it->ordinal>buildVer; prev=it++) {} + auto inserted = m_pendingBuilds.emplace_after(prev); // now fill the contents - m_trackedBLASes.insert(begin,end); - return true; + inserted->BLASes.insert(begin,end); + inserted->ordinal = buildVer; } - // a little utility to make sure nothing from this build version and before gets tracked - inline bool clearTrackedBLASes(const build_ver_t buildVer) + template + inline build_ver_t pushTrackedBLASes(const Iterator begin, const Iterator end) + { + const auto buildVer = registerNextBuildVer(); + insertTrackedBLASes(begin,end,buildVer); + return buildVer; + } + // a little utility to make sure nothing from before this build version gets tracked + inline void clearTrackedBLASes(const build_ver_t buildVer) { - return setTrackedBLASes(nullptr,nullptr,buildVer); + // stop multiple threads messing with us + std::lock_guard lk(m_trackingLock); + clearTrackedBLASes_impl(buildVer); } protected: inline IGPUTopLevelAccelerationStructure(core::smart_refctd_ptr&& dev, SCreationParams&& params) : Base(), IGPUAccelerationStructure(std::move(dev),std::move(params)), - m_maxInstanceCount(params.maxInstanceCount),m_trackedBLASes() {} - + m_maxInstanceCount(params.maxInstanceCount) {} const uint32_t m_maxInstanceCount; + + private: + struct DynamicUpCastingSpanIterator + { + inline bool operator!=(const DynamicUpCastingSpanIterator& other) const {return ptr!=other.ptr;} + + inline DynamicUpCastingSpanIterator operator++() {return {ptr++};} + + inline const IGPUBottomLevelAccelerationStructure* operator*() const {return dynamic_cast(ptr->get());} + + std::span>::iterator ptr; + }; + friend class ILogicalDevice; + friend class IQueue; + inline const core::unordered_set* getPendingBuildTrackedBLASes(const build_ver_t buildVer) const + { + const auto found = std::find_if(m_pendingBuilds.begin(),m_pendingBuilds.end(),[buildVer](const auto& item)->bool{return item.ordinal==buildVer;}); + if (found==m_pendingBuilds.end()) + return nullptr; + return &found->BLASes; + } + inline void clearTrackedBLASes_impl(const build_ver_t buildVer) + { + // find first element less or equal to `buildVer` + auto prev = m_pendingBuilds.before_begin(); + for (auto it=std::next(prev); it!=m_pendingBuilds.end()&&it->ordinal>=buildVer; prev=it++) {} + m_pendingBuilds.erase_after(prev,m_pendingBuilds.end()); + } + + std::atomic m_pendingBuildVer = 0; // TODO: maybe replace with new readers/writers lock mutable std::mutex 
m_trackingLock; - std::atomic m_pendingBuildVer = 0; - build_ver_t m_completedBuildVer = 0; - core::unordered_set m_trackedBLASes; + // TODO: this definitely needs improving with MultiEventTimelines (which also can track deferred Host ops) but then one needs to track semaphore signal-wait deps so we know what "state copy" a compaction wants + // Deferred Op must complete AFTER a submit, otherwise race condition. + // If we make a linked list of pending builds, then we just need to pop completed builds (traverse until current found) + struct STrackingInfo + { + core::unordered_set BLASes; + // when the build got + build_ver_t ordinal; + }; + // a little misleading, the element is the most recently completed one + core::forward_list m_pendingBuilds; }; } diff --git a/include/nbl/video/IGPUCommandBuffer.h b/include/nbl/video/IGPUCommandBuffer.h index f79ed17a50..bb6460754a 100644 --- a/include/nbl/video/IGPUCommandBuffer.h +++ b/include/nbl/video/IGPUCommandBuffer.h @@ -92,7 +92,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject case STATE::EXECUTABLE: [[fallthrough]]; case STATE::PENDING: - if (m_noCommands) + if (!m_noCommands) return false; [[fallthrough]]; default: @@ -260,13 +260,21 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject inline bool buildAccelerationStructures(const std::span infos, const IGPUBottomLevelAccelerationStructure::DirectBuildRangeRangeInfos buildRangeInfos) { if (const auto totalGeometryCount=buildAccelerationStructures_common(infos,buildRangeInfos); totalGeometryCount) - return buildAccelerationStructures_impl(infos,buildRangeInfos,totalGeometryCount); + if (buildAccelerationStructures_impl(infos,buildRangeInfos,totalGeometryCount)) + { + m_noCommands = false; + return true; + } return false; } inline bool buildAccelerationStructures(const std::span infos, const IGPUTopLevelAccelerationStructure::DirectBuildRangeRangeInfos buildRangeInfos) { if (buildAccelerationStructures_common(infos,buildRangeInfos)) - return buildAccelerationStructures_impl(infos,buildRangeInfos); + if (buildAccelerationStructures_impl(infos,buildRangeInfos)) + { + m_noCommands = false; + return true; + } return false; } // We don't allow different indirect command addresses due to https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkCmdBuildAccelerationStructuresIndirectKHR-pIndirectDeviceAddresses-03646 @@ -299,18 +307,25 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject if (const auto totalGeometryCount=buildAccelerationStructures_common(infos,maxPrimitiveOrInstanceCounts,indirectRangeBuffer); totalGeometryCount) { + bool success; if constexpr(std::is_same_v) - return buildAccelerationStructuresIndirect_impl(indirectRangeBuffer,infos,pIndirectOffsets,pIndirectStrides,maxPrimitiveOrInstanceCounts,totalGeometryCount); + success = buildAccelerationStructuresIndirect_impl(indirectRangeBuffer,infos,pIndirectOffsets,pIndirectStrides,maxPrimitiveOrInstanceCounts,totalGeometryCount); else - return buildAccelerationStructuresIndirect_impl(indirectRangeBuffer,infos,pIndirectOffsets,pIndirectStrides,maxPrimitiveOrInstanceCounts); + success = buildAccelerationStructuresIndirect_impl(indirectRangeBuffer,infos,pIndirectOffsets,pIndirectStrides,maxPrimitiveOrInstanceCounts); + if (success) + m_noCommands = false; + return success; } return false; } //! 
acceleration structure transfers - bool copyAccelerationStructure(const IGPUAccelerationStructure::CopyInfo& copyInfo); - bool copyAccelerationStructureToMemory(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo); - bool copyAccelerationStructureFromMemory(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo); + template requires std::is_base_of_v + bool copyAccelerationStructure(const AccelerationStructure::CopyInfo& copyInfo); + template requires std::is_base_of_v + bool copyAccelerationStructureToMemory(const AccelerationStructure::DeviceCopyToMemoryInfo& copyInfo); + template requires std::is_base_of_v + bool copyAccelerationStructureFromMemory(const AccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo); //! state setup bool bindComputePipeline(const IGPUComputePipeline* const pipeline); @@ -536,7 +551,31 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject bool executeCommands(const uint32_t count, IGPUCommandBuffer* const* const cmdbufs); // in case you want the commandbuffer to hold onto things as long as its not RESET - bool recordReferences(const std::span refs); + template + inline bool recordReferences(Iterator begin, const Iterator end) + { + auto oit = reserveReferences(std::distance(begin,end)); + if (oit) + while (begin!=end) + *(oit++) = core::smart_refctd_ptr(*(begin++)); + return oit; + } + inline bool recordReferences(const std::span refs) {return recordReferences(refs.begin(),refs.end());} + + // in case you want the commandbuffer to overwrite the BLAS tracking, e.g. you recorded TLAS building commands directly using `getNativeHandle()` to get the commandbuffer + template + inline bool recordBLASReferenceOverwrite(IGPUTopLevelAccelerationStructure* tlas, Iterator beginBLASes, const Iterator endBLASes) + { + const auto size = std::distance(beginBLASes,endBLASes); + auto oit = reserveReferences(size); + if (oit) + { + m_TLASTrackingOps.emplace_back(TLASTrackingWrite{.src={oit,size},.dst=tlas}); + while (beginBLASes!=endBLASes) + *(oit++) = core::smart_refctd_ptr(*(beginBLASes++)); + } + return oit; + } virtual bool insertDebugMarker(const char* name, const core::vector4df_SIMD& color = core::vector4df_SIMD(1.0, 1.0, 1.0, 1.0)) = 0; virtual bool beginDebugMarker(const char* name, const core::vector4df_SIMD& color = core::vector4df_SIMD(1.0, 1.0, 1.0, 1.0)) = 0; @@ -627,9 +666,9 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject const uint64_t* const pIndirectOffsets, const uint32_t* const pIndirectStrides, const uint32_t* const pMaxInstanceCounts ) = 0; - virtual bool copyAccelerationStructure_impl(const IGPUAccelerationStructure::CopyInfo& copyInfo) = 0; - virtual bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) = 0; - virtual bool copyAccelerationStructureFromMemory_impl(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) = 0; + virtual bool copyAccelerationStructure_impl(const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) = 0; + virtual bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) = 0; + virtual bool copyAccelerationStructureFromMemory_impl(const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) = 0; virtual bool bindComputePipeline_impl(const IGPUComputePipeline* const pipeline) = 0; virtual bool bindGraphicsPipeline_impl(const IGPUGraphicsPipeline* const pipeline) = 0; @@ -710,7 +749,7 @@ class NBL_API2 IGPUCommandBuffer : 
public IBackendObject m_state = STATE::INITIAL; m_boundDescriptorSetsRecord.clear(); - m_TLASToBLASReferenceSets.clear(); + m_TLASTrackingOps.clear(); m_boundGraphicsPipeline= nullptr; m_boundComputePipeline= nullptr; m_boundRayTracingPipeline= nullptr; @@ -728,7 +767,7 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject { deleteCommandList(); m_boundDescriptorSetsRecord.clear(); - m_TLASToBLASReferenceSets.clear(); + m_TLASTrackingOps.clear(); m_boundGraphicsPipeline= nullptr; m_boundComputePipeline= nullptr; m_boundRayTracingPipeline= nullptr; @@ -862,16 +901,33 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject template requires nbl::is_any_of_v bool invalidDrawIndirectCount(const asset::SBufferBinding& indirectBinding, const asset::SBufferBinding& countBinding, const uint32_t maxDrawCount, const uint32_t stride); + core::smart_refctd_ptr* reserveReferences(const uint32_t size); // This bound descriptor set record doesn't include the descriptor sets whose layout has _any_ one of its bindings // created with IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT // or IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT. core::unordered_map m_boundDescriptorSetsRecord; - - // If the user wants the builds to be tracking, and make the TLAS remember the BLASes that have been built into it. - // NOTE: We know that a TLAS may be rebuilt multiple times per frame on purpose and not only the final BLASes need to be kept alive till submission finishes. - // However, the Command Pool already tracks resources referenced in the Build Infos, so we only need pointers into those records. - core::unordered_map> m_TLASToBLASReferenceSets; + + // If the user wants the builds and copies to be tracking, and make the TLAS remember the BLASes that have been built into it. + // The Command Pool already tracks resources referenced in the Build Infos or Copies From Memory (Deserializations), so we only need pointers into those records. 
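The recorded operations come in the three shapes declared just below: a write (a BLAS set captured at record time), a TLAS-to-TLAS copy, and a read that dumps the current set into a dynamic array when serializing. The write path is also what `recordBLASReferenceOverwrite` above feeds when the user recorded the TLAS build commands through the native handle; a rough usage sketch, assuming a hypothetical `cmdbuf`, `tlas`, `logger` and BLAS pointers:

    // not part of the patch, just an illustration of the override path
    const IGPUBottomLevelAccelerationStructure* blases[] = {blasA,blasB};
    if (!cmdbuf->recordBLASReferenceOverwrite(tlas.get(),std::begin(blases),std::end(blases)))
        logger->log("Failed to reserve space for the BLAS references!",system::ILogger::ELL_ERROR);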
+ struct TLASTrackingWrite + { + std::span> src; + IGPUTopLevelAccelerationStructure* dst; + }; + struct TLASTrackingCopy + { + const IGPUTopLevelAccelerationStructure* src; + IGPUTopLevelAccelerationStructure* dst; + }; + struct TLASTrackingRead + { + const IGPUTopLevelAccelerationStructure* src; + // For a copy to memory (Serialization), we need to dump the BLASes references + core::smart_refctd_dynamic_array dst; + }; + // operations as they'll be performed in order + core::vector> m_TLASTrackingOps; const IGPUGraphicsPipeline* m_boundGraphicsPipeline; const IGPUComputePipeline* m_boundComputePipeline; @@ -892,6 +948,13 @@ class NBL_API2 IGPUCommandBuffer : public IBackendObject NBL_ENUM_ADD_BITWISE_OPERATORS(IGPUCommandBuffer::USAGE); #ifndef _NBL_VIDEO_I_GPU_COMMAND_BUFFER_CPP_ +extern template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUBottomLevelAccelerationStructure::CopyInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUTopLevelAccelerationStructure::CopyInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyToMemoryInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyToMemoryInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); +extern template bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); + extern template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUBottomLevelAccelerationStructure::DirectBuildRangeRangeInfos, const IGPUBuffer* const ); diff --git a/include/nbl/video/IGPUComputePipeline.h b/include/nbl/video/IGPUComputePipeline.h index 49e44dfcc1..1b6cbd69f2 100644 --- a/include/nbl/video/IGPUComputePipeline.h +++ b/include/nbl/video/IGPUComputePipeline.h @@ -6,20 +6,21 @@ #include "nbl/asset/IPipeline.h" +#include "nbl/asset/IComputePipeline.h" -#include "nbl/video/SPipelineCreationParams.h" +#include "nbl/video/IGPUPipeline.h" #include "nbl/video/SPipelineCreationParams.h" namespace nbl::video { -class IGPUComputePipeline : public IBackendObject, public asset::IPipeline +class IGPUComputePipeline : public IGPUPipeline> { - using pipeline_t = asset::IPipeline; + using pipeline_t = asset::IComputePipeline; public: - struct SCreationParams final : pipeline_t::SCreationParams, SPipelineCreationParams + struct SCreationParams final : SPipelineCreationParams { // By construction we satisfy from: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkComputePipelineCreateInfo.html#VUID-VkComputePipelineCreateInfo-flags-03365 @@ -28,7 +29,7 @@ class IGPUComputePipeline : public IBackendObject, public asset::IPipeline(pipeline_t::SCreationParams::FLAGS::F) + #define base_flag(F) static_cast(pipeline_t::FLAGS::F) enum class FLAGS : uint64_t { NONE = base_flag(NONE), @@ -46,28 +47,35 @@ class IGPUComputePipeline : public IBackendObject, public asset::IPipeline getRequiredSubgroupStages() const + { + if (shader.shader && shader.requiredSubgroupSize >= asset::IPipelineBase::SUBGROUP_SIZE::REQUIRE_4) { - if (shader.entries->size()>0x7fffffff) - return {}; - count = static_cast(shader.entries->size()); + return hlsl::ESS_COMPUTE; } - return {.count=dataSize ? 
count:0,.dataSize=static_cast(dataSize)}; + return {}; } - inline std::span getShaders() const {return {&shader,1}; } - + IGPUPipelineLayout* layout = nullptr; // TODO: Could guess the required flags from SPIR-V introspection of declared caps core::bitflag flags = FLAGS::NONE; - IPipelineBase::SShaderSpecInfo shader = {}; + SCachedCreationParams cached = {}; + SShaderSpecInfo shader = {}; }; inline core::bitflag getCreationFlags() const {return m_flags;} @@ -76,10 +84,9 @@ class IGPUComputePipeline : public IBackendObject, public asset::IPipeline&& _layout, const core::bitflag _flags) : - IBackendObject(core::smart_refctd_ptr(_layout->getOriginDevice())), - pipeline_t(std::move(_layout)), - m_flags(_flags) {} + inline IGPUComputePipeline(const SCreationParams& params) : + IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), params.layout, params.cached), m_flags(params.flags) + {} virtual ~IGPUComputePipeline() = default; const core::bitflag m_flags; diff --git a/include/nbl/video/IGPUGraphicsPipeline.h b/include/nbl/video/IGPUGraphicsPipeline.h index 8240bcea94..7027252b0f 100644 --- a/include/nbl/video/IGPUGraphicsPipeline.h +++ b/include/nbl/video/IGPUGraphicsPipeline.h @@ -6,20 +6,21 @@ #include "nbl/video/IGPUPipelineLayout.h" #include "nbl/video/IGPURenderpass.h" +#include "nbl/video/IGPUPipeline.h" namespace nbl::video { -class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipeline +class IGPUGraphicsPipeline : public IGPUPipeline> { using pipeline_t = asset::IGraphicsPipeline; public: - struct SCreationParams final : pipeline_t::SCreationParams, SPipelineCreationParams - { + struct SCreationParams final : public SPipelineCreationParams + { public: - #define base_flag(F) static_cast(pipeline_t::SCreationParams::FLAGS::F) + #define base_flag(F) static_cast(pipeline_t::FLAGS::F) enum class FLAGS : uint64_t { NONE = base_flag(NONE), @@ -36,30 +37,79 @@ class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipel if (!layout) return {}; SSpecializationValidationResult retval = {.count=0,.dataSize=0}; - const bool valid = pipeline_t::SCreationParams::impl_valid([&retval](const IPipelineBase::SShaderSpecInfo& info)->bool + if (!layout) + return {}; + + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkGraphicsPipelineCreateInfo.html#VUID-VkGraphicsPipelineCreateInfo-dynamicRendering-06576 + if (!renderpass || cached.subpassIx>=renderpass->getSubpassCount()) + return {}; + + // TODO: check rasterization samples, etc. + //rp->getCreationParameters().subpasses[i] + + core::bitflag stagePresence = {}; + + auto processSpecInfo = [&](const SShaderSpecInfo& specInfo, hlsl::ShaderStage stage) { - const auto dataSize = info.valid(); - if (dataSize<0) - return false; - else if (dataSize==0) - return true; - - const size_t count = info.entries ? info.entries->size():0x80000000ull; - if (count>0x7fffffff) - return {}; - retval += {.count=dataSize ? 
static_cast(count):0,.dataSize=static_cast(dataSize)}; - return retval; - }); - if (!valid) + if (!specInfo.shader) return true; + if (!specInfo.accumulateSpecializationValidationResult(&retval)) return false; + stagePresence |= stage; + return true; + }; + if (!processSpecInfo(vertexShader, hlsl::ShaderStage::ESS_VERTEX)) return {}; + if (!processSpecInfo(tesselationControlShader, hlsl::ShaderStage::ESS_TESSELLATION_CONTROL)) return {}; + if (!processSpecInfo(tesselationEvaluationShader, hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION)) return {}; + if (!processSpecInfo(geometryShader, hlsl::ShaderStage::ESS_GEOMETRY)) return {}; + if (!processSpecInfo(fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT)) return {}; + + if (!hasRequiredStages(stagePresence, cached.primitiveAssembly.primitiveType)) return {}; + + if (!vertexShader.shader) return {}; + return retval; } - inline std::span getShaders() const {return shaders;} + inline core::bitflag getRequiredSubgroupStages() const + { + core::bitflag stages = {}; + auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) + { + if (spec.shader && spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { + stages |= stage; + } + }; + processSpecInfo(vertexShader, hlsl::ESS_VERTEX); + processSpecInfo(tesselationControlShader, hlsl::ESS_TESSELLATION_CONTROL); + processSpecInfo(tesselationEvaluationShader, hlsl::ESS_TESSELLATION_EVALUATION); + processSpecInfo(geometryShader, hlsl::ESS_GEOMETRY); + processSpecInfo(fragmentShader, hlsl::ESS_FRAGMENT); + return stages; + } + + IGPUPipelineLayout* layout = nullptr; + SShaderSpecInfo vertexShader; + SShaderSpecInfo tesselationControlShader; + SShaderSpecInfo tesselationEvaluationShader; + SShaderSpecInfo geometryShader; + SShaderSpecInfo fragmentShader; + SCachedCreationParams cached = {}; + renderpass_t* renderpass = nullptr; // TODO: Could guess the required flags from SPIR-V introspection of declared caps core::bitflag flags = FLAGS::NONE; - }; + + inline uint32_t getShaderCount() const + { + uint32_t count = 0; + count += (vertexShader.shader != nullptr); + count += (tesselationControlShader.shader != nullptr); + count += (tesselationEvaluationShader.shader != nullptr); + count += (geometryShader.shader != nullptr); + count += (fragmentShader.shader != nullptr); + return count; + } + }; inline core::bitflag getCreationFlags() const {return m_flags;} @@ -67,9 +117,10 @@ class IGPUGraphicsPipeline : public IBackendObject, public asset::IGraphicsPipel virtual const void* getNativeHandle() const = 0; protected: - IGPUGraphicsPipeline(const SCreationParams& params) : IBackendObject(core::smart_refctd_ptr(params.layout->getOriginDevice())), - pipeline_t(params), m_flags(params.flags) {} - virtual ~IGPUGraphicsPipeline() = default; + IGPUGraphicsPipeline(const SCreationParams& params) : + IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), params.layout, params.cached, params.renderpass), m_flags(params.flags) + {} + virtual ~IGPUGraphicsPipeline() override = default; const core::bitflag m_flags; }; diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h new file mode 100644 index 0000000000..c22ad998db --- /dev/null +++ b/include/nbl/video/IGPUPipeline.h @@ -0,0 +1,149 @@ + + +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
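With the shader stages now explicit members of the graphics pipeline creation parameters above (instead of a single span), a caller fills in only the stages it uses and lets `valid()` enforce stage presence against the primitive topology. A minimal sketch, with `layout`, `renderpass`, `vsSpec`, `fsSpec` and `logger` as hypothetical caller-side objects:

    // not part of the patch, just an illustration of the intended call pattern
    IGPUGraphicsPipeline::SCreationParams params = {};
    params.layout = layout.get();
    params.renderpass = renderpass.get();
    params.cached.subpassIx = 0; // must be below renderpass->getSubpassCount() or valid() fails
    params.vertexShader = vsSpec; // a SShaderSpecInfo with at least `shader` and `entryPoint` set
    params.fragmentShader = fsSpec;
    if (!params.valid())
        logger->log("Graphics pipeline creation parameters are invalid!",system::ILogger::ELL_ERROR);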
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_I_GPU_PIPELINE_H_INCLUDED_ +#define _NBL_VIDEO_I_GPU_PIPELINE_H_INCLUDED_ + +#include "nbl/video/IGPUPipelineLayout.h" +#include "nbl/video/SPipelineCreationParams.h" +#include "nbl/asset/ICPUPipeline.h" +#include "nbl/asset/IPipeline.h" + +namespace nbl::video +{ + +class IGPUPipelineBase { + public: + struct SShaderSpecInfo + { + + //! Structure specifying a specialization map entry + /* + Note that if specialization constant ID is used + in a shader, \bsize\b and \boffset'b must match + to \isuch an ID\i accordingly. + + By design the API satisfies: + https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-offset-00773 + https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-pMapEntries-00774 + */ + //!< The ID of the specialization constant in SPIR-V. If it isn't used in the shader, the map entry does not affect the behavior of the pipeline. + using spec_constant_id_t = uint32_t; + + using SSpecConstantValue = std::span; + + inline SSpecConstantValue getSpecializationByteValue(const spec_constant_id_t _specConstID) const + { + if (!entries) return {}; + + const auto found = entries->find(_specConstID); + if (found != entries->end() && found->second.size()) return found->second; + else return {}; + } + + static constexpr int32_t INVALID_SPEC_INFO = -1; + inline int32_t valid() const + { + if (!shader) return INVALID_SPEC_INFO; + + // Impossible to check: https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pName-00707 + if (entryPoint.empty()) return INVALID_SPEC_INFO; + + // Impossible to efficiently check anything from: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-maxClipDistances-00708 + // to: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-06686 + // and from: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02756 + // to: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-module-08987 + + int64_t specData = 0; + if (entries) + { + for (const auto& entry : *entries) + { + if (!entry.second.size()) + return INVALID_SPEC_INFO; + specData += entry.second.size(); + } + } + if (specData>0x7fffffff) + return INVALID_SPEC_INFO; + return static_cast(specData); + } + + inline bool accumulateSpecializationValidationResult(SSpecializationValidationResult* retval) const + { + const auto dataSize = valid(); + if (dataSize < 0) + return false; + if (dataSize == 0) + return true; + + const size_t count = entries ? entries->size() : 0x80000000ull; + if (count > 0x7fffffff) + return false; + *retval += { + .count = dataSize ? 
static_cast(count) : 0, + .dataSize = static_cast(dataSize), + }; + return *retval; + } + + const asset::IShader* shader = nullptr; + std::string_view entryPoint = ""; + + asset::IPipelineBase::SUBGROUP_SIZE requiredSubgroupSize = asset::IPipelineBase::SUBGROUP_SIZE::UNKNOWN; //!< Default value of 8 means no requirement + + // Container choice implicitly satisfies: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkSpecializationInfo.html#VUID-VkSpecializationInfo-constantID-04911 + using entry_map_t = core::unordered_map; + const entry_map_t* entries; + // By requiring Nabla Core Profile features we implicitly satisfy: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02784 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-flags-02785 + // Also because our API is sane, it satisfies the following by construction: + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 + + + static inline SShaderSpecInfo create(const asset::ICPUPipelineBase::SShaderSpecInfo& cpuSpecInfo, entry_map_t* outEntries) + { + SShaderSpecInfo specInfo; + specInfo.shader = cpuSpecInfo.shader.get(); + specInfo.entryPoint = cpuSpecInfo.entryPoint; + specInfo.requiredSubgroupSize = cpuSpecInfo.requiredSubgroupSize; + outEntries->clear(); + for (const auto&[key, value] : cpuSpecInfo.entries) + { + outEntries->insert({ key, { value.data(), value.size() } }); + } + specInfo.entries = outEntries; + return specInfo; + }; + }; + + using SShaderEntryMap = SShaderSpecInfo::entry_map_t; + +}; + +// Common Base class for pipelines +template + requires (std::is_base_of_v, PipelineNonBackendObjectBase> && !std::is_base_of_v) +class IGPUPipeline : public IBackendObject, public PipelineNonBackendObjectBase, public IGPUPipelineBase +{ + protected: + + template + explicit IGPUPipeline(core::smart_refctd_ptr&& device, Args&&... 
args) : + PipelineNonBackendObjectBase(std::forward(args)...), IBackendObject(std::move(device)) + {} + virtual ~IGPUPipeline() = default; + +}; + +} + +#endif diff --git a/include/nbl/video/IGPURayTracingPipeline.h b/include/nbl/video/IGPURayTracingPipeline.h index fb8c371193..690e6685d3 100644 --- a/include/nbl/video/IGPURayTracingPipeline.h +++ b/include/nbl/video/IGPURayTracingPipeline.h @@ -10,28 +10,43 @@ namespace nbl::video { -class IGPURayTracingPipeline : public IBackendObject, public asset::IRayTracingPipeline +class IGPURayTracingPipeline : public IGPUPipeline> { using pipeline_t = asset::IRayTracingPipeline; public: - - struct SShaderGroupHandle + struct SHitGroup { - private: - uint8_t data[video::SPhysicalDeviceLimits::ShaderGroupHandleSize]; + SShaderSpecInfo closestHit; + SShaderSpecInfo anyHit; + SShaderSpecInfo intersection; }; - static_assert(sizeof(SShaderGroupHandle) == video::SPhysicalDeviceLimits::ShaderGroupHandleSize); - struct SHitGroupStackSize + struct SCreationParams : public SPipelineCreationParams { - uint16_t closestHit; - uint16_t anyHit; - uint16_t intersection; - }; + using FLAGS = pipeline_t::FLAGS; - struct SCreationParams final : pipeline_t::SCreationParams, SPipelineCreationParams - { + struct SShaderGroupsParams + { + + SShaderSpecInfo raygen; + std::span misses; + std::span hits; + std::span callables; + + inline uint32_t getShaderGroupCount() const + { + return 1 + hits.size() + misses.size() + callables.size(); + } + + }; + + IGPUPipelineLayout* layout = nullptr; + SShaderGroupsParams shaderGroups; + + SCachedCreationParams cached = {}; + // TODO: Could guess the required flags from SPIR-V introspection of declared caps + core::bitflag flags = FLAGS::NONE; inline SSpecializationValidationResult valid() const { @@ -39,32 +54,104 @@ class IGPURayTracingPipeline : public IBackendObject, public asset::IRayTracingP return {}; SSpecializationValidationResult retval = { - .count=0, - .dataSize=0, + .count = 0, + .dataSize = 0, }; - const bool valid = pipeline_t::SCreationParams::impl_valid([&retval](const asset::IPipelineBase::SShaderSpecInfo& info)->bool + + if (!shaderGroups.raygen.accumulateSpecializationValidationResult(&retval)) + return {}; + + for (const auto& shaderGroup : shaderGroups.hits) { - const auto dataSize = info.valid(); - if (dataSize<0) - return false; - else if (dataSize==0) - return true; - - const size_t count = info.entries ? info.entries->size():0x80000000ull; - if (count>0x7fffffff) + if (shaderGroup.intersection.shader) + { + if (!shaderGroup.intersection.accumulateSpecializationValidationResult(&retval)) return {}; - retval += {.count=dataSize ? 
static_cast(count):0,.dataSize=static_cast(dataSize)}; - return retval; - }); - if (!valid) - return {}; + } + + if (shaderGroup.closestHit.shader) + { + if (!shaderGroup.closestHit.accumulateSpecializationValidationResult(&retval)) + return {}; + } + + // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03470 + if (flags.hasFlags(FLAGS::NO_NULL_ANY_HIT_SHADERS) && !shaderGroup.anyHit.shader) + return {}; + + if (shaderGroup.anyHit.shader) + { + if (!shaderGroup.anyHit.accumulateSpecializationValidationResult(&retval)) + return {}; + } + + // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-flags-03471 + if (flags.hasFlags(FLAGS::NO_NULL_CLOSEST_HIT_SHADERS) && !shaderGroup.intersection.shader) + return {}; + } + + for (const auto& miss : shaderGroups.misses) + { + if (miss.shader) + { + if (!miss.accumulateSpecializationValidationResult(&retval)) + return {}; + } + } + + for (const auto& callable : shaderGroups.callables) + { + if (callable.shader) + { + if (!callable.accumulateSpecializationValidationResult(&retval)) + return {}; + } + } + + if (!shaderGroups.raygen.shader) return {}; + return retval; } - inline std::span getShaders() const { return shaders; } + inline core::bitflag getRequiredSubgroupStages() const + { + core::bitflag stages = {}; + auto processSpecInfo = [&](const SShaderSpecInfo& spec, hlsl::ShaderStage stage) + { + if (spec.shader && spec.requiredSubgroupSize >= SUBGROUP_SIZE::REQUIRE_4) { + stages |= stage; + } + }; + processSpecInfo(shaderGroups.raygen, hlsl::ESS_RAYGEN); + for (const auto& miss : shaderGroups.misses) + processSpecInfo(miss, hlsl::ESS_MISS); + for (const auto& hit : shaderGroups.hits) + { + processSpecInfo(hit.closestHit, hlsl::ESS_CLOSEST_HIT); + processSpecInfo(hit.anyHit, hlsl::ESS_ANY_HIT); + processSpecInfo(hit.intersection, hlsl::ESS_INTERSECTION); + } + for (const auto& callable : shaderGroups.callables) + processSpecInfo(callable, hlsl::ESS_CALLABLE); + return stages; + } }; + struct SShaderGroupHandle + { + private: + uint8_t data[video::SPhysicalDeviceLimits::ShaderGroupHandleSize]; + }; + static_assert(sizeof(SShaderGroupHandle) == video::SPhysicalDeviceLimits::ShaderGroupHandleSize); + + struct SHitGroupStackSize + { + uint16_t closestHit; + uint16_t anyHit; + uint16_t intersection; + }; + inline core::bitflag getCreationFlags() const { return m_flags; } // Vulkan: const VkPipeline* @@ -82,8 +169,7 @@ class IGPURayTracingPipeline : public IBackendObject, public asset::IRayTracingP virtual uint16_t getDefaultStackSize() const = 0; protected: - IGPURayTracingPipeline(const SCreationParams& params) : IBackendObject(core::smart_refctd_ptr(params.layout->getOriginDevice())), - pipeline_t(params), + IGPURayTracingPipeline(const SCreationParams& params) : IGPUPipeline(core::smart_refctd_ptr(params.layout->getOriginDevice()), params.layout, params.cached), m_flags(params.flags) {} diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 49364f3a54..def3ee0979 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -3,7 +3,7 @@ #include "nbl/asset/asset.h" #include "nbl/asset/utils/ISPIRVOptimizer.h" -#include "nbl/asset/utils/ISPIRVDebloater.h" +#include "nbl/asset/utils/ISPIRVEntryPointTrimmer.h" #include "nbl/asset/utils/CCompilerSet.h" #include "nbl/video/SPhysicalDeviceFeatures.h" @@ -413,19 +413,20 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, 
public IDeviceMe }; // fun fact: you can use garbage/invalid pointers/offset for the Device/Host addresses of the per-geometry data, just make sure what was supposed to be null is null template requires nbl::is_any_of_v, - IGPUBottomLevelAccelerationStructure::Triangles, - IGPUBottomLevelAccelerationStructure::AABBs, - IGPUBottomLevelAccelerationStructure::AABBs + asset::IBottomLevelAccelerationStructure::Triangles, + asset::IBottomLevelAccelerationStructure::Triangles, + asset::IBottomLevelAccelerationStructure::AABBs, + asset::IBottomLevelAccelerationStructure::AABBs > inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes( - const core::bitflag flags, + const bool hostBuild, + const core::bitflag flags, const bool motionBlur, const std::span geometries, const uint32_t* const pMaxPrimitiveCounts ) const { - if (invalidFeaturesForASBuild(motionBlur)) + if (invalidFeaturesForASBuild(hostBuild,motionBlur)) { NBL_LOG_ERROR("Required features are not enabled"); return {}; @@ -456,6 +457,30 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe uint32_t primsFree = limits.maxAccelerationStructurePrimitiveCount; for (auto i=0u; igetBufferFormatUsages()[geom.vertexFormat].accelerationStructureVertex) + { + NBL_LOG_ERROR("Vertex Format %d not supported as Acceleration Structure Vertex Position Input on this Device",geom.vertexFormat); + return {}; + } + // TODO: do we check `maxVertex`, `vertexStride` and `indexType` for validity + } + if constexpr (Geometry::Type==asset::IBottomLevelAccelerationStructure::GeometryType::AABBs) + { + if (!flags.hasFlags(asset::IBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) + { + NBL_LOG_ERROR("Primitive type is AABB but build flag says BLAS build is not AABBs"); + return {}; + } + // TODO: check stride and geometry flags for validity + } if (pMaxPrimitiveCounts[i] > primsFree) { NBL_LOG_ERROR("Primitive count exceeds device limit"); @@ -464,16 +489,16 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe primsFree -= pMaxPrimitiveCounts[i]; } - return getAccelerationStructureBuildSizes_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes( const bool hostBuild, - const core::bitflag flags, + const core::bitflag flags, const bool motionBlur, const uint32_t maxInstanceCount ) const { - if (invalidFeaturesForASBuild(motionBlur)) + if (invalidFeaturesForASBuild(hostBuild,motionBlur)) { NBL_LOG_ERROR("Required features are not enabled"); return {}; @@ -497,7 +522,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } // little utility template - inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes(const core::bitflag flags, const bool motionBlur, const uint32_t maxInstanceCount) const + inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes(const core::bitflag flags, const bool motionBlur, const uint32_t maxInstanceCount) const { return getAccelerationStructureBuildSizes(std::is_same_v,asset::ICPUBuffer>,flags,motionBlur,maxInstanceCount); } @@ -568,12 +593,14 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe { auto tlas = set.first; // we know the build is completed immediately after performing it, so we get our pending stamp then - 
tlas->setTrackedBLASes(set.second.begin(),set.second.end(),tlas->registerNextBuildVer()); + // ideally we should get our build version when the work of the deferred op gets executed for the first time + const auto buildVer = tlas->pushTrackedBLASes({set.second.begin()},{set.second.end()}); + tlas->clearTrackedBLASes(buildVer); } } // the rawpointers are already smartpointers in whatever else the `fillTracking` declared above writes - core::unordered_map> m_TLASToBLASReferenceSets; + core::unordered_map>> m_TLASToBLASReferenceSets; } callback = {}; auto& tracking = deferredOperation->m_resourceTracking; @@ -585,10 +612,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe if constexpr (IsTLAS) { const auto blasCount = info.trackedBLASes.size(); - if (blasCount) - callback.m_TLASToBLASReferenceSets[info.dstAS] = {reinterpret_cast(oit-blasCount),blasCount}; - else - callback.m_TLASToBLASReferenceSets[info.dstAS] = {}; + callback.m_TLASToBLASReferenceSets[info.dstAS] = {oit-blasCount,blasCount}; } } if constexpr (IsTLAS) @@ -633,7 +657,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return writeAccelerationStructuresProperties_impl(accelerationStructures,type,data,stride); } // Host-side copy, DEFERRAL IS NOT OPTIONAL - inline bool copyAccelerationStructure(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::CopyInfo& copyInfo) + template requires std::is_base_of_v + inline bool copyAccelerationStructure(IDeferredOperation* const deferredOperation, const AccelerationStructure::CopyInfo& copyInfo) { if (!acquireDeferredOperation(deferredOperation)) { @@ -647,15 +672,48 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } auto result = copyAccelerationStructure_impl(deferredOperation,copyInfo); if (result==DEFERRABLE_RESULT::DEFERRED) + { deferredOperation->m_resourceTracking.insert(deferredOperation->m_resourceTracking.begin(),{ core::smart_refctd_ptr(copyInfo.src), core::smart_refctd_ptr(copyInfo.dst) }); + constexpr bool IsTLAS = std::is_same_v; + if constexpr (IsTLAS) + { + struct TLASCallback + { + // upon completion set the BLASes tracked + inline void operator()(IDeferredOperation*) const + { + // not sure if even legal, but it would deadlock us + if (src==dst) + return; + uint32_t buildVer; + { + // stop multiple threads messing with us + std::lock_guard lk(src->m_trackingLock); + // we know the build is completed immediately after performing it, so we get our pending stamp then + // ideally we should get the BLAS set from the Source TLAS when the work of the deferred op gets executed for the first time + const auto* pSrcBLASes = src->getPendingBuildTrackedBLASes(src->getPendingBuildVer()); + const std::span emptySpan = {}; + buildVer = pSrcBLASes ? 
dst->pushTrackedBLASes(pSrcBLASes->begin(),pSrcBLASes->end()):dst->pushTrackedBLASes(emptySpan.begin(),emptySpan.end()); + } + dst->clearTrackedBLASes(buildVer); + } + + // the rawpointers are already smartpointers in whatever else the `fillTracking` declared above writes + const IGPUTopLevelAccelerationStructure* src; + IGPUTopLevelAccelerationStructure* dst; + } callback = {.src=copyInfo.src,.dst=copyInfo.dst}; + deferredOperation->m_callback = std::move(callback); + } + } return result!=DEFERRABLE_RESULT::SOME_ERROR; } - inline bool copyAccelerationStructureToMemory(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyToMemoryInfo& copyInfo) + template requires std::is_base_of_v + inline bool copyAccelerationStructureToMemory(IDeferredOperation* const deferredOperation, const AccelerationStructure::HostCopyToMemoryInfo& copyInfo) { if (!acquireDeferredOperation(deferredOperation)) { @@ -674,13 +732,43 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } auto result = copyAccelerationStructureToMemory_impl(deferredOperation,copyInfo); if (result==DEFERRABLE_RESULT::DEFERRED) + { deferredOperation->m_resourceTracking.insert(deferredOperation->m_resourceTracking.begin(),{ core::smart_refctd_ptr(copyInfo.src), core::smart_refctd_ptr(copyInfo.dst.buffer) }); + constexpr bool IsTLAS = std::is_same_v; + if constexpr (IsTLAS) + { + struct TLASCallback + { + // upon completion set the BLASes tracked + inline void operator()(IDeferredOperation*) const + { + // stop multiple threads messing with us + std::lock_guard lk(src->m_trackingLock); + // we know the build is completed immediately after performing it, so we get our pending stamp then + // ideally we should get the BLAS set from the Source TLAS when the work of the deferred op gets executed for the first time + const auto ver = src->getPendingBuildVer(); + uint32_t count = dst->size(); + src->getPendingBuildTrackedBLASes(&count,dst->data(),ver); + if (count>dst->size()) + logger->log("BLAS output array too small, should be %d, only wrote out %d BLAS references to destination",system::ILogger::ELL_ERROR,count,dst->size()); + } + + // device keeps it alive for entire lifetime of the callback + system::ILogger* logger; + // the rawpointers are already smartpointers in whatever else the `fillTracking` declared above writes + const IGPUTopLevelAccelerationStructure* src; + core::smart_refctd_dynamic_array dst; + } callback = {.logger=m_logger.get(),.src=copyInfo.src,.dst=copyInfo.trackedBLASes}; + deferredOperation->m_callback = std::move(callback); + } + } return result!=DEFERRABLE_RESULT::SOME_ERROR; } - inline bool copyAccelerationStructureFromMemory(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyFromMemoryInfo& copyInfo) + template requires std::is_base_of_v + inline bool copyAccelerationStructureFromMemory(IDeferredOperation* const deferredOperation, const AccelerationStructure::HostCopyFromMemoryInfo& copyInfo) { if (!acquireDeferredOperation(deferredOperation)) { @@ -699,10 +787,32 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } auto result = copyAccelerationStructureFromMemory_impl(deferredOperation,copyInfo); if (result==DEFERRABLE_RESULT::DEFERRED) + { deferredOperation->m_resourceTracking.insert(deferredOperation->m_resourceTracking.begin(),{ core::smart_refctd_ptr(copyInfo.src.buffer), core::smart_refctd_ptr(copyInfo.dst) }); + constexpr bool IsTLAS = std::is_same_v; + if constexpr (IsTLAS) + { 
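// The block below appends the caller-supplied `trackedBLASes` to the deferred operation's
// resource-tracking array so those BLAS references stay alive until the host copy completes,
// then installs a completion callback that publishes the BLASes referenced by exactly that span
// as a fresh tracked set on the destination TLAS (pushTrackedBLASes) and erases the sets recorded
// for all older build versions (clearTrackedBLASes).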
+ const size_t offset = deferredOperation->m_resourceTracking.size(); + deferredOperation->m_resourceTracking.insert(deferredOperation->m_resourceTracking.end(),copyInfo.trackedBLASes.begin(),copyInfo.trackedBLASes.end()); + struct TLASCallback + { + // upon completion set the BLASes tracked + inline void operator()(IDeferredOperation*) const + { + const auto buildVer = dst->pushTrackedBLASes({src->begin()},{src->end()}); + dst->clearTrackedBLASes(buildVer); + } + + // the rawpointers are already smartpointers in whatever else the `fillTracking` declared above writes + std::span> src; + IGPUTopLevelAccelerationStructure* dst; + } callback = {.src={deferredOperation->m_resourceTracking.data()+offset,copyInfo.trackedBLASes.size()},.dst=copyInfo.dst}; + deferredOperation->m_callback = std::move(callback); + } + } return result!=DEFERRABLE_RESULT::SOME_ERROR; } @@ -725,8 +835,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe // Create a pipeline layout (@see ICPUPipelineLayout) core::smart_refctd_ptr createPipelineLayout( const std::span pcRanges={}, - core::smart_refctd_ptr&& _layout0=nullptr, core::smart_refctd_ptr&& _layout1=nullptr, - core::smart_refctd_ptr&& _layout2=nullptr, core::smart_refctd_ptr&& _layout3=nullptr + core::smart_refctd_ptr&& _layout0=nullptr, core::smart_refctd_ptr&& _layout1=nullptr, + core::smart_refctd_ptr&& _layout2=nullptr, core::smart_refctd_ptr&& _layout3=nullptr ) { if ((_layout0 && !_layout0->wasCreatedBy(this))) @@ -1020,20 +1130,20 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual core::smart_refctd_ptr createTopLevelAccelerationStructure_impl(IGPUTopLevelAccelerationStructure::SCreationParams&& params) = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const = 0; virtual AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( const bool hostBuild, const core::bitflag flags, @@ -1055,16 +1165,16 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe const IGPUTopLevelAccelerationStructure::BuildRangeInfo* const pBuildRangeInfos, const uint32_t 
totalGeometryCount ) = 0; virtual bool writeAccelerationStructuresProperties_impl(const std::span accelerationStructures, const IQueryPool::TYPE type, size_t* data, const size_t stride) = 0; - virtual DEFERRABLE_RESULT copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::CopyInfo& copyInfo) = 0; - virtual DEFERRABLE_RESULT copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyToMemoryInfo& copyInfo) = 0; - virtual DEFERRABLE_RESULT copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyFromMemoryInfo& copyInfo) = 0; + virtual DEFERRABLE_RESULT copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) = 0; + virtual DEFERRABLE_RESULT copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) = 0; + virtual DEFERRABLE_RESULT copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) = 0; constexpr static inline auto MaxStagesPerPipeline = 6u; virtual core::smart_refctd_ptr createDescriptorSetLayout_impl(const std::span bindings, const uint32_t maxSamplersCount) = 0; virtual core::smart_refctd_ptr createPipelineLayout_impl( const std::span pcRanges, - core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, - core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3 + core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, + core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3 ) = 0; virtual core::smart_refctd_ptr createDescriptorPool_impl(const IDescriptorPool::SCreateInfo& createInfo) = 0; @@ -1096,8 +1206,8 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual core::smart_refctd_ptr createRenderpass_impl(const IGPURenderpass::SCreationParams& params, IGPURenderpass::SCreationParamValidationResult&& validation) = 0; virtual core::smart_refctd_ptr createFramebuffer_impl(IGPUFramebuffer::SCreationParams&& params) = 0; - template - inline CreationParams::SSpecializationValidationResult commonCreatePipelines(IGPUPipelineCache* const pipelineCache, const std::span params, ExtraLambda&& extra) + template + inline SSpecializationValidationResult commonCreatePipelines(IGPUPipelineCache* const pipelineCache, const std::span params) { if (pipelineCache && !pipelineCache->wasCreatedBy(this)) { @@ -1110,7 +1220,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return {}; } - typename CreationParams::SSpecializationValidationResult retval = {.count=0,.dataSize=0}; + SSpecializationValidationResult retval = {.count=0,.dataSize=0}; for (auto i=0; i createInfos, core::smart_refctd_ptr* const output, - const IGPUComputePipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) = 0; virtual void createGraphicsPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output, - const IGPUGraphicsPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) = 0; virtual void createRayTracingPipelines_impl( 
IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPURayTracingPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) = 0; virtual core::smart_refctd_ptr createQueryPool_impl(const IQueryPool::SCreationParams& params) = 0; @@ -1262,7 +1315,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe uint16_t firstQueueIndex = 0u; }; const std::array m_queueFamilyInfos; - core::smart_refctd_ptr m_spirvDebloater; + core::smart_refctd_ptr m_spirvTrimmer; private: const SPhysicalDeviceLimits& getPhysicalDeviceLimits() const; @@ -1340,8 +1393,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe } return false; } - template - bool invalidFeaturesForASBuild(const bool motionBlur) const + bool invalidFeaturesForASBuild(const bool hostBuild, const bool motionBlur) const { // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkGetAccelerationStructureBuildSizesKHR-accelerationStructure-08933 if (!m_enabledFeatures.accelerationStructure) @@ -1350,7 +1402,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe return true; } // not sure of VUID - if (std::is_same_v && !m_enabledFeatures.accelerationStructureHostCommands) + if (hostBuild && !m_enabledFeatures.accelerationStructureHostCommands) { NBL_LOG_ERROR("Feature `acceleration structure` host commands is not enabled"); return true; @@ -1535,7 +1587,7 @@ inline bool ILogicalDevice::validateMemoryBarrier(const uint32_t queueFamilyInde return false; }; // CANNOT CHECK: https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-VkImageMemoryBarrier2-oldLayout-01197 - if (mismatchedLayout.operator()(barrier.oldLayout) || mismatchedLayout.operator()(barrier.newLayout)) + if (mismatchedLayout.template operator()(barrier.oldLayout) || mismatchedLayout.template operator()(barrier.newLayout)) return false; } diff --git a/include/nbl/video/IQueue.h b/include/nbl/video/IQueue.h index 28336b15cc..63073beb33 100644 --- a/include/nbl/video/IQueue.h +++ b/include/nbl/video/IQueue.h @@ -125,12 +125,7 @@ class IQueue : public core::Interface, public core::Unmovable class DeferredSubmitCallback final { // - struct STLASBuildMetadata - { - core::unordered_set m_BLASes; - uint32_t m_buildVer; - }; - core::unordered_map m_TLASToBLASReferenceSets; + core::unordered_map m_TLASOverwrites; // using smart_ptr = core::smart_refctd_ptr; core::smart_refctd_dynamic_array m_resources; diff --git a/include/nbl/video/ISwapchain.h b/include/nbl/video/ISwapchain.h index d052a819bd..882ac16648 100644 --- a/include/nbl/video/ISwapchain.h +++ b/include/nbl/video/ISwapchain.h @@ -454,21 +454,22 @@ class ISwapchain : public IBackendObject { return params.deduce(getOriginDevice()->getPhysicalDevice(),m_params.surface.get(),{&m_params.sharedParams.presentMode.value,1},{&m_params.sharedParams.compositeAlpha.value,1},{&m_params.sharedParams.preTransform.value,1}); } - inline core::smart_refctd_ptr recreate(SSharedCreationParams params={}) + inline core::smart_refctd_ptr recreate(SSharedCreationParams params) { if (!deduceRecreationParams(params)) return nullptr; return recreate_impl(std::move(params)); } + inline core::smart_refctd_ptr recreate() { return recreate({}); } // Vulkan: const VkSwapchainKHR* virtual const void* getNativeHandle() const = 0; // returns the maximum number of time acquires with infinite timeout 
which can be called before releasing the image index through present. - virtual uint8_t getMaxBlockingAcquiresBeforePresent() const = 0u; + virtual uint8_t getMaxBlockingAcquiresBeforePresent() const = 0; // returns the maximum number of acquires you can request without waiting for previous acquire semaphores to signal. - virtual uint8_t getMaxAcquiresInFlight() const = 0u; + virtual uint8_t getMaxAcquiresInFlight() const = 0; // only public because MultiTimelineEventHandlerST needs to know about it class DeferredFrameSemaphoreDrop final diff --git a/include/nbl/video/SPipelineCreationParams.h b/include/nbl/video/SPipelineCreationParams.h index 489bff4343..3a25560ae4 100644 --- a/include/nbl/video/SPipelineCreationParams.h +++ b/include/nbl/video/SPipelineCreationParams.h @@ -11,6 +11,31 @@ namespace nbl::video { +struct SSpecializationValidationResult +{ + constexpr static inline uint32_t Invalid = ~0u; + inline operator bool() const + { + return count!=Invalid && dataSize!=Invalid; + } + + inline SSpecializationValidationResult& operator+=(const SSpecializationValidationResult& other) + { + // TODO: check for overflow before adding + if (*this && other) + { + count += other.count; + dataSize += other.dataSize; + } + else + *this = {}; + return *this; + } + + uint32_t count = Invalid; + uint32_t dataSize = Invalid; +}; + // For now, due to API design we implicitly satisfy: // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-08771 // to: @@ -18,30 +43,6 @@ namespace nbl::video template struct SPipelineCreationParams { - struct SSpecializationValidationResult - { - constexpr static inline uint32_t Invalid = ~0u; - inline operator bool() const - { - return count!=Invalid && dataSize!=Invalid; - } - - inline SSpecializationValidationResult& operator+=(const SSpecializationValidationResult& other) - { - // TODO: check for overflow before adding - if (*this && other) - { - count += other.count; - dataSize += other.dataSize; - } - else - *this = {}; - return *this; - } - - uint32_t count = Invalid; - uint32_t dataSize = Invalid; - }; constexpr static inline int32_t NotDerivingFromPreviousPipeline = -1; inline bool isDerivative() const diff --git a/include/nbl/video/TimelineEventHandlers.h b/include/nbl/video/TimelineEventHandlers.h index 9405accf78..a3d6aa4c8b 100644 --- a/include/nbl/video/TimelineEventHandlers.h +++ b/include/nbl/video/TimelineEventHandlers.h @@ -410,7 +410,7 @@ class MultiTimelineEventHandlerST final : core::Unmovable, core::Uncopyable sum += handler->count(); else { - const auto local = handler->poll_impl(std::forward(args)...); + const auto local = handler->template poll_impl(std::forward(args)...); bailed = local.bailed; // if don't have any events left, remove the timeline if (local.eventsLeft) diff --git a/include/nbl/video/asset_traits.h b/include/nbl/video/asset_traits.h index ee7d068ef3..5b085b2d3b 100644 --- a/include/nbl/video/asset_traits.h +++ b/include/nbl/video/asset_traits.h @@ -193,7 +193,7 @@ struct asset_traits // the asset type using asset_t = asset::ICPUBottomLevelAccelerationStructure; // we don't need to descend during DFS into other assets - constexpr static inline bool HasChildren = true; + constexpr static inline bool HasChildren = false; // the video type using video_t = IGPUBottomLevelAccelerationStructure; // lookup type diff --git a/include/nbl/video/declarations.h b/include/nbl/video/declarations.h index ecec442366..2fdfe28e3c 100644 --- 
a/include/nbl/video/declarations.h +++ b/include/nbl/video/declarations.h @@ -34,7 +34,6 @@ #include "nbl/video/utilities/CDrawIndirectAllocator.h" #include "nbl/video/utilities/CSubpassKiln.h" #include "nbl/video/utilities/IUtilities.h" -#include "nbl/video/utilities/IGPUObjectFromAssetConverter.h" #include "nbl/video/utilities/SPhysicalDeviceFilter.h" #include "nbl/video/utilities/CSimpleResizeSurface.h" #include "nbl/video/utilities/CSmoothResizeSurface.h" diff --git a/include/nbl/video/utilities/CAssetConverter.h b/include/nbl/video/utilities/CAssetConverter.h index db61ee7857..935b79b1e5 100644 --- a/include/nbl/video/utilities/CAssetConverter.h +++ b/include/nbl/video/utilities/CAssetConverter.h @@ -159,7 +159,7 @@ class CAssetConverter : public core::IReferenceCounted //! select build flags uint8_t allowUpdate : 1 = false; uint8_t allowCompaction : 1 = false; - BuildPreference preference : 2 = BuildPreference::Invalid; + BuildPreference preference : 2 = BuildPreference::None; uint8_t lowMemory : 1 = false; //! things that control the build uint8_t hostBuild : 1 = false; // DO NOT USE, will get overriden to false anyway @@ -171,7 +171,7 @@ class CAssetConverter : public core::IReferenceCounted template std::pair combine_impl(const CRTP& _this, const CRTP& other) const { - if (_this.preference!=other.preference || _this.preference==BuildPreference::Invalid) + if (_this.preference!=other.preference && _this.preference!=BuildPreference::None && other.preference!=BuildPreference::None) return {false,_this}; CRTP retval = _this; retval.isMotion |= other.isMotion; @@ -887,6 +887,9 @@ class CAssetConverter : public core::IReferenceCounted IGPUPipelineCache* pipelineCache = nullptr; // optional, defaults to the device IDeviceMemoryAllocator* allocator = nullptr; + // optional, defaults to worst case (Apple Silicon page size) + uint32_t scratchForDeviceASBuildMinAllocSize = 1<<14; + uint32_t scratchForHostASBuildMinAllocSize = 1<<14; }; // Split off from inputs because only assets that build on IPreHashed need uploading struct SConvertParams @@ -943,7 +946,8 @@ class CAssetConverter : public core::IReferenceCounted uint32_t sampledImageBindingCount = 1<<10; uint32_t storageImageBindingCount = 11<<10; // specific to Acceleration Structure Build, they need to be at least as large as the largest amount of scratch required for an AS build - CAsyncSingleBufferSubAllocatorST>* scratchForDeviceASBuild = nullptr; + using scratch_for_device_AS_build_t = CAsyncSingleBufferSubAllocatorST>; + scratch_for_device_AS_build_t* scratchForDeviceASBuild = nullptr; std::pmr::memory_resource* scratchForHostASBuild = nullptr; // needs to service allocations without limit, unlike the above where failure will just force a flush and performance of already queued up builds IDeviceMemoryAllocator* compactedASAllocator = nullptr; @@ -957,7 +961,14 @@ class CAssetConverter : public core::IReferenceCounted public: template - using staging_cache_t = core::unordered_map::video_t*,typename CCache::key_t>; + struct staging_cache_key + { + core::smart_refctd_ptr::video_t> gpuRef; + typename CCache::key_t cacheKey; + }; + // it may seem weird storing both a smart pointer and a raw pointer, but the reason is to be able to drop a refcount while not loosing the key for lookup + template + using staging_cache_t = core::unordered_map::video_t*,staging_cache_key>; inline SReserveResult(SReserveResult&&) = default; inline SReserveResult(const SReserveResult&) = delete; @@ -987,7 +998,12 @@ class CAssetConverter : public 
core::IReferenceCounted assert(m_minASBuildScratchSize[forHostOps]<=m_maxASBuildScratchSize[forHostOps]); return m_maxASBuildScratchSize[forHostOps]; } -// TODO: `getMinCompactedASAllocatorSpace` + // We do all compactions on the Device for simplicity + inline uint64_t getMinCompactedASAllocatorSpace() const + { + assert(m_compactedASMaxMemory == 0 || willDeviceASBuild() || willHostASBuild()); + return m_compactedASMaxMemory; + } // tells you if you need to provide a valid `SConvertParams::scratchForDeviceASBuild` inline bool willDeviceASBuild() const {return getMinASBuildScratchSize(false)>0;} // tells you if you need to provide a valid `SConvertParams::scratchForHostASBuild` @@ -1000,8 +1016,7 @@ class CAssetConverter : public core::IReferenceCounted // tells you if you need to provide a valid `SConvertParams::compactedASAllocator` inline bool willCompactAS() const { - assert(!m_willCompactSomeAS || willDeviceASBuild() || willHostASBuild()); - return m_willCompactSomeAS; + return getMinCompactedASAllocatorSpace()!=0; } // @@ -1044,21 +1059,10 @@ class CAssetConverter : public core::IReferenceCounted return enqueueSuccess; } - // public only because `GetDependantVisit` needs it - struct SDeferredTLASWrite - { - inline bool operator==(const SDeferredTLASWrite& other) const - { - return dstSet == other.dstSet && binding == other.binding && arrayElement == other.arrayElement; - } - - IGPUDescriptorSet* dstSet; - uint32_t binding; - uint32_t arrayElement; - core::smart_refctd_ptr tlas; - }; private: friend class CAssetConverter; + // internal classes + template friend class GetDependantVisit; inline SReserveResult() = default; @@ -1078,69 +1082,68 @@ class CAssetConverter : public core::IReferenceCounted core::vector> m_shaders; // need a more explicit list of GPU objects that need device-assisted conversion - template - struct SConversionRequestBase - { - // canonical asset (the one that provides content) - core::smart_refctd_ptr canonical; - // gpu object to transfer canonical's data to or build it from - asset_traits::video_t* gpuObj; - }; - using SConvReqBuffer = SConversionRequestBase; - core::vector m_bufferConversions; - struct SConvReqImage : SConversionRequestBase + core::unordered_map> m_bufferConversions; + struct SConvReqImage { + core::smart_refctd_ptr canonical = nullptr; uint16_t recomputeMips = 0; }; - core::vector m_imageConversions; + core::unordered_map m_imageConversions; template - struct SConvReqAccelerationStructure : SConversionRequestBase + struct SConvReqAccelerationStructure { - constexpr static inline uint64_t WontCompact = (0x1ull<<48)-1; - inline bool compact() const {return compactedASWriteOffset!=WontCompact;} - using build_f = typename asset_traits::video_t::BUILD_FLAGS; inline void setBuildFlags(const build_f _flags) {buildFlags = static_cast(_flags);} inline build_f getBuildFlags() const {return static_cast(buildFlags);} - - uint64_t scratchSize; - uint64_t compactedASWriteOffset : 48 = WontCompact; - uint64_t buildFlags : 16 = static_cast(build_f::NONE); + core::smart_refctd_ptr canonical = nullptr; + uint64_t scratchSize : 47 = 0; + uint64_t buildFlags : 16 = 0; + uint64_t compact : 1; + // scratch + input size also accounting for worst case padding due to alignment + uint64_t buildSize; }; - using SConvReqBLAS = SConvReqAccelerationStructure; - core::vector m_blasConversions[2]; - using SConvReqTLAS = SConvReqAccelerationStructure; - core::vector m_tlasConversions[2]; + using SConvReqBLASMap = core::unordered_map>; + SConvReqBLASMap m_blasConversions[2]; 
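	// A minimal usage sketch for the scratch/compaction queries declared above, assuming `reservation` is the
	// SReserveResult returned by CAssetConverter::reserve and `deviceScratchAllocator` is an already created
	// SConvertParams::scratch_for_device_AS_build_t* (both names are illustrative, not part of this header):
	//   CAssetConverter::SConvertParams params = {};
	//   if (reservation.willDeviceASBuild())
	//       params.scratchForDeviceASBuild = deviceScratchAllocator; // its buffer must cover at least getMinASBuildScratchSize(false)
	//   if (reservation.willCompactAS())
	//   {
	//       // all compactions run on the device, so budget at least getMinCompactedASAllocatorSpace() bytes
	//       // of device memory for `params.compactedASAllocator`
	//   }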
+ struct SConvReqTLAS : SConvReqAccelerationStructure + { + // This tracks non-root BLASes which are needed for a subsequent TLAS build. + // Because the copy group ID of the BLAS can only depend on the copy group and pointer of the TLAS and BLAS, + // we can be sure that all instances of the same BLAS within a TLAS will have the same copy group ID and use a map instead of a vector for storage + // Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes prematurely. + using cpu_to_gpu_blas_map_t = core::unordered_map>; + cpu_to_gpu_blas_map_t instanceMap; + }; + using SConvReqTLASMap = core::unordered_map; + SConvReqTLASMap m_tlasConversions[2]; - // 0 for device builds, 1 for host builds + // array index 0 for device builds, 1 for host builds uint64_t m_minASBuildScratchSize[2] = {0,0}; uint64_t m_maxASBuildScratchSize[2] = {0,0}; -// TODO: make the compaction count the size - // We do all compactions on the Device for simplicity - uint8_t m_willCompactSomeAS : 1 = false; - // This tracks non-root BLASes which are needed for a subsequent TLAS build. Note that even things which are NOT in the staging cache are tracked here to make sure they don't finish their lifetimes early. - struct BLASUsedInTLASBuild + uint64_t m_compactedASMaxMemory = 0; + // + struct SDeferredTLASWrite { - // This is the BLAS meant to be used for the instance, note that compaction of a BLAS overwrites the initial values at the end of `reserve` - core::smart_refctd_ptr gpuBLAS; - uint64_t buildDuringConvertCall : 1 = false; - // internal micro-refcount which lets us know when we should remove the entry from the map below - uint64_t remainingUsages : 63 = 0; + inline bool operator==(const SDeferredTLASWrite& other) const + { + return dstSet==other.dstSet && storageOffset.data==other.storageOffset.data; + } + + IGPUDescriptorSet* dstSet; + // binding and array element rolled up into one + IGPUDescriptorSetLayout::CBindingRedirect::storage_offset_t storageOffset; }; - using cpu_to_gpu_blas_map_t = core::unordered_map; - cpu_to_gpu_blas_map_t m_blasBuildMap; struct SDeferredTLASWriteHasher { inline size_t operator()(const SDeferredTLASWrite& write) const { - size_t retval = std::bit_cast(write.dstSet); - core::hash_combine(retval,write.binding); - core::hash_combine(retval,write.arrayElement); + size_t retval = write.storageOffset.data; + core::hash_combine(retval,write.dstSet); return retval; } }; - core::unordered_set m_deferredTLASDescriptorWrites; + using compacted_tlas_rewrite_set_t = core::unordered_set; + compacted_tlas_rewrite_set_t m_potentialTLASRewrites; // core::bitflag m_queueFlags = IQueue::FAMILY_FLAGS::NONE; diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h index 9a02915187..66f6871dc6 100644 --- a/include/nbl/video/utilities/CComputeBlit.h +++ b/include/nbl/video/utilities/CComputeBlit.h @@ -67,7 +67,7 @@ class CComputeBlit : public core::IReferenceCounted // required CAssetConverter* converter; // in theory we _could_ accept either pipeline layout type (or just the base) and make the CPU one back from the GPU - const asset::ICPUPipelineLayout* layout; + asset::ICPUPipelineLayout* layout; // must be Uniform Texel Buffer descriptor type hlsl::SBindingInfo kernelWeights; // must be Sampled Image descriptor type diff --git a/include/nbl/video/utilities/CSubpassKiln.h b/include/nbl/video/utilities/CSubpassKiln.h index 7df6cc0caa..c41ec3dd7e 100644 --- a/include/nbl/video/utilities/CSubpassKiln.h 
+++ b/include/nbl/video/utilities/CSubpassKiln.h @@ -198,7 +198,7 @@ class CSubpassKiln if (begin==end) return; - bake_impl(cmdbuf->getOriginDevice()->getPhysicalDevice()->getLimits().indirectDrawCount, drawIndirectBuffer, drawCountBuffer)(cmdbuf, begin, end); + bake_impl(cmdbuf->getOriginDevice()->getPhysicalDevice()->getLimits().drawIndirectCount, drawIndirectBuffer, drawCountBuffer)(cmdbuf, begin, end); } protected: diff --git a/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h b/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h deleted file mode 100644 index 600197611b..0000000000 --- a/include/nbl/video/utilities/IGPUObjectFromAssetConverter.h +++ /dev/null @@ -1,331 +0,0 @@ -// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_VIDEO_I_GPU_OBJECT_FROM_ASSET_CONVERTER_H_INCLUDED_ -#define _NBL_VIDEO_I_GPU_OBJECT_FROM_ASSET_CONVERTER_H_INCLUDED_ - -#include "nbl/core/declarations.h" -#include "nbl/core/alloc/LinearAddressAllocator.h" - -#include "nbl/video/ISemaphore.h" -#include "nbl/video/ILogicalDevice.h" - -#if 0 -auto IGPUObjectFromAssetConverter::create(const asset::ICPUAccelerationStructure** _begin, const asset::ICPUAccelerationStructure** _end, SParams& _params) -> created_gpu_object_array -{ - const size_t assetCount = std::distance(_begin, _end); - auto res = core::make_refctd_dynamic_array >(assetCount); - auto toCreateAndBuild = std::vector(); - auto buildRangeInfos = std::vector(); - toCreateAndBuild.reserve(assetCount); - buildRangeInfos.reserve(assetCount); - // Lambda function: creates the acceleration structure and It's buffer - auto allocateBufferAndCreateAccelerationStructure = [&](size_t asSize, const asset::ICPUAccelerationStructure* cpuas) - { - // Create buffer with cpuas->getAccelerationStructureSize - IGPUBuffer::SCreationParams gpuBufParams = {}; - gpuBufParams.size = asSize; - gpuBufParams.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT; - auto gpubuf = _params.device->createBuffer(std::move(gpuBufParams)); - auto mreqs = gpubuf->getMemoryReqs(); - mreqs.memoryTypeBits &= _params.device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto gpubufMem = _params.device->allocate(mreqs, gpubuf.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - assert(gpubufMem.isValid()); - - // Create GPUAccelerationStructure with that buffer - IGPUAccelerationStructure::SCreationParams creatationParams = {}; - creatationParams.bufferRange.buffer = gpubuf; - creatationParams.bufferRange.offset = 0; - creatationParams.bufferRange.size = asSize; - creatationParams.flags = cpuas->getCreationParameters().flags; - creatationParams.type = cpuas->getCreationParameters().type; - return _params.device->createAccelerationStructure(std::move(creatationParams)); - }; - - for (ptrdiff_t i = 0u; i < assetCount; ++i) - { - const asset::ICPUAccelerationStructure* cpuas = _begin[i]; - - if(cpuas->hasBuildInfo()) - { - // Add to toBuild vector of ICPUAccelerationStructure - toCreateAndBuild.push_back(cpuas); - buildRangeInfos.push_back(const_cast(cpuas->getBuildRanges().begin())); - } - else if(cpuas->getAccelerationStructureSize() > 0) - { - res->operator[](i) = allocateBufferAndCreateAccelerationStructure(cpuas->getAccelerationStructureSize(), cpuas); - } - } - - if(toCreateAndBuild.empty() == false) - { - bool hostBuildCommands = false; 
// get from SFeatures - if(hostBuildCommands) - { - _NBL_TODO(); - } - else - { - core::vector cpuBufferDeps; - constexpr uint32_t MaxGeometryPerBuildInfo = 16; - constexpr uint32_t MaxBuffersPerGeometry = 3; // TrianglesData -> vertex+index+transformation - cpuBufferDeps.reserve(assetCount * MaxGeometryPerBuildInfo * MaxBuffersPerGeometry); - - // Get CPUBuffer Dependencies - for (ptrdiff_t i = 0u; i < toCreateAndBuild.size(); ++i) - { - const asset::ICPUAccelerationStructure* cpuas = toCreateAndBuild[i]; - - auto buildInfo = cpuas->getBuildInfo(); - assert(buildInfo != nullptr); - - auto geoms = buildInfo->getGeometries().begin(); - auto geomsCount = buildInfo->getGeometries().size(); - if(geomsCount == 0) - { - assert(false); - continue; - } - - for(uint32_t g = 0; g < geomsCount; ++g) - { - const auto& geom = geoms[g]; - if(geom.type == asset::IAccelerationStructure::EGT_TRIANGLES) - { - if(geom.data.triangles.indexData.isValid()) - { - auto cpuBuf = geom.data.triangles.indexData.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - if(geom.data.triangles.vertexData.isValid()) - { - auto cpuBuf = geom.data.triangles.vertexData.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - if(geom.data.triangles.transformData.isValid()) - { - auto cpuBuf = geom.data.triangles.transformData.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - } - else if(geom.type == asset::IAccelerationStructure::EGT_AABBS) - { - if(geom.data.aabbs.data.isValid()) - { - auto cpuBuf = geom.data.aabbs.data.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - } - else if(geom.type == asset::IAccelerationStructure::EGT_INSTANCES) - { - if(geom.data.instances.data.isValid()) - { - auto cpuBuf = geom.data.instances.data.buffer.get(); - cpuBuf->addUsageFlags(core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT); - cpuBufferDeps.push_back(cpuBuf); - } - } - } - } - - // Convert CPUBuffer Deps to GPUBuffers - core::vector redirs = eliminateDuplicatesAndGenRedirs(cpuBufferDeps); - auto gpuBufs = getGPUObjectsFromAssets(cpuBufferDeps.data(), cpuBufferDeps.data()+cpuBufferDeps.size(), _params); - _params.waitForCreationToComplete(); - _params.beginCommandBuffers(); - size_t bufIter = 0ull; - - // Fill buildGeomInfos partially (to later ge Get AS Size before build command) - std::vector buildGeomInfos(toCreateAndBuild.size()); - - using GPUGeometry = IGPUAccelerationStructure::Geometry; - std::vector gpuGeoms; - gpuGeoms.reserve(assetCount * MaxGeometryPerBuildInfo); - - for (ptrdiff_t i = 0u; i < toCreateAndBuild.size(); ++i) - { - const asset::ICPUAccelerationStructure* cpuas = toCreateAndBuild[i]; - - auto cpuBuildInfo = cpuas->getBuildInfo(); - auto & gpuBuildInfo = buildGeomInfos[i]; - - gpuBuildInfo.type = cpuBuildInfo->type; - gpuBuildInfo.buildFlags = cpuBuildInfo->buildFlags; - gpuBuildInfo.buildMode = 
cpuBuildInfo->buildMode; - assert(cpuBuildInfo->buildMode == asset::IAccelerationStructure::EBM_BUILD); - - // Fill Later: - gpuBuildInfo.srcAS = nullptr; - gpuBuildInfo.dstAS = nullptr; - gpuBuildInfo.scratchAddr = {}; - - auto cpu_geoms = cpuBuildInfo->getGeometries().begin(); - auto geomsCount = cpuBuildInfo->getGeometries().size(); - if(geomsCount == 0) - { - assert(false); - continue; - } - - size_t startGeom = gpuGeoms.size(); - size_t endGeom = gpuGeoms.size() + geomsCount; - - for(uint32_t g = 0; g < geomsCount; ++g) - { - const auto& cpu_geom = cpu_geoms[g]; - - GPUGeometry gpu_geom = {}; - gpu_geom.type = cpu_geom.type; - gpu_geom.flags = cpu_geom.flags; - - if(cpu_geom.type == asset::IAccelerationStructure::EGT_TRIANGLES) - { - gpu_geom.data.triangles.vertexFormat = cpu_geom.data.triangles.vertexFormat; - gpu_geom.data.triangles.vertexStride = cpu_geom.data.triangles.vertexStride; - gpu_geom.data.triangles.maxVertex = cpu_geom.data.triangles.maxVertex; - gpu_geom.data.triangles.indexType = cpu_geom.data.triangles.indexType; - - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.triangles.indexData.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.triangles.indexData.offset = gpubuf->getOffset() + cpu_geom.data.triangles.indexData.offset; - } - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.triangles.vertexData.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.triangles.vertexData.offset = gpubuf->getOffset() + cpu_geom.data.triangles.vertexData.offset; - } - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.triangles.transformData.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.triangles.transformData.offset = gpubuf->getOffset() + cpu_geom.data.triangles.transformData.offset; - } - } - else if(cpu_geom.type == asset::IAccelerationStructure::EGT_AABBS) - { - gpu_geom.data.aabbs.stride = cpu_geom.data.aabbs.stride; - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.aabbs.data.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.aabbs.data.offset = gpubuf->getOffset() + cpu_geom.data.aabbs.data.offset; - } - } - else if(cpu_geom.type == asset::IAccelerationStructure::EGT_INSTANCES) - { - { - IGPUOffsetBufferPair* gpubuf = (*gpuBufs)[redirs[bufIter++]].get(); - gpu_geom.data.instances.data.buffer = core::smart_refctd_ptr(gpubuf->getBuffer()); - gpu_geom.data.instances.data.offset = gpubuf->getOffset() + cpu_geom.data.instances.data.offset; - } - } - - gpuGeoms.push_back(gpu_geom); - } - - gpuBuildInfo.geometries = core::SRange(gpuGeoms.data() + startGeom, gpuGeoms.data() + endGeom); - } - - // Get SizeInfo for each CPUAS -> Create the AS -> Get Total Scratch Buffer Size - std::vector buildSizes(toCreateAndBuild.size()); - uint64_t totalScratchBufferSize = 0ull; - uint64_t maxScratchBufferSize = 0ull; - for (ptrdiff_t i = 0u, toBuildIndex = 0u; i < assetCount; ++i) - { - const asset::ICPUAccelerationStructure* cpuas = _begin[i]; - if(cpuas->hasBuildInfo() == false) - { - // Only those with buildInfo (index in toCreateAndBuild vector) will get passed - continue; - } - - assert(cpuas == toCreateAndBuild[toBuildIndex]); - assert(toBuildIndex < toCreateAndBuild.size()); - - auto buildRanges = cpuas->getBuildRanges().begin(); - auto buildRangesCount = cpuas->getBuildRanges().size(); - - auto & gpuBuildInfo = buildGeomInfos[toBuildIndex]; - - std::vector 
maxPrimCount(buildRangesCount); - for(auto b = 0; b < buildRangesCount; b++) - maxPrimCount[b] = buildRanges[b].primitiveCount; - - auto buildSize = _params.device->getAccelerationStructureBuildSizes(gpuBuildInfo, maxPrimCount.data()); - buildSizes[i] = buildSize; - - auto gpuAS = allocateBufferAndCreateAccelerationStructure(buildSize.accelerationStructureSize, cpuas); - res->operator[](i) = gpuAS; - - // complete the buildGeomInfos (now only thing left is to allocate and set scratchAddr.buffer) - buildGeomInfos[toBuildIndex].dstAS = gpuAS.get(); - buildGeomInfos[toBuildIndex].scratchAddr.offset = totalScratchBufferSize; - - totalScratchBufferSize += buildSize.buildScratchSize; - core::max(maxScratchBufferSize, buildSize.buildScratchSize); // maxScratchBufferSize has no use now (unless we changed this function to build 1 by 1 instead of batch builds or have some kind of memory limit?) - ++toBuildIndex; - } - - // Allocate Scratch Buffer - IGPUBuffer::SCreationParams gpuScratchBufParams = {}; - gpuScratchBufParams.size = totalScratchBufferSize; - gpuScratchBufParams.usage = core::bitflag(asset::IBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT) | asset::IBuffer::EUF_STORAGE_BUFFER_BIT; - auto gpuScratchBuf = _params.device->createBuffer(std::move(gpuScratchBufParams)); - auto mreqs = gpuScratchBuf->getMemoryReqs(); - mreqs.memoryTypeBits &= _params.device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits(); - auto gpuScratchBufMem = _params.device->allocate(mreqs, gpuScratchBuf.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - - - for (ptrdiff_t i = 0u; i < toCreateAndBuild.size(); ++i) - { - auto & gpuBuildInfo = buildGeomInfos[i]; - gpuBuildInfo.scratchAddr.buffer = gpuScratchBuf; - } - - // Record CommandBuffer for Building (We have Completed buildInfos + buildRanges for each CPUAS) - auto & fence = _params.fences[EQU_COMPUTE]; - fence = _params.device->createFence(static_cast(0)); - core::smart_refctd_ptr cmdbuf = _params.perQueue[EQU_COMPUTE].cmdbuf; - - IQueue::SSubmitInfo submit; - { - submit.commandBufferCount = 1u; - submit.commandBuffers = &cmdbuf.get(); - submit.waitSemaphoreCount = 0u; - submit.pWaitDstStageMask = nullptr; - submit.pWaitSemaphores = nullptr; - uint32_t waitSemaphoreCount = 0u; - } - - assert(cmdbuf->getState() == IGPUCommandBuffer::STATE::RECORDING); - cmdbuf->buildAccelerationStructures({buildGeomInfos.data(),buildGeomInfos.data()+buildGeomInfos.size()},buildRangeInfos.data()); - cmdbuf->end(); - - // TODO for future to make this function more sophisticated: Compaction, MemoryLimit for Build - - core::smart_refctd_ptr sem; - - if (_params.perQueue[EQU_COMPUTE].semaphore) - sem = _params.device->createSemaphore(); - - auto* sem_ptr = sem.get(); - auto* fence_ptr = fence.get(); - - submit.signalSemaphoreCount = sem_ptr?1u:0u; - submit.pSignalSemaphores = sem_ptr?&sem_ptr:nullptr; - - _params.perQueue[EQU_COMPUTE].queue->submit(1u, &submit, fence_ptr); - if (_params.perQueue[EQU_COMPUTE].semaphore) - _params.perQueue[EQU_COMPUTE].semaphore[0] = std::move(sem); - } - } - - return res; -} -#endif - -#endif diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index 09877b0d8f..00776ba01d 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -436,6 +436,18 @@ class NBL_API2 IUtilities : public core::IReferenceCounted return updateBufferRangeViaStagingBuffer(nextSubmit,bufferRange,callback); } + // + class CMemcpyUpstreamingDataProducer final : public IUpstreamingDataProducer 
+ { + public: + inline uint32_t operator()(void* dst, const size_t offsetInRange, const uint32_t blockSize) override + { + memcpy(dst,reinterpret_cast(data)+offsetInRange,blockSize); + return blockSize; + } + + const void* data; + }; //! Copies `data` to stagingBuffer and Records the commands needed to copy the data from stagingBuffer to `bufferRange.buffer`. //! Returns same as `updateBufferRangeViaStagingBuffer` with a callback instead of a pointer, make sure to submit with `nextSubmit.popSubmit()` after this function returns. //! Parameters: @@ -448,25 +460,9 @@ class NBL_API2 IUtilities : public core::IReferenceCounted template requires std::is_same_v,SIntendedSubmitInfo> inline bool updateBufferRangeViaStagingBuffer(IntendedSubmitInfo&& nextSubmit, const asset::SBufferRange& bufferRange, const void* data) { - // We check the guarantees of our documentation with the asserts while we're at it -#ifdef _NBL_DEBUG - size_t prevRangeEnd = 0; -#endif - - auto retval = updateBufferRangeViaStagingBuffer(nextSubmit,bufferRange,wrapUpstreamingDataProducerLambda( - [&](void* dst, const size_t offsetInRange, const uint32_t blockSize) -> uint32_t - { -#ifdef _NBL_DEBUG - assert(offsetInRange==prevRangeEnd); - prevRangeEnd = offsetInRange+blockSize; -#endif - memcpy(dst,reinterpret_cast(data)+offsetInRange,blockSize); - return blockSize; - } - )); -#ifdef _NBL_DEBUG - assert(prevRangeEnd==bufferRange.size); -#endif + CMemcpyUpstreamingDataProducer memcpyCb; + memcpyCb.data = data; + bool retval = updateBufferRangeViaStagingBuffer(nextSubmit,bufferRange,memcpyCb); return retval; } diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 8f0edb1056..2dddc74f77 100755 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -162,7 +162,7 @@ set(NBL_ASSET_SOURCES # Shaders ${NBL_ROOT_PATH}/src/nbl/asset/utils/ISPIRVOptimizer.cpp - ${NBL_ROOT_PATH}/src/nbl/asset/utils/ISPIRVDebloater.cpp + ${NBL_ROOT_PATH}/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp ${NBL_ROOT_PATH}/src/nbl/asset/utils/IShaderCompiler.cpp ${NBL_ROOT_PATH}/src/nbl/asset/utils/CGLSLCompiler.cpp ${NBL_ROOT_PATH}/src/nbl/asset/utils/CHLSLCompiler.cpp @@ -308,7 +308,7 @@ endif() set(COMMON_INCLUDE_DIRS ${THIRD_PARTY_SOURCE_DIR}/glm - ${THIRD_PARTY_SOURCE_DIR}/renderdoc # for renderdoc api header + ${THIRD_PARTY_SOURCE_DIR}/renderdoc # for renderdoc api header ${CMAKE_BINARY_DIR}/3rdparty/zlib #for dynamically generated zconf.h $ #for dynamically generated pnglibconf.h $ #for dynamically generated jconfig.h @@ -324,7 +324,6 @@ set(NBL_LIBRARY_CREATION_SOURCES ${NABLA_SRCS_COMMON} ${NABLA_HEADERS} $ - $ $ $ $ @@ -351,7 +350,7 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() -if(NBL_DYNAMIC_MSVC_RUNTIME) +if(NBL_COMPILER_DYNAMIC_RUNTIME) set_property(TARGET Nabla PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") else() set_property(TARGET Nabla PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") @@ -359,6 +358,13 @@ endif() target_compile_definitions(Nabla PRIVATE __NBL_BUILDING_NABLA__) +target_link_options(Nabla INTERFACE # proxy to downstream targets + $<$: + $<$:/DELAYLOAD:$> + /DELAYLOAD:dxcompiler.dll + > +) + if (ANDROID) add_library(android_native_app_glue STATIC ${ANDROID_NDK_ROOT_PATH}/sources/android/native_app_glue/android_native_app_glue.c @@ -391,6 +397,14 @@ if(_NBL_BUILD_DPL_) target_link_libraries(Nabla INTERFACE tbb tbbmalloc tbbmalloc_proxy) endif() +# bzip2 +if(NBL_STATIC_BUILD) + target_link_libraries(Nabla 
INTERFACE bz2_static) +else() + target_link_libraries(Nabla PRIVATE bz2_static) +endif() +add_dependencies(Nabla bz2_static) + # boost target_include_directories(Nabla PUBLIC "${BOOST_PREPROCESSOR_INCLUDE}") diff --git a/src/nbl/asset/ICPUDescriptorSet.cpp b/src/nbl/asset/ICPUDescriptorSet.cpp index 03724be1a2..7137edcba5 100644 --- a/src/nbl/asset/ICPUDescriptorSet.cpp +++ b/src/nbl/asset/ICPUDescriptorSet.cpp @@ -108,36 +108,4 @@ core::smart_refctd_ptr ICPUDescriptorSet::clone(uint32_t _depth) const return cp; } -IAsset* ICPUDescriptorSet::getDependant_impl(size_t ix) -{ - for (auto i=0u; i(IDescriptor::E_TYPE::ET_COUNT); i++) - if (m_descriptorInfos[i]) - { - const auto size = m_descriptorInfos[i]->size(); - if (ixoperator[](ix).desc.get(); - if (desc) - switch (IDescriptor::GetTypeCategory(static_cast(i))) - { - case IDescriptor::EC_BUFFER: - return static_cast(desc); - case IDescriptor::EC_SAMPLER: - return static_cast(desc); - case IDescriptor::EC_IMAGE: - return static_cast(desc); - case IDescriptor::EC_BUFFER_VIEW: - return static_cast(desc); - case IDescriptor::EC_ACCELERATION_STRUCTURE: - return static_cast(desc); - default: - break; - } - return nullptr; - } - else - ix -= size; - } - return nullptr; -} } \ No newline at end of file diff --git a/src/nbl/asset/utils/CSPIRVIntrospector.cpp b/src/nbl/asset/utils/CSPIRVIntrospector.cpp index 8b43c676b7..4ac78066a7 100644 --- a/src/nbl/asset/utils/CSPIRVIntrospector.cpp +++ b/src/nbl/asset/utils/CSPIRVIntrospector.cpp @@ -3,6 +3,8 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/asset/utils/CSPIRVIntrospector.h" + +#include "nbl/asset/ICPUPipeline.h" #include "nbl/asset/utils/spvUtils.h" #include "nbl_spirv_cross/spirv_parser.hpp" @@ -106,15 +108,15 @@ static CSPIRVIntrospector::CStageIntrospectionData::VAR_TYPE spvcrossType2E_TYPE } } -core::smart_refctd_ptr CSPIRVIntrospector::createApproximateComputePipelineFromIntrospection(const IPipelineBase::SShaderSpecInfo& info, core::smart_refctd_ptr&& layout/* = nullptr*/) +core::smart_refctd_ptr CSPIRVIntrospector::createApproximateComputePipelineFromIntrospection(const ICPUPipelineBase::SShaderSpecInfo& info, core::smart_refctd_ptr&& layout/* = nullptr*/) { - if (info.stage!=IShader::E_SHADER_STAGE::ESS_COMPUTE || info.valid()==IPipelineBase::SShaderSpecInfo::INVALID_SPEC_INFO) + if (info.valid()==ICPUPipelineBase::SShaderSpecInfo::INVALID_SPEC_INFO) return nullptr; CStageIntrospectionData::SParams params; params.entryPoint = info.entryPoint; params.shader = core::smart_refctd_ptr(info.shader); - params.stage = info.stage; + params.stage = hlsl::ShaderStage::ESS_COMPUTE; auto introspection = introspect(params); @@ -174,15 +176,13 @@ core::smart_refctd_ptr CSPIRVIntrospector::createApproximat layout = pplnIntrospectData->createApproximatePipelineLayoutFromIntrospection(introspection); } - ICPUComputePipeline::SCreationParams pplnCreationParams; - pplnCreationParams.layout = layout.get(); - pplnCreationParams.shader = info; - pplnCreationParams.layout = layout.get(); - return ICPUComputePipeline::create(pplnCreationParams); + auto pipeline = ICPUComputePipeline::create(layout.get()); + pipeline->getSpecInfos(hlsl::ShaderStage::ESS_COMPUTE)[0] = info; + return pipeline; } // returns true if successfully added all the info to self, false if incompatible with what's already in our pipeline or incomplete (e.g. 
missing spec constants) -NBL_API2 bool CSPIRVIntrospector::CPipelineIntrospectionData::merge(const CSPIRVIntrospector::CStageIntrospectionData* stageData, const IPipelineBase::SShaderSpecInfo::spec_constant_map_t* specConstants) +NBL_API2 bool CSPIRVIntrospector::CPipelineIntrospectionData::merge(const CSPIRVIntrospector::CStageIntrospectionData* stageData, const ICPUPipelineBase::SShaderSpecInfo::spec_constant_map_t* specConstants) { if (!stageData) return false; @@ -218,7 +218,7 @@ NBL_API2 bool CSPIRVIntrospector::CPipelineIntrospectionData::merge(const CSPIRV if (specConstantFound == specConstants->end()) return false; - descInfo.count = specConstantFound->second; + descInfo.count = (specConstantFound->second.size() != 0); } else { diff --git a/src/nbl/asset/utils/ISPIRVDebloater.cpp b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp similarity index 90% rename from src/nbl/asset/utils/ISPIRVDebloater.cpp rename to src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp index f05e9d70f5..de78d2b162 100644 --- a/src/nbl/asset/utils/ISPIRVDebloater.cpp +++ b/src/nbl/asset/utils/ISPIRVEntryPointTrimmer.cpp @@ -1,4 +1,4 @@ -#include "nbl/asset/utils/ISPIRVDebloater.h" +#include "nbl/asset/utils/ISPIRVEntryPointTrimmer.h" #include "nbl/asset/utils/ISPIRVOptimizer.h" #include "nbl_spirv_cross/spirv.hpp" @@ -10,15 +10,14 @@ using namespace nbl::asset; static constexpr spv_target_env SPIRV_VERSION = spv_target_env::SPV_ENV_UNIVERSAL_1_6; -ISPIRVDebloater::ISPIRVDebloater() +ISPIRVEntryPointTrimmer::ISPIRVEntryPointTrimmer() { constexpr auto optimizationPasses = std::array{ - ISPIRVOptimizer::EOP_DEAD_BRANCH_ELIM, - ISPIRVOptimizer::EOP_ELIM_DEAD_FUNCTIONS, ISPIRVOptimizer::EOP_DEAD_BRANCH_ELIM, ISPIRVOptimizer::EOP_ELIM_DEAD_FUNCTIONS, ISPIRVOptimizer::EOP_ELIM_DEAD_VARIABLES, ISPIRVOptimizer::EOP_ELIM_DEAD_CONSTANTS, + ISPIRVOptimizer::EOP_AGGRESSIVE_DCE, ISPIRVOptimizer::EOP_ELIM_DEAD_MEMBERS, ISPIRVOptimizer::EOP_TRIM_CAPABILITIES, }; @@ -78,7 +77,7 @@ static bool validate(const uint32_t* binary, uint32_t binarySize, nbl::system::l return core.Validate(binary, binarySize, validatorOptions); } -ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, const core::set& entryPoints, system::logger_opt_ptr logger) const +ISPIRVEntryPointTrimmer::Result ISPIRVEntryPointTrimmer::trim(const ICPUBuffer* spirvBuffer, const core::set& entryPoints, system::logger_opt_ptr logger) const { const auto* spirv = static_cast(spirvBuffer->getPointer()); const auto spirvDwordCount = spirvBuffer->getSize() / 4; @@ -134,7 +133,7 @@ ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, std::vector minimizedSpirv; core::unordered_set removedEntryPointIds; - bool needDebloat = false; + bool needtrim = false; auto offset = HEADER_SIZE; auto parse_instruction = [](uint32_t instruction) -> std::tuple { @@ -185,16 +184,16 @@ ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, foundEntryPoint += 1; // a valid spirv will have unique entry points, so this should works } else { - if (needDebloat == false) + if (needtrim == false) { minimizedSpirv.reserve(spirvDwordCount); minimizedSpirv.insert(minimizedSpirv.end(), spirv, spirv + curOffset); - needDebloat = true; + needtrim = true; } removedEntryPointIds.insert(curEntryPointId); continue; } - if (!needDebloat) continue; + if (!needtrim) continue; minimizedSpirv.insert(minimizedSpirv.end(), spirv + curOffset, spirv + offset); } @@ -208,7 +207,7 @@ ISPIRVDebloater::Result ISPIRVDebloater::debloat(const 
ICPUBuffer* spirvBuffer, }; } - if (!needDebloat) + if (!needtrim) { return { .spirv = nullptr, @@ -236,22 +235,22 @@ ISPIRVDebloater::Result ISPIRVDebloater::debloat(const ICPUBuffer* spirvBuffer, assert(validate(minimizedSpirv.data(), minimizedSpirv.size(), logger)); - auto debloatedSpirv = m_optimizer->optimize(minimizedSpirv.data(), minimizedSpirv.size(), logger); + auto trimmedSpirv = m_optimizer->optimize(minimizedSpirv.data(), minimizedSpirv.size(), logger); #ifdef _NBL_DEBUG logger.log("Before stripping capabilities:", nbl::system::ILogger::ELL_DEBUG); printCapabilities(spirv, spirvDwordCount, logger); logger.log("\n", nbl::system::ILogger::ELL_DEBUG); - const auto* debloatedSpirvBuffer = static_cast(debloatedSpirv->getPointer()); - const auto debloatedSpirvDwordCount = debloatedSpirv->getSize() / 4; + const auto* trimmedSpirvBuffer = static_cast(trimmedSpirv->getPointer()); + const auto trimmedSpirvDwordCount = trimmedSpirv->getSize() / 4; logger.log("After stripping capabilities:", nbl::system::ILogger::ELL_DEBUG); - printCapabilities(debloatedSpirvBuffer, debloatedSpirvDwordCount, logger); + printCapabilities(trimmedSpirvBuffer, trimmedSpirvDwordCount, logger); logger.log("\n", nbl::system::ILogger::ELL_DEBUG); #endif return { - .spirv = std::move(debloatedSpirv), + .spirv = std::move(trimmedSpirv), .isSuccess = true, }; diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 9333a0d3b4..a3d15744a7 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -330,6 +330,10 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/basic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/arithmetic_portability.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/arithmetic_portability_impl.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup/fft.hlsl") +#subgroup2 +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/ballot.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/arithmetic_portability.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/subgroup2/arithmetic_portability_impl.hlsl") #shared header between C++ and HLSL LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/surface_transform.h") #workgroup @@ -341,6 +345,13 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/fft.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/scratch_size.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shared_scan.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup/shuffle.hlsl") +#workgroup2 +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/arithmetic_config.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/impl/virtual_wg_size_def.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/impl/items_per_invoc_def.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/impl/arithmetic_config_def.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/arithmetic.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/workgroup2/shared_scan.hlsl") #Extensions LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/SVertexAttributes.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/ext/FullScreenTriangle/default.vert.hlsl") @@ -361,7 +372,9 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/anisotropi LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED 
"hlsl/concepts/accessors/loadable_image.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/mip_mapped.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/storable_image.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/generic_shared_data.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/fft.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts/accessors/workgroup_arithmetic.hlsl") #tgmath LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tgmath/impl.hlsl") diff --git a/src/nbl/builtin/utils.cmake b/src/nbl/builtin/utils.cmake index 05f9618203..0653ff97a2 100644 --- a/src/nbl/builtin/utils.cmake +++ b/src/nbl/builtin/utils.cmake @@ -39,7 +39,7 @@ endmacro() # _NAMESPACE_ is a C++ namespace builtin resources will be wrapped into # _OUTPUT_INCLUDE_SEARCH_DIRECTORY_ is an absolute path to output directory for builtin resources header files which will be a search directory for generated headers outputed to ${_OUTPUT_HEADER_DIRECTORY_}/${_NAMESPACE_PREFIX_} where namespace prefix is the namespace turned into a path # _OUTPUT_SOURCE_DIRECTORY_ is an absolute path to output directory for builtin resources source files -# _STATIC_ optional last argument is a bool, if true then add_library will use STATIC, SHARED otherwise. Pay attention that MSVC runtime is controlled by NBL_DYNAMIC_MSVC_RUNTIME which is not an argument of this function +# _STATIC_ optional last argument is a bool, if true then add_library will use STATIC, SHARED otherwise. Pay attention that MSVC runtime is controlled by NBL_COMPILER_DYNAMIC_RUNTIME which is not an argument of this function # # As an example one could list a resource as following # LIST_BUILTIN_RESOURCE(SOME_RESOURCES_TO_EMBED "glsl/blit/default_compute_normalization.comp") @@ -208,12 +208,8 @@ function(ADD_CUSTOM_BUILTIN_RESOURCES _TARGET_NAME_ _BUNDLE_NAME_ _BUNDLE_SEARCH "${_OUTPUT_HEADER_DIRECTORY_}" ) set_target_properties(${_TARGET_NAME_} PROPERTIES CXX_STANDARD 20) - - if(NBL_DYNAMIC_MSVC_RUNTIME) - set_property(TARGET ${_TARGET_NAME_} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") - else() - set_property(TARGET ${_TARGET_NAME_} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") - endif() + + set_property(TARGET ${_TARGET_NAME_} PROPERTY MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>$<$:DLL>") set(NBL_BUILTIN_RESOURCES ${NBL_BUILTIN_RESOURCES}) # turn builtin resources paths list into variable diff --git a/src/nbl/device/DeviceGen.py b/src/nbl/device/DeviceGen.py index 288732de9b..9ad485fc84 100644 --- a/src/nbl/device/DeviceGen.py +++ b/src/nbl/device/DeviceGen.py @@ -562,7 +562,7 @@ def buildTraitsHeader(**params): res.append(emptyline) if 'enable_jit' in params and params['enable_jit']: - res.append("std::string jit_traits = R\"===(") + res.append("std::ostringstream oss;") buildTraitsHeaderHelper( res, @@ -582,7 +582,7 @@ def buildTraitsHeader(**params): ) if 'enable_jit' in params and params['enable_jit']: - res.append(")===\";") + res.append("std::string jit_traits = oss.str();") return res diff --git a/src/nbl/device/gen.py b/src/nbl/device/gen.py index b910d1aa8f..88174cb3c2 100644 --- a/src/nbl/device/gen.py +++ b/src/nbl/device/gen.py @@ -120,7 +120,7 @@ args.jit_traits_output_path, buildTraitsHeader, type="JIT Members", - template="NBL_CONSTEXPR_STATIC_INLINE {} {} = )===\" + std::string(\"({})\") + CJITIncludeLoader::to_string({}.{}) + 
R\"===(;", + template="oss << \"NBL_CONSTEXPR_STATIC_INLINE {} {} = ({})\" + CJITIncludeLoader::to_string({}.{}) << \";\\n\";", limits_json=limits, features_json=features, format_params=["type", "name", "type", "json_type", "cpp_name"], diff --git a/src/nbl/ext/ImGui/ImGui.cpp b/src/nbl/ext/ImGui/ImGui.cpp index b40c7155be..f477e96cdf 100644 --- a/src/nbl/ext/ImGui/ImGui.cpp +++ b/src/nbl/ext/ImGui/ImGui.cpp @@ -342,17 +342,13 @@ core::smart_refctd_ptr UI::createPipeline(SCreation core::smart_refctd_ptr pipeline; { - const IPipelineBase::SShaderSpecInfo specs[] = - { - {.shader = shaders.vertex.get(), .entryPoint = "VSMain", .stage = hlsl::ShaderStage::ESS_VERTEX}, - {.shader = shaders.fragment.get(), .entryPoint = "PSMain", .stage = hlsl::ShaderStage::ESS_FRAGMENT} - }; IGPUGraphicsPipeline::SCreationParams params[1]; { auto& param = params[0u]; + param.vertexShader = { .shader = shaders.vertex.get(), .entryPoint = "VSMain" }; + param.fragmentShader = { .shader = shaders.fragment.get(), .entryPoint = "PSMain" }; param.layout = pipelineLayout.get(); - param.shaders = specs; param.renderpass = creationParams.renderpass.get(); param.cached = { .vertexInput = vertexInputParams, .primitiveAssembly = primitiveAssemblyParams, .rasterization = rasterizationParams, .blend = blendParams, .subpassIx = creationParams.subpassIx }; }; diff --git a/src/nbl/video/CJITIncludeLoader.cpp b/src/nbl/video/CJITIncludeLoader.cpp index edab1c046a..1fcbcb0505 100644 --- a/src/nbl/video/CJITIncludeLoader.cpp +++ b/src/nbl/video/CJITIncludeLoader.cpp @@ -20,7 +20,6 @@ auto CJITIncludeLoader::getInclude(const system::path& searchPath, const std::st std::string CJITIncludeLoader::collectDeviceCaps(const SPhysicalDeviceLimits& limits, const SPhysicalDeviceFeatures& features) { #include "nbl/video/device_capabilities_traits_jit.h" - std::string start = R"===( #ifndef _NBL_BUILTIN_HLSL_JIT_DEVICE_CAPABILITIES_INCLUDED_ #define _NBL_BUILTIN_HLSL_JIT_DEVICE_CAPABILITIES_INCLUDED_ @@ -49,4 +48,4 @@ std::string CJITIncludeLoader::collectDeviceCaps(const SPhysicalDeviceLimits& li return start + jit_traits + end; } -} \ No newline at end of file +} diff --git a/src/nbl/video/CVulkanAccelerationStructure.h b/src/nbl/video/CVulkanAccelerationStructure.h index 6b94f9cad7..4c0d67eee1 100644 --- a/src/nbl/video/CVulkanAccelerationStructure.h +++ b/src/nbl/video/CVulkanAccelerationStructure.h @@ -54,21 +54,6 @@ class CVulkanTopLevelAccelerationStructure final : public CVulkanAccelerationStr using Base::Base; }; - -//! 
all these utilities cannot be nested because of the complex inheritance between `IGPUAccelerationStructure` and the Vulkan classes -inline VkCopyAccelerationStructureModeKHR getVkCopyAccelerationStructureModeFrom(const IGPUAccelerationStructure::COPY_MODE in) -{ - return static_cast(in); -} -inline VkCopyAccelerationStructureInfoKHR getVkCopyAccelerationStructureInfoFrom(const IGPUAccelerationStructure::CopyInfo& copyInfo) -{ - VkCopyAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; - info.src = *reinterpret_cast(copyInfo.src->getNativeHandle()); - info.dst = *reinterpret_cast(copyInfo.dst->getNativeHandle()); - info.mode = getVkCopyAccelerationStructureModeFrom(copyInfo.mode); - return info; -} - template concept Buffer = is_any_of_v,IGPUBuffer,asset::ICPUBuffer>; @@ -91,24 +76,6 @@ inline DeviceOrHostAddress getVkDeviceOrHostAddress(const asset::SBu } return addr; } -template -inline VkCopyAccelerationStructureToMemoryInfoKHR getVkCopyAccelerationStructureToMemoryInfoFrom(const IGPUAccelerationStructure::CopyToMemoryInfo& copyInfo) -{ - VkCopyAccelerationStructureToMemoryInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_TO_MEMORY_INFO_KHR,nullptr }; - info.src = *reinterpret_cast(copyInfo.src->getNativeHandle()); - info.dst = getVkDeviceOrHostAddress(copyInfo.dst); - info.mode = getVkCopyAccelerationStructureModeFrom(copyInfo.mode); - return info; -} -template -inline VkCopyMemoryToAccelerationStructureInfoKHR getVkCopyMemoryToAccelerationStructureInfoFrom(const IGPUAccelerationStructure::CopyFromMemoryInfo& copyInfo) -{ - VkCopyMemoryToAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_MEMORY_TO_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; - info.src = getVkDeviceOrHostAddress(copyInfo.src); - info.dst = *reinterpret_cast(copyInfo.dst->getNativeHandle()); - info.mode = getVkCopyAccelerationStructureModeFrom(copyInfo.mode); - return info; -} inline VkGeometryFlagsKHR getVkGeometryFlagsFrom(const IGPUBottomLevelAccelerationStructure::GEOMETRY_FLAGS in) { @@ -118,7 +85,7 @@ inline VkGeometryFlagsKHR getVkGeometryFlagsFrom(const IGPUBottomLevelAccelerati // The srcAccelerationStructure, dstAccelerationStructure, and mode members of pBuildInfo are ignored. Any VkDeviceOrHostAddressKHR members of pBuildInfo are ignored by this command static const VkDeviceOrHostAddressConstKHR NullAddress = { 0x0ull }; template -void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase) +void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase) { static const VkDeviceOrHostAddressConstKHR DummyNonNullAddress = { 0xdeadbeefBADC0FFEull }; @@ -129,7 +96,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles(triangles.indexType); - outBase.geometry.triangles.indexData = QueryOnly ? NullAddress:getVkDeviceOrHostAddress(triangles.indexData); + outBase.geometry.triangles.indexData = triangles.indexType==asset::E_INDEX_TYPE::EIT_UNKNOWN || QueryOnly ? NullAddress:getVkDeviceOrHostAddress(triangles.indexData); // except that the hostAddress member of VkAccelerationStructureGeometryTrianglesDataKHR::transformData will be examined to check if it is NULL. 
if (!triangles.hasTransform()) outBase.geometry.triangles.transformData = NullAddress; @@ -137,7 +104,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles(triangles.transform); @@ -145,9 +112,9 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles -void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase, VkAccelerationStructureGeometryMotionTrianglesDataNV* &p_vertexMotion) +void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles& triangles, VkAccelerationStructureGeometryKHR& outBase, VkAccelerationStructureGeometryMotionTrianglesDataNV* &p_vertexMotion) { - getVkASGeometryFrom(triangles,outBase); + getVkASGeometryFrom(triangles,outBase); if (triangles.vertexData[1].buffer) { p_vertexMotion->sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_MOTION_TRIANGLES_DATA_NV; @@ -158,7 +125,7 @@ void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::Triangles -void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::AABBs& aabbs, VkAccelerationStructureGeometryKHR& outBase) +void getVkASGeometryFrom(const IGPUBottomLevelAccelerationStructure::AABBs& aabbs, VkAccelerationStructureGeometryKHR& outBase) { outBase = {VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR,nullptr,VK_GEOMETRY_TYPE_AABBS_KHR}; outBase.geometry.aabbs = {VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_AABBS_DATA_KHR,nullptr}; @@ -221,7 +188,7 @@ inline VkAccelerationStructureBuildGeometryInfoKHR getVkASBuildGeometryInfo(cons for (auto j=0u; j(src->getNativeHandle()); + info.dst = *reinterpret_cast(dst->getNativeHandle()); + info.mode = compact ? VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR:VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR; getFunctionTable().vkCmdCopyAccelerationStructureKHR(m_cmdbuf,&info); return true; } -bool CVulkanCommandBuffer::copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) +bool CVulkanCommandBuffer::copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) { - const auto info = getVkCopyAccelerationStructureToMemoryInfoFrom(copyInfo); + VkCopyAccelerationStructureToMemoryInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_TO_MEMORY_INFO_KHR,nullptr }; + info.src = *reinterpret_cast(src->getNativeHandle()); + info.dst = getVkDeviceOrHostAddress(dst); + info.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_SERIALIZE_KHR; getFunctionTable().vkCmdCopyAccelerationStructureToMemoryKHR(m_cmdbuf,&info); return true; } -bool CVulkanCommandBuffer::copyAccelerationStructureFromMemory_impl(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) +bool CVulkanCommandBuffer::copyAccelerationStructureFromMemory_impl(const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) { - const auto info = getVkCopyMemoryToAccelerationStructureInfoFrom(copyInfo); + VkCopyMemoryToAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_MEMORY_TO_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; + info.src = getVkDeviceOrHostAddress(src); + info.dst = *reinterpret_cast(dst->getNativeHandle()); + info.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_DESERIALIZE_KHR; getFunctionTable().vkCmdCopyMemoryToAccelerationStructureKHR(m_cmdbuf,&info); return true; } @@ -661,7 +670,7 @@ bool CVulkanCommandBuffer::beginRenderPass_impl(const SRenderpassBeginInfo& info .renderArea = info.renderArea, // Implicitly but could 
be optimizedif needed // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkRenderPassBeginInfo.html#VUID-VkRenderPassBeginInfo-clearValueCount-00902 - .clearValueCount = vk_clearValues.size()/sizeof(VkClearValue), + .clearValueCount = static_cast(vk_clearValues.size()/sizeof(VkClearValue)), // Implicit // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkRenderPassBeginInfo.html#VUID-VkRenderPassBeginInfo-clearValueCount-04962 .pClearValues = vk_clearValues.data() diff --git a/src/nbl/video/CVulkanCommandBuffer.h b/src/nbl/video/CVulkanCommandBuffer.h index 99b1c15644..9383585b23 100644 --- a/src/nbl/video/CVulkanCommandBuffer.h +++ b/src/nbl/video/CVulkanCommandBuffer.h @@ -177,9 +177,9 @@ class CVulkanCommandBuffer final : public IGPUCommandBuffer return true; } - bool copyAccelerationStructure_impl(const IGPUAccelerationStructure::CopyInfo& copyInfo) override; - bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) override; - bool copyAccelerationStructureFromMemory_impl(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) override; + bool copyAccelerationStructure_impl(const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact); + bool copyAccelerationStructureToMemory_impl(const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst); + bool copyAccelerationStructureFromMemory_impl(const asset::SBufferBinding& src, IGPUAccelerationStructure* dst); bool bindComputePipeline_impl(const IGPUComputePipeline* const pipeline) override; bool bindGraphicsPipeline_impl(const IGPUGraphicsPipeline* const pipeline) override; diff --git a/src/nbl/video/CVulkanComputePipeline.h b/src/nbl/video/CVulkanComputePipeline.h index 76fb346e30..89077f9a9a 100644 --- a/src/nbl/video/CVulkanComputePipeline.h +++ b/src/nbl/video/CVulkanComputePipeline.h @@ -15,10 +15,9 @@ class CVulkanComputePipeline final : public IGPUComputePipeline { public: CVulkanComputePipeline( - core::smart_refctd_ptr&& _layout, - const core::bitflag _flags, + const SCreationParams& params, const VkPipeline pipeline - ) : IGPUComputePipeline(std::move(_layout),_flags), m_pipeline(pipeline) {} + ) : IGPUComputePipeline(params), m_pipeline(pipeline) {} inline const void* getNativeHandle() const override { return &m_pipeline; } diff --git a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp index 2bec9e9d06..90b2993cb3 100644 --- a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp +++ b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp @@ -40,7 +40,7 @@ CVulkanDeviceMemoryBacked::CVulkanDeviceMemoryBacked( assert(vkHandle!=VK_NULL_HANDLE); } -template CVulkanDeviceMemoryBacked; -template CVulkanDeviceMemoryBacked; +template class CVulkanDeviceMemoryBacked; +template class CVulkanDeviceMemoryBacked; } \ No newline at end of file diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 792ab719eb..9494efc2f2 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -1,6 +1,6 @@ #include "nbl/video/CVulkanLogicalDevice.h" -#include "nbl/asset/utils/ISPIRVDebloater.h" +#include "nbl/asset/utils/ISPIRVEntryPointTrimmer.h" #include "nbl/video/CThreadSafeQueueAdapter.h" #include "nbl/video/surface/CSurfaceVulkan.h" @@ -498,21 +498,30 @@ bool CVulkanLogicalDevice::writeAccelerationStructuresProperties_impl(const std: return 
m_devf.vk.vkWriteAccelerationStructuresPropertiesKHR(m_vkdev,vk_accelerationStructures.size(),vk_accelerationStructures.data(),static_cast(type),stride*accelerationStructures.size(),data,stride); } -auto CVulkanLogicalDevice::copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::CopyInfo& copyInfo) -> DEFERRABLE_RESULT +auto CVulkanLogicalDevice::copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) -> DEFERRABLE_RESULT { - const auto info = getVkCopyAccelerationStructureInfoFrom(copyInfo); + VkCopyAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; + info.src = *reinterpret_cast(src->getNativeHandle()); + info.dst = *reinterpret_cast(dst->getNativeHandle()); + info.mode = compact ? VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR:VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR; return getDeferrableResultFrom(m_devf.vk.vkCopyAccelerationStructureKHR(m_vkdev,static_cast(deferredOperation)->getInternalObject(),&info)); } -auto CVulkanLogicalDevice::copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyToMemoryInfo& copyInfo) -> DEFERRABLE_RESULT +auto CVulkanLogicalDevice::copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) -> DEFERRABLE_RESULT { - const auto info = getVkCopyAccelerationStructureToMemoryInfoFrom(copyInfo); + VkCopyAccelerationStructureToMemoryInfoKHR info = { VK_STRUCTURE_TYPE_COPY_ACCELERATION_STRUCTURE_TO_MEMORY_INFO_KHR,nullptr }; + info.src = *reinterpret_cast(src->getNativeHandle()); + info.dst = getVkDeviceOrHostAddress(dst); + info.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_SERIALIZE_KHR; return getDeferrableResultFrom(m_devf.vk.vkCopyAccelerationStructureToMemoryKHR(m_vkdev,static_cast(deferredOperation)->getInternalObject(),&info)); } -auto CVulkanLogicalDevice::copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyFromMemoryInfo& copyInfo) -> DEFERRABLE_RESULT +auto CVulkanLogicalDevice::copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) -> DEFERRABLE_RESULT { - const auto info = getVkCopyMemoryToAccelerationStructureInfoFrom(copyInfo); + VkCopyMemoryToAccelerationStructureInfoKHR info = { VK_STRUCTURE_TYPE_COPY_MEMORY_TO_ACCELERATION_STRUCTURE_INFO_KHR,nullptr }; + info.src = getVkDeviceOrHostAddress(src); + info.dst = *reinterpret_cast(dst->getNativeHandle()); + info.mode = VK_COPY_ACCELERATION_STRUCTURE_MODE_DESERIALIZE_KHR; return getDeferrableResultFrom(m_devf.vk.vkCopyMemoryToAccelerationStructureKHR(m_vkdev,static_cast(deferredOperation)->getInternalObject(),&info)); } @@ -571,13 +580,13 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createDesc core::smart_refctd_ptr CVulkanLogicalDevice::createPipelineLayout_impl( const std::span pcRanges, - core::smart_refctd_ptr&& layout0, - core::smart_refctd_ptr&& layout1, - core::smart_refctd_ptr&& layout2, - core::smart_refctd_ptr&& layout3 + core::smart_refctd_ptr&& layout0, + core::smart_refctd_ptr&& layout1, + core::smart_refctd_ptr&& layout2, + core::smart_refctd_ptr&& layout3 ) { - const core::smart_refctd_ptr tmp[] = { layout0, layout1, layout2, layout3 }; 
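	// Descriptive note (an assumption about intent): the `tmp` array below gathers the four optional set layouts so the
	// code that follows can pull their Vulkan handles into `vk_dsLayouts` and compute `nonNullSetLayoutCount`;
	// a hedged sketch of that gather, with CVulkanDescriptorSetLayout::getInternalObject() assumed to exist
	// like on the other CVulkan* wrappers used in this file:
	//   for (uint32_t i=0u; i<asset::ICPUPipelineLayout::DESCRIPTOR_SET_COUNT; i++)
	//       vk_dsLayouts[i] = tmp[i] ? static_cast<const CVulkanDescriptorSetLayout*>(tmp[i].get())->getInternalObject():VK_NULL_HANDLE;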
+ const core::smart_refctd_ptr tmp[] = { layout0, layout1, layout2, layout3 }; VkDescriptorSetLayout vk_dsLayouts[asset::ICPUPipelineLayout::DESCRIPTOR_SET_COUNT]; uint32_t nonNullSetLayoutCount = ~0u; @@ -1035,7 +1044,9 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createFramebuffer_ // TODO: Change this to pass SPIR-V directly! VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( - const asset::IPipelineBase::SShaderSpecInfo& specInfo, + const video::IGPUPipelineBase::SShaderSpecInfo& specInfo, + hlsl::ShaderStage stage, + bool requireFullSubgroups, VkShaderModuleCreateInfo* &outShaderModule, std::string* &outEntryPoints, VkPipelineShaderStageRequiredSubgroupSizeCreateInfo* &outRequiredSubgroupSize, @@ -1054,8 +1065,6 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( // TODO: VkShaderModuleValidationCacheCreateInfoEXT from VK_EXT_validation_cache // TODO: VkPipelineRobustnessCreateInfoEXT from VK_EXT_pipeline_robustness (allows per-pipeline control of robustness) - const auto stage = specInfo.stage; - (*outEntryPoints) = specInfo.entryPoint; const auto entryPointName = outEntryPoints->c_str(); outEntryPoints++; @@ -1076,8 +1085,8 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( { outSpecMapEntry->constantID = entry.first; outSpecMapEntry->offset = std::distance(specDataBegin,outSpecData); - outSpecMapEntry->size = entry.second.size; - memcpy(outSpecData,entry.second.data,outSpecMapEntry->size); + outSpecMapEntry->size = entry.second.size(); + memcpy(outSpecData, entry.second.data(), outSpecMapEntry->size); outSpecData += outSpecMapEntry->size; outSpecMapEntry++; } @@ -1098,7 +1107,7 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( outShaderModule++; // Implicit: https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02754 - using subgroup_size_t = std::remove_reference_t::SUBGROUP_SIZE; + using subgroup_size_t = asset::IPipelineBase::SUBGROUP_SIZE; if (specInfo.requiredSubgroupSize>=subgroup_size_t::REQUIRE_4) { *ppNext = outRequiredSubgroupSize; @@ -1110,7 +1119,7 @@ VkPipelineShaderStageCreateInfo getVkShaderStageCreateInfoFrom( else retval.flags = 0; - if (specInfo.requireFullSubgroups) + if (requireFullSubgroups) { assert(stage==hlsl::ShaderStage::ESS_COMPUTE/*TODO: Or Mesh Or Task*/); retval.flags |= VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT; @@ -1141,7 +1150,7 @@ void CVulkanLogicalDevice::createComputePipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPUComputePipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) { const VkPipelineCache vk_pipelineCache = pipelineCache ? 
static_cast(pipelineCache)->getInternalObject():VK_NULL_HANDLE; @@ -1168,7 +1177,7 @@ void CVulkanLogicalDevice::createComputePipelines_impl( { initPipelineCreateInfo(outCreateInfo,info); const auto& spec = info.shader; - outCreateInfo->stage = getVkShaderStageCreateInfoFrom(spec, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); + outCreateInfo->stage = getVkShaderStageCreateInfoFrom(spec, hlsl::ShaderStage::ESS_COMPUTE, info.cached.requireFullSubgroups, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); outCreateInfo++; } auto vk_pipelines = reinterpret_cast(output); @@ -1182,12 +1191,11 @@ void CVulkanLogicalDevice::createComputePipelines_impl( // break the lifetime cause of the aliasing std::uninitialized_default_construct_n(output+i,1); output[i] = core::make_smart_refctd_ptr( - core::smart_refctd_ptr(info.layout), - info.flags,vk_pipeline + info,vk_pipeline ); debugNameBuilder.str(""); const auto& specInfo = createInfos[i].shader; - debugNameBuilder << specInfo.shader->getFilepathHint() << "(" << specInfo.entryPoint << "," << specInfo.stage << ")\n"; + debugNameBuilder << specInfo.shader->getFilepathHint() << "(" << specInfo.entryPoint << "," << hlsl::ShaderStage::ESS_COMPUTE << ")\n"; } } else @@ -1198,7 +1206,7 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPUGraphicsPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) { auto getVkStencilOpStateFrom = [](const asset::SStencilOpParams& params)->VkStencilOpState @@ -1300,14 +1308,20 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( { initPipelineCreateInfo(outCreateInfo,info); outCreateInfo->pStages = outShaderStage; - for (const auto& spec : info.shaders) + auto processSpecShader = [&](IGPUPipelineBase::SShaderSpecInfo spec, hlsl::ShaderStage shaderStage) { if (spec.shader) { - *(outShaderStage++) = getVkShaderStageCreateInfoFrom(spec, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); - outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages, outShaderStage); + *(outShaderStage++) = getVkShaderStageCreateInfoFrom(spec, shaderStage, false, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo, outSpecMapEntry, outSpecData); + outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages, outShaderStage); } - } + }; + processSpecShader(info.vertexShader, hlsl::ShaderStage::ESS_VERTEX); + processSpecShader(info.tesselationControlShader, hlsl::ShaderStage::ESS_TESSELLATION_CONTROL); + processSpecShader(info.tesselationEvaluationShader, hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION); + processSpecShader(info.geometryShader, hlsl::ShaderStage::ESS_GEOMETRY); + processSpecShader(info.fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT); + // when dealing with mesh shaders, the vertex input and assembly state will be null { { @@ -1342,17 +1356,13 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( } outCreateInfo->pInputAssemblyState = outInputAssembly++; } - for (const auto& spec : info.shaders) - if (spec.shader) + + if (info.tesselationControlShader.shader || info.tesselationEvaluationShader.shader) { - const auto stage = spec.stage; - if (stage==hlsl::ShaderStage::ESS_TESSELLATION_CONTROL || 
stage==hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION) - { - outTessellation->patchControlPoints = info.cached.primitiveAssembly.tessPatchVertCount; - outCreateInfo->pTessellationState = outTessellation++; - break; - } + outTessellation->patchControlPoints = info.cached.primitiveAssembly.tessPatchVertCount; + outCreateInfo->pTessellationState = outTessellation++; } + const auto& raster = info.cached.rasterization; { outViewport->viewportCount = raster.viewportCount; @@ -1432,16 +1442,22 @@ void CVulkanLogicalDevice::createGraphicsPipelines_impl( { for (size_t i=0ull; i(createInfos[i],vk_pipeline); debugNameBuilder.str(""); - for (const auto& shader: createInfos[i].shaders) + auto buildDebugName = [&](const IGPUPipelineBase::SShaderSpecInfo& spec, hlsl::ShaderStage stage) { - if (shader.shader != nullptr) - debugNameBuilder <getFilepathHint() << "(" << shader.entryPoint << "," << shader.stage << ")\n"; - } + if (spec.shader != nullptr) + debugNameBuilder <getFilepathHint() << "(" << spec.entryPoint << "," << stage << ")\n"; + }; + buildDebugName(createInfo.vertexShader, hlsl::ESS_VERTEX); + buildDebugName(createInfo.tesselationControlShader, hlsl::ESS_TESSELLATION_CONTROL); + buildDebugName(createInfo.tesselationEvaluationShader, hlsl::ESS_TESSELLATION_EVALUATION); + buildDebugName(createInfo.geometryShader, hlsl::ESS_GEOMETRY); + buildDebugName(createInfo.fragmentShader, hlsl::ESS_FRAGMENT); output[i]->setObjectDebugName(debugNameBuilder.str().c_str()); } } @@ -1453,12 +1469,11 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPURayTracingPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) { - using SShaderGroupParams = asset::IRayTracingPipelineBase::SShaderGroupsParams; - using SGeneralShaderGroup = asset::IRayTracingPipelineBase::SGeneralShaderGroup; - using SHitShaderGroup = asset::IRayTracingPipelineBase::SHitShaderGroup; + using SShaderGroupParams = IGPURayTracingPipeline::SCreationParams::SShaderGroupsParams; + using SHitShaderGroup = IGPURayTracingPipeline::SHitGroup; const auto dynamicStates = std::array{ VK_DYNAMIC_STATE_RAY_TRACING_PIPELINE_STACK_SIZE_KHR }; const VkPipelineDynamicStateCreateInfo vk_dynamicStateCreateInfo = { @@ -1471,9 +1486,44 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( const VkPipelineCache vk_pipelineCache = pipelineCache ? 
static_cast(pipelineCache)->getInternalObject():VK_NULL_HANDLE; + struct ShaderModuleKey + { + const asset::IShader* shader; + std::string_view entryPoint; + bool operator==(const ShaderModuleKey& other) const = default; + + struct HashFunction + { + size_t operator()(const ShaderModuleKey& key) const + { + size_t rowHash = std::hash()(key.shader); + size_t colHash = std::hash()(key.entryPoint) << 1; + return rowHash ^ colHash; + } + }; + }; size_t maxShaderStages = 0; for (const auto& info : createInfos) - maxShaderStages += info.shaders.size(); + { + core::unordered_set shaderModules; + shaderModules.insert({ info.shaderGroups.raygen.shader, info.shaderGroups.raygen.entryPoint }); + for (const auto& miss : info.shaderGroups.misses) + { + shaderModules.insert({ miss.shader, miss.entryPoint }); + } + for (const auto& hit : info.shaderGroups.hits) + { + shaderModules.insert({ hit.closestHit.shader, hit.closestHit.entryPoint }); + shaderModules.insert({ hit.anyHit.shader, hit.anyHit.entryPoint }); + shaderModules.insert({ hit.intersection.shader, hit.intersection.entryPoint }); + } + for (const auto& callable : info.shaderGroups.callables) + { + shaderModules.insert({ callable.shader, callable.entryPoint }); + } + + maxShaderStages += shaderModules.size(); + } size_t maxShaderGroups = 0; for (const auto& info : createInfos) maxShaderGroups += info.shaderGroups.getShaderGroupCount(); @@ -1498,52 +1548,85 @@ void CVulkanLogicalDevice::createRayTracingPipelines_impl( auto outSpecInfo = vk_specializationInfos.data(); auto outSpecMapEntry = vk_specializationMapEntry.data(); auto outSpecData = specializationData.data(); - auto getVkShaderIndex = [](uint32_t index) { return index == SShaderGroupParams::SIndex::Unused ? VK_SHADER_UNUSED_KHR : index; }; - auto getGeneralVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](SGeneralShaderGroup group) -> VkRayTracingShaderGroupCreateInfoKHR + + for (const auto& info : createInfos) { - return { - .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, - .pNext = nullptr, - .type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR, - .generalShader = getVkShaderIndex(group.index), - .closestHitShader = VK_SHADER_UNUSED_KHR, - .anyHitShader = VK_SHADER_UNUSED_KHR, - .intersectionShader = VK_SHADER_UNUSED_KHR, + + core::unordered_map shaderIndexes; + auto getVkShaderIndex = [&](const IGPUPipelineBase::SShaderSpecInfo& spec) + { + const auto key = ShaderModuleKey{ spec.shader, spec.entryPoint }; + const auto index = key.shader == nullptr ? VK_SHADER_UNUSED_KHR : shaderIndexes[key]; + return index; }; - }; - auto getHitVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](SHitShaderGroup group) -> VkRayTracingShaderGroupCreateInfoKHR - { - return { - .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, - .pNext = nullptr, - .type = group.intersection == SShaderGroupParams::SIndex::Unused ? 
- VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR : VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR, - .generalShader = VK_SHADER_UNUSED_KHR, - .closestHitShader = getVkShaderIndex(group.closestHit), - .anyHitShader = getVkShaderIndex(group.anyHit), - .intersectionShader = getVkShaderIndex(group.intersection), + + auto getGeneralVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](IGPUPipelineBase::SShaderSpecInfo spec) -> VkRayTracingShaderGroupCreateInfoKHR + { + return { + .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, + .pNext = nullptr, + .type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR, + .generalShader = getVkShaderIndex({spec.shader, spec.entryPoint}), + .closestHitShader = VK_SHADER_UNUSED_KHR, + .anyHitShader = VK_SHADER_UNUSED_KHR, + .intersectionShader = VK_SHADER_UNUSED_KHR, + }; }; - }; - for (const auto& info : createInfos) - { + auto getHitVkRayTracingShaderGroupCreateInfo = [getVkShaderIndex](SHitShaderGroup group) -> VkRayTracingShaderGroupCreateInfoKHR + { + return { + .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR, + .pNext = nullptr, + .type = group.intersection.shader == nullptr ? + VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR : VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR, + .generalShader = VK_SHADER_UNUSED_KHR, + .closestHitShader = getVkShaderIndex(group.closestHit), + .anyHitShader = getVkShaderIndex(group.anyHit), + .intersectionShader = getVkShaderIndex(group.intersection), + }; + }; + initPipelineCreateInfo(outCreateInfo,info); outCreateInfo->pStages = outShaderStage; - for (const auto& specInfo : info.shaders) + auto processSpecInfo = [&](const IGPUPipelineBase::SShaderSpecInfo& spec, hlsl::ShaderStage shaderStage) { - *(outShaderStage++) = getVkShaderStageCreateInfoFrom(specInfo, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo,outSpecMapEntry,outSpecData); - } - outCreateInfo->stageCount = std::distancepStages)>(outCreateInfo->pStages,outShaderStage); - assert(outCreateInfo->stageCount != 0); + if (!spec.shader) return; + const auto key = ShaderModuleKey{ spec.shader, spec.entryPoint }; + if (shaderIndexes.find(key) == shaderIndexes.end()) + { + shaderIndexes.insert({ key , std::distancepStages)>(outCreateInfo->pStages, outShaderStage)}); + *(outShaderStage) = getVkShaderStageCreateInfoFrom(spec, shaderStage, false, outShaderModule, outEntryPoints, outRequiredSubgroupSize, outSpecInfo,outSpecMapEntry,outSpecData); + outShaderStage++; + } + }; const auto& shaderGroups = info.shaderGroups; outCreateInfo->pGroups = outShaderGroup; + processSpecInfo(info.shaderGroups.raygen, hlsl::ESS_RAYGEN); *(outShaderGroup++) = getGeneralVkRayTracingShaderGroupCreateInfo(shaderGroups.raygen); + for (const auto& shaderGroup : shaderGroups.misses) + { + processSpecInfo(shaderGroup, hlsl::ESS_MISS); *(outShaderGroup++) = getGeneralVkRayTracingShaderGroupCreateInfo(shaderGroup); + } + for (const auto& shaderGroup : shaderGroups.hits) + { + processSpecInfo(shaderGroup.closestHit, hlsl::ESS_CLOSEST_HIT); + processSpecInfo(shaderGroup.anyHit, hlsl::ESS_ANY_HIT); + processSpecInfo(shaderGroup.intersection, hlsl::ESS_INTERSECTION); *(outShaderGroup++) = getHitVkRayTracingShaderGroupCreateInfo(shaderGroup); + } + for (const auto& shaderGroup : shaderGroups.callables) + { + processSpecInfo(shaderGroup, hlsl::ESS_CALLABLE); *(outShaderGroup++) = getGeneralVkRayTracingShaderGroupCreateInfo(shaderGroup); + } + + outCreateInfo->stageCount = 
std::distancepStages)>(outCreateInfo->pStages,outShaderStage); + assert(outCreateInfo->stageCount != 0); outCreateInfo->groupCount = 1 + shaderGroups.hits.size() + shaderGroups.misses.size() + shaderGroups.callables.size(); outCreateInfo->maxPipelineRayRecursionDepth = info.cached.maxRecursionDepth; if (info.cached.dynamicStackSize) diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index 93d45dcc32..4cc633ec55 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -133,57 +133,53 @@ class CVulkanLogicalDevice final : public ILogicalDevice // acceleration structure modifiers inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const override { - return getAccelerationStructureBuildSizes_impl_impl_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl_impl_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const override { - return getAccelerationStructureBuildSizes_impl_impl_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl_impl_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const override { - return getAccelerationStructureBuildSizes_impl_impl_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl_impl_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( - const core::bitflag flags, const bool motionBlur, - const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts + const bool hostBuild, const core::bitflag flags, const bool motionBlur, + const std::span> geometries, const uint32_t* const pMaxPrimitiveCounts ) const override { - return getAccelerationStructureBuildSizes_impl_impl_impl(flags,motionBlur,geometries,pMaxPrimitiveCounts); + return getAccelerationStructureBuildSizes_impl_impl_impl(hostBuild,flags,motionBlur,geometries,pMaxPrimitiveCounts); } template inline AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl_impl_impl( - const core::bitflag flags, const bool motionBlur, + const bool hostBuild, const core::bitflag flags, const bool motionBlur, const std::span geometries, const uint32_t* const pMaxPrimitiveCounts ) const { - constexpr bool IsAABB = std::is_same_v>; + constexpr bool IsTriangle = Geometry::Type==asset::IBottomLevelAccelerationStructure::GeometryType::Triangles; core::vector 
vk_geometries(geometries.size()); - core::vector vk_triangleMotions(IsAABB ? 0u:geometries.size()); + core::vector vk_triangleMotions(IsTriangle ? geometries.size():0u); auto outTriangleMotions = vk_triangleMotions.data(); for (auto i=0u; i(geometries[i],vk_geometries[i]); - else + if constexpr (IsTriangle) getVkASGeometryFrom(geometries[i],vk_geometries[i],outTriangleMotions); + else + getVkASGeometryFrom(geometries[i],vk_geometries[i]); } - return getAccelerationStructureBuildSizes_impl_impl( - std::is_same_v,false, - getVkASBuildFlagsFrom(flags,motionBlur), - vk_geometries,pMaxPrimitiveCounts - ); + return getAccelerationStructureBuildSizes_impl_impl(hostBuild,false,getVkASBuildFlagsFrom(flags,motionBlur),vk_geometries,pMaxPrimitiveCounts); } AccelerationStructureBuildSizes getAccelerationStructureBuildSizes_impl( @@ -263,16 +259,16 @@ class CVulkanLogicalDevice final : public ILogicalDevice return getDeferrableResultFrom(m_devf.vk.vkBuildAccelerationStructuresKHR(m_vkdev,static_cast(deferredOperation)->getInternalObject(),infoCount,vk_buildGeomsInfos.data(),vk_ppBuildRangeInfos)); } bool writeAccelerationStructuresProperties_impl(const std::span accelerationStructures, const IQueryPool::TYPE type, size_t* data, const size_t stride) override; - DEFERRABLE_RESULT copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::CopyInfo& copyInfo) override; - DEFERRABLE_RESULT copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyToMemoryInfo& copyInfo) override; - DEFERRABLE_RESULT copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure::HostCopyFromMemoryInfo& copyInfo) override; + DEFERRABLE_RESULT copyAccelerationStructure_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, IGPUAccelerationStructure* dst, const bool compact) override; + DEFERRABLE_RESULT copyAccelerationStructureToMemory_impl(IDeferredOperation* const deferredOperation, const IGPUAccelerationStructure* src, const asset::SBufferBinding& dst) override; + DEFERRABLE_RESULT copyAccelerationStructureFromMemory_impl(IDeferredOperation* const deferredOperation, const asset::SBufferBinding& src, IGPUAccelerationStructure* dst) override; // layouts core::smart_refctd_ptr createDescriptorSetLayout_impl(const std::span bindings, const uint32_t maxSamplersCount) override; core::smart_refctd_ptr createPipelineLayout_impl( const std::span pcRanges, - core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, - core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3 + core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, + core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3 ) override; // descriptor sets @@ -289,20 +285,20 @@ class CVulkanLogicalDevice final : public ILogicalDevice IGPUPipelineCache* const pipelineCache, const std::span createInfos, core::smart_refctd_ptr* const output, - const IGPUComputePipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) override; void createGraphicsPipelines_impl( IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output, - const IGPUGraphicsPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) override; void createRayTracingPipelines_impl( 
IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output, - const IGPURayTracingPipeline::SCreationParams::SSpecializationValidationResult& validation + const SSpecializationValidationResult& validation ) override; // queries diff --git a/src/nbl/video/CVulkanPipelineLayout.h b/src/nbl/video/CVulkanPipelineLayout.h index d89d2a493c..ef46226fdb 100644 --- a/src/nbl/video/CVulkanPipelineLayout.h +++ b/src/nbl/video/CVulkanPipelineLayout.h @@ -15,8 +15,8 @@ class CVulkanPipelineLayout : public IGPUPipelineLayout public: CVulkanPipelineLayout( const ILogicalDevice* dev, const std::span _pcRanges, - core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, - core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3, + core::smart_refctd_ptr&& _layout0, core::smart_refctd_ptr&& _layout1, + core::smart_refctd_ptr&& _layout2, core::smart_refctd_ptr&& _layout3, const VkPipelineLayout vk_layout ) : IGPUPipelineLayout( core::smart_refctd_ptr(dev), diff --git a/src/nbl/video/CVulkanRayTracingPipeline.cpp b/src/nbl/video/CVulkanRayTracingPipeline.cpp index a107d3bbed..960d78428a 100644 --- a/src/nbl/video/CVulkanRayTracingPipeline.cpp +++ b/src/nbl/video/CVulkanRayTracingPipeline.cpp @@ -15,17 +15,17 @@ namespace nbl::video ShaderGroupHandleContainer&& shaderGroupHandles) : IGPURayTracingPipeline(params), m_vkPipeline(vk_pipeline), + m_shaderGroupHandles(std::move(shaderGroupHandles)), m_missStackSizes(core::make_refctd_dynamic_array(params.shaderGroups.misses.size())), m_hitGroupStackSizes(core::make_refctd_dynamic_array(params.shaderGroups.hits.size())), - m_callableStackSizes(core::make_refctd_dynamic_array(params.shaderGroups.hits.size())), - m_shaderGroupHandles(std::move(shaderGroupHandles)) + m_callableStackSizes(core::make_refctd_dynamic_array(params.shaderGroups.hits.size())) { const auto* vulkanDevice = static_cast(getOriginDevice()); auto* vk = vulkanDevice->getFunctionTable(); - auto getVkShaderGroupStackSize = [&](uint32_t baseGroupIx, uint32_t shaderGroupIx, uint32_t shaderIx, VkShaderGroupShaderKHR shaderType) -> uint16_t + auto getVkShaderGroupStackSize = [&](uint32_t baseGroupIx, uint32_t shaderGroupIx, const asset::IShader* shader, VkShaderGroupShaderKHR shaderType) -> uint16_t { - if (shaderIx == SShaderGroupsParams::SIndex::Unused) + if (shader == nullptr) return 0; return vk->vk.vkGetRayTracingShaderGroupStackSizeKHR( @@ -36,14 +36,17 @@ namespace nbl::video ); }; - m_raygenStackSize = getVkShaderGroupStackSize(getRaygenIndex(), 0, params.shaderGroups.raygen.index, VK_SHADER_GROUP_SHADER_GENERAL_KHR); + m_callableGroupCount = params.shaderGroups.callables.size(); + m_missGroupCount = params.shaderGroups.misses.size(); + m_hitGroupCount = params.shaderGroups.hits.size(); + m_raygenStackSize = getVkShaderGroupStackSize(getRaygenIndex(), 0, params.shaderGroups.raygen.shader, VK_SHADER_GROUP_SHADER_GENERAL_KHR); for (size_t shaderGroupIx = 0; shaderGroupIx < params.shaderGroups.misses.size(); shaderGroupIx++) { m_missStackSizes->operator[](shaderGroupIx) = getVkShaderGroupStackSize( getMissBaseIndex(), shaderGroupIx, - params.shaderGroups.misses[shaderGroupIx].index, + params.shaderGroups.misses[shaderGroupIx].shader, VK_SHADER_GROUP_SHADER_GENERAL_KHR); } @@ -52,9 +55,9 @@ namespace nbl::video const auto& hitGroup = params.shaderGroups.hits[shaderGroupIx]; const auto baseIndex = getHitBaseIndex(); m_hitGroupStackSizes->operator[](shaderGroupIx) = SHitGroupStackSize{ - .closestHit = 
getVkShaderGroupStackSize(baseIndex,shaderGroupIx, hitGroup.closestHit, VK_SHADER_GROUP_SHADER_CLOSEST_HIT_KHR), - .anyHit = getVkShaderGroupStackSize(baseIndex, shaderGroupIx, hitGroup.anyHit,VK_SHADER_GROUP_SHADER_ANY_HIT_KHR), - .intersection = getVkShaderGroupStackSize(baseIndex, shaderGroupIx, hitGroup.intersection, VK_SHADER_GROUP_SHADER_INTERSECTION_KHR), + .closestHit = getVkShaderGroupStackSize(baseIndex,shaderGroupIx, hitGroup.closestHit.shader, VK_SHADER_GROUP_SHADER_CLOSEST_HIT_KHR), + .anyHit = getVkShaderGroupStackSize(baseIndex, shaderGroupIx, hitGroup.anyHit.shader,VK_SHADER_GROUP_SHADER_ANY_HIT_KHR), + .intersection = getVkShaderGroupStackSize(baseIndex, shaderGroupIx, hitGroup.intersection.shader, VK_SHADER_GROUP_SHADER_INTERSECTION_KHR), }; } @@ -63,7 +66,7 @@ namespace nbl::video m_callableStackSizes->operator[](shaderGroupIx) = getVkShaderGroupStackSize( getCallableBaseIndex(), shaderGroupIx, - params.shaderGroups.callables[shaderGroupIx].index, + params.shaderGroups.callables[shaderGroupIx].shader, VK_SHADER_GROUP_SHADER_GENERAL_KHR); } } @@ -83,19 +86,19 @@ namespace nbl::video std::span CVulkanRayTracingPipeline::getMissHandles() const { const auto baseIndex = getMissBaseIndex(); - return std::span(m_shaderGroupHandles->begin() + baseIndex, m_missShaderGroups->size()); + return std::span(m_shaderGroupHandles->begin() + baseIndex, m_missGroupCount); } std::span CVulkanRayTracingPipeline::getHitHandles() const { const auto baseIndex = getHitBaseIndex(); - return std::span(m_shaderGroupHandles->begin() + baseIndex, m_hitShaderGroups->size()); + return std::span(m_shaderGroupHandles->begin() + baseIndex, m_hitGroupCount); } std::span CVulkanRayTracingPipeline::getCallableHandles() const { const auto baseIndex = getCallableBaseIndex(); - return std::span(m_shaderGroupHandles->begin() + baseIndex, m_callableShaderGroups->size()); + return std::span(m_shaderGroupHandles->begin() + baseIndex, m_callableGroupCount); } uint16_t CVulkanRayTracingPipeline::getRaygenStackSize() const @@ -159,13 +162,13 @@ namespace nbl::video uint32_t CVulkanRayTracingPipeline::getHitBaseIndex() const { // one raygen group + miss groups before this groups - return 1 + m_missShaderGroups->size(); + return 1 + m_missGroupCount; } uint32_t CVulkanRayTracingPipeline::getCallableBaseIndex() const { // one raygen group + miss groups + hit groups before this groups - return 1 + m_missShaderGroups->size() + m_hitShaderGroups->size(); + return 1 + m_missGroupCount + m_hitGroupCount; } } diff --git a/src/nbl/video/IGPUAccelerationStructure.cpp b/src/nbl/video/IGPUAccelerationStructure.cpp index b975742436..828ba309b8 100644 --- a/src/nbl/video/IGPUAccelerationStructure.cpp +++ b/src/nbl/video/IGPUAccelerationStructure.cpp @@ -5,7 +5,7 @@ namespace nbl::video { -template +template requires (!std::is_const_v && std::is_base_of_v) bool IGPUAccelerationStructure::BuildInfo::invalid(const IGPUAccelerationStructure* const src, const IGPUAccelerationStructure* const dst) const { // https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#VUID-vkBuildAccelerationStructuresIndirectKHR-dstAccelerationStructure-03800 @@ -61,7 +61,7 @@ bool IGPUAccelerationStructure::BuildInfo::invalid(const IGPUAcceler //extern template class IGPUAccelerationStructure::BuildInfo; -template +template requires (!std::is_const_v && std::is_base_of_v) template// requires nbl::is_any_of_v,uint32_t,IGPUBottomLevelAccelerationStructure::BuildRangeInfo>,IGPUBottomLevelAccelerationStructure::BuildRangeInfo> uint32_t 
IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const T* const buildRangeInfosOrMaxPrimitiveCounts) const { @@ -139,11 +139,11 @@ uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(cons retval += geometryCount*MaxBuffersPerGeometry; return retval; } -template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::template valid(const uint32_t* const) const; -template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::template valid(const uint32_t* const) const; +template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const uint32_t* const) const; +template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const uint32_t* const) const; using BuildRangeInfo = hlsl::acceleration_structures::bottom_level::BuildRangeInfo; -template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::template valid(const BuildRangeInfo* const) const; -template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::template valid(const BuildRangeInfo* const) const; +template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const BuildRangeInfo* const) const; +template uint32_t IGPUBottomLevelAccelerationStructure::BuildInfo::valid(const BuildRangeInfo* const) const; bool IGPUBottomLevelAccelerationStructure::validVertexFormat(const asset::E_FORMAT format) const { diff --git a/src/nbl/video/IGPUCommandBuffer.cpp b/src/nbl/video/IGPUCommandBuffer.cpp index 3e776782fc..1f619666ab 100644 --- a/src/nbl/video/IGPUCommandBuffer.cpp +++ b/src/nbl/video/IGPUCommandBuffer.cpp @@ -235,8 +235,8 @@ bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo #endif // _NBL_DEBUG return false; } -template bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo&) const; -template bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo&) const; +template NBL_API2 bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo&) const; +template NBL_API2 bool IGPUCommandBuffer::invalidDependency(const SDependencyInfo&) const; bool IGPUCommandBuffer::setEvent(IEvent* _event, const SEventDependencyInfo& depInfo) { @@ -842,30 +842,27 @@ uint32_t IGPUCommandBuffer::buildAccelerationStructures_common(const std::span) { const auto blasCount = info.trackedBLASes.size(); - if (blasCount) - m_TLASToBLASReferenceSets[info.dstAS] = {reinterpret_cast(oit-blasCount),blasCount}; - else - m_TLASToBLASReferenceSets[info.dstAS] = {}; + m_TLASTrackingOps.emplace_back(TLASTrackingWrite{.src={oit-blasCount,blasCount},.dst=info.dstAS}); } } return totalGeometries; } -template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( +template NBL_API2 uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUBottomLevelAccelerationStructure::DirectBuildRangeRangeInfos, const IGPUBuffer* const ); -template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( +template NBL_API2 uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUBottomLevelAccelerationStructure::MaxInputCounts* const, const IGPUBuffer* const ); -template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( +template NBL_API2 uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, IGPUTopLevelAccelerationStructure::DirectBuildRangeRangeInfos, const IGPUBuffer* const ); -template uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( +template NBL_API2 uint32_t IGPUCommandBuffer::buildAccelerationStructures_common( const std::span, 
IGPUTopLevelAccelerationStructure::MaxInputCounts* const, const IGPUBuffer* const ); - -bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUAccelerationStructure::CopyInfo& copyInfo) +template requires std::is_base_of_v +bool IGPUCommandBuffer::copyAccelerationStructure(const AccelerationStructure::CopyInfo& copyInfo) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::TRANSFER_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -888,10 +885,16 @@ bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUAccelerationStructur } m_noCommands = false; - return copyAccelerationStructure_impl(copyInfo); + const bool retval = copyAccelerationStructure_impl(copyInfo.src,copyInfo.dst,copyInfo.compact); + if constexpr (std::is_same_v) + m_TLASTrackingOps.emplace_back(TLASTrackingCopy{.src=copyInfo.src,.dst=copyInfo.dst}); + return retval; } +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUBottomLevelAccelerationStructure::CopyInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructure(const IGPUTopLevelAccelerationStructure::CopyInfo&); -bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUAccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) +template requires std::is_base_of_v +bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const AccelerationStructure::DeviceCopyToMemoryInfo& copyInfo) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::TRANSFER_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -911,10 +914,16 @@ bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUAcceleration } m_noCommands = false; - return copyAccelerationStructureToMemory_impl(copyInfo); + const bool retval = copyAccelerationStructureToMemory_impl(copyInfo.src,copyInfo.dst); + if constexpr (std::is_same_v) + m_TLASTrackingOps.emplace_back(TLASTrackingRead{.src=copyInfo.src,.dst=copyInfo.trackedBLASes}); + return retval; } +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUBottomLevelAccelerationStructure::DeviceCopyToMemoryInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureToMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyToMemoryInfo&); -bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUAccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) +template requires std::is_base_of_v +bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const AccelerationStructure::DeviceCopyFromMemoryInfo& copyInfo) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::TRANSFER_BIT,RENDERPASS_SCOPE::OUTSIDE)) return false; @@ -934,8 +943,24 @@ bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUAccelerati } m_noCommands = false; - return copyAccelerationStructureFromMemory_impl(copyInfo); + const bool retval = copyAccelerationStructureFromMemory_impl(copyInfo.src,copyInfo.dst); + if constexpr (std::is_same_v) + { + const auto size = copyInfo.trackedBLASes.size(); + auto oit = reserveReferences(size); + if (oit) + { + m_TLASTrackingOps.emplace_back(TLASTrackingWrite{.src={oit,size},.dst=copyInfo.dst}); + for (const auto& blas : copyInfo.trackedBLASes) + *(oit++) = core::smart_refctd_ptr(blas); + } + else + NBL_LOG_ERROR("out of host memory for BLAS tracking references, TLAS will be copied from memory without BLAS tracking data!"); + } + return retval; } +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const 
IGPUBottomLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); +template NBL_API2 bool IGPUCommandBuffer::copyAccelerationStructureFromMemory(const IGPUTopLevelAccelerationStructure::DeviceCopyFromMemoryInfo&); bool IGPUCommandBuffer::bindComputePipeline(const IGPUComputePipeline* const pipeline) @@ -1661,8 +1686,8 @@ bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBinding(const asset::SBufferBinding&, const uint32_t, uint32_t); -template bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBinding&, const uint32_t, uint32_t); +template NBL_API2 bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBinding&, const uint32_t, uint32_t); +template NBL_API2 bool IGPUCommandBuffer::invalidDrawIndirect(const asset::SBufferBinding&, const uint32_t, uint32_t); template requires nbl::is_any_of_v bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding& indirectBinding, const asset::SBufferBinding& countBinding, const uint32_t maxDrawCount, const uint32_t stride) @@ -1680,8 +1705,8 @@ bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding(const asset::SBufferBinding&, const asset::SBufferBinding&, const uint32_t, const uint32_t); -template bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding&, const asset::SBufferBinding&, const uint32_t, const uint32_t); +template NBL_API2 bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding&, const asset::SBufferBinding&, const uint32_t, const uint32_t); +template NBL_API2 bool IGPUCommandBuffer::invalidDrawIndirectCount(const asset::SBufferBinding&, const asset::SBufferBinding&, const uint32_t, const uint32_t); bool IGPUCommandBuffer::drawIndirect(const asset::SBufferBinding& binding, const uint32_t drawCount, const uint32_t stride) { @@ -2078,22 +2103,18 @@ bool IGPUCommandBuffer::executeCommands(const uint32_t count, IGPUCommandBuffer* return executeCommands_impl(count,cmdbufs); } -bool IGPUCommandBuffer::recordReferences(const std::span refs) +core::smart_refctd_ptr* IGPUCommandBuffer::reserveReferences(const uint32_t size) { if (!checkStateBeforeRecording(queue_flags_t::COMPUTE_BIT|queue_flags_t::GRAPHICS_BIT|queue_flags_t::TRANSFER_BIT|queue_flags_t::SPARSE_BINDING_BIT)) - return false; + return nullptr; - auto cmd = m_cmdpool->m_commandListPool.emplace(m_commandList,refs.size()); + auto cmd = m_cmdpool->m_commandListPool.emplace(m_commandList,size); if (!cmd) { NBL_LOG_ERROR("out of host memory!"); - return false; + return nullptr; } - auto oit = cmd->getVariableCountResources(); - for (const auto& ref : refs) - *(oit++) = core::smart_refctd_ptr(ref); - - return true; + return cmd->getVariableCountResources(); } } \ No newline at end of file diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 26cfc4c6a8..983daed190 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -7,50 +7,61 @@ using namespace nbl; using namespace nbl::video; -static void debloatShaders(const asset::ISPIRVDebloater& debloater, std::span shaderSpecs, core::vector>& outShaders, asset::IPipelineBase::SShaderSpecInfo* outShaderSpecInfos, system::logger_opt_ptr logger = nullptr) +class SpirvTrimTask { - using EntryPoints = core::set; - core::map entryPointsMap; - - // collect all entry points first before we debloat - for (const auto& shaderSpec : shaderSpecs) { - const auto* shader = shaderSpec.shader; - auto it = entryPointsMap.find(shader); - if (it == entryPointsMap.end() || it->first != 
shader) - it = entryPointsMap.emplace_hint(it, shader, EntryPoints()); - it->second.insert({ .name = shaderSpec.entryPoint, .stage = shaderSpec.stage }); - } + public: + using EntryPoints = core::set; + struct ShaderInfo + { + EntryPoints entryPoints; + const asset::IShader* trimmedShader; + }; - core::map debloatedShaders; - for (const auto& shaderSpec: shaderSpecs) - { - const auto* shader = shaderSpec.shader; - const auto& entryPoints = entryPointsMap[shader]; + SpirvTrimTask(asset::ISPIRVEntryPointTrimmer* trimer, system::logger_opt_ptr logger) : m_trimmer(trimer), m_logger(logger) + { + + } + + void insertEntryPoint(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, const hlsl::ShaderStage stage) + { + const auto* shader = shaderSpec.shader; + auto it = m_shaderInfoMap.find(shader); + if (it == m_shaderInfoMap.end() || it->first != shader) + it = m_shaderInfoMap.emplace_hint(it, shader, ShaderInfo{ EntryPoints(), nullptr } ); + it->second.entryPoints.insert({ .name = shaderSpec.entryPoint, .stage = stage }); + } - auto debloatedShaderSpec = shaderSpec; - if (shader != nullptr) + IGPUPipelineBase::SShaderSpecInfo trim(const IGPUPipelineBase::SShaderSpecInfo& shaderSpec, core::vector>& outShaders) { - if (!debloatedShaders.contains(shader)) + const auto* shader = shaderSpec.shader; + auto findResult = m_shaderInfoMap.find(shader); + assert(findResult != m_shaderInfoMap.end()); + const auto& entryPoints = findResult->second.entryPoints; + auto& trimmedShader = findResult->second.trimmedShader; + + auto trimmedShaderSpec = shaderSpec; + if (shader != nullptr) { - const auto outShadersData = outShaders.data(); - outShaders.push_back(debloater.debloat(shader, entryPoints, logger)); - assert(outShadersData == outShaders.data()); - debloatedShaders.emplace(shader, outShaders.back().get()); + if (trimmedShader == nullptr) + { + outShaders.push_back(m_trimmer->trim(shader, entryPoints, m_logger)); + trimmedShader = outShaders.back().get(); + } + trimmedShaderSpec.shader = trimmedShader; } - const auto debloatedShader = debloatedShaders[shader]; - debloatedShaderSpec.shader = debloatedShader; + return trimmedShaderSpec; } - *outShaderSpecInfos = debloatedShaderSpec; - - outShaderSpecInfos++; - } - -} + + private: + core::map m_shaderInfoMap; + asset::ISPIRVEntryPointTrimmer* m_trimmer; + const system::logger_opt_ptr m_logger; +}; ILogicalDevice::ILogicalDevice(core::smart_refctd_ptr&& api, const IPhysicalDevice* const physicalDevice, const SCreationParams& params, const bool runningInRenderdoc) : m_api(api), m_physicalDevice(physicalDevice), m_enabledFeatures(params.featuresToEnable), m_compilerSet(params.compilerSet), m_logger(m_physicalDevice->getDebugCallback() ? 
m_physicalDevice->getDebugCallback()->getLogger() : nullptr), - m_spirvDebloater(core::make_smart_refctd_ptr()) + m_spirvTrimmer(core::make_smart_refctd_ptr()) { { uint32_t qcnt = 0u; @@ -781,16 +792,8 @@ asset::ICPUPipelineCache::SCacheKey ILogicalDevice::getPipelineCacheKey() const bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCache, const std::span params, core::smart_refctd_ptr* const output) { std::fill_n(output,params.size(),nullptr); - IGPUComputePipeline::SCreationParams::SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params,[this](const asset::IPipelineBase::SShaderSpecInfo& info)->bool - { - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-pNext-02755 - if (info.requiredSubgroupSize>=asset::IPipelineBase::SShaderSpecInfo::SUBGROUP_SIZE::REQUIRE_4 && !getPhysicalDeviceLimits().requiredSubgroupSizeStages.hasFlags(info.stage)) - { - NBL_LOG_ERROR("Invalid shader stage"); - return false; - } - return true; - }); + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache, params); + if (!specConstantValidation) { NBL_LOG_ERROR("Invalid parameters were given"); @@ -798,17 +801,20 @@ bool ILogicalDevice::createComputePipelines(IGPUPipelineCache* const pipelineCac } core::vector newParams(params.begin(), params.end()); - const auto shaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) - { - return sum + param.getShaders().size(); - }); - core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling - debloatedShaders.reserve(shaderCount); + const auto shaderCount = params.size(); + + core::vector> trimmedShaders; // vector to hold all the trimmed shaders, so the pointer from the new ShaderSpecInfo is not dangling + trimmedShaders.reserve(shaderCount); for (auto ix = 0u; ix < params.size(); ix++) { const auto& ci = params[ix]; - debloatShaders(*m_spirvDebloater.get(), ci.getShaders(), debloatedShaders, &newParams[ix].shader, m_logger); + + const core::set entryPoints = { asset::ISPIRVEntryPointTrimmer::EntryPoint{.name = ci.shader.entryPoint, .stage = hlsl::ShaderStage::ESS_COMPUTE} }; + trimmedShaders.push_back(m_spirvTrimmer->trim(ci.shader.shader, entryPoints, m_logger)); + auto trimmedShaderSpec = ci.shader; + trimmedShaderSpec.shader = trimmedShaders.back().get(); + newParams[ix].shader = trimmedShaderSpec; } createComputePipelines_impl(pipelineCache,newParams,output,specConstantValidation); @@ -834,14 +840,7 @@ bool ILogicalDevice::createGraphicsPipelines( ) { std::fill_n(output, params.size(), nullptr); - IGPUGraphicsPipeline::SCreationParams::SSpecializationValidationResult specConstantValidation = commonCreatePipelines(nullptr, params, - [this](const asset::IPipelineBase::SShaderSpecInfo& info)->bool - { - if (info.stage != hlsl::ShaderStage::ESS_VERTEX) - return true; - return info.shader; - } - ); + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(nullptr, params); if (!specConstantValidation) { NBL_LOG_ERROR("Invalid parameters were given"); @@ -853,17 +852,35 @@ bool ILogicalDevice::createGraphicsPipelines( core::vector newParams(params.begin(), params.end()); const auto shaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) { - return sum + param.getShaders().size(); + return sum + 
param.getShaderCount(); }); - core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling - debloatedShaders.reserve(shaderCount); - - core::vector debloatedShaderSpecs(shaderCount); - auto outShaderSpecs = debloatedShaderSpecs.data(); + core::vector> trimmedShaders; // vector to hold all the trimmed shaders, so the pointer from the new ShaderSpecInfo is not dangling + trimmedShaders.reserve(shaderCount); for (auto ix = 0u; ix < params.size(); ix++) { const auto& ci = params[ix]; + + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-00704 + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkPipelineShaderStageCreateInfo.html#VUID-VkPipelineShaderStageCreateInfo-stage-00705 + if (ci.tesselationControlShader.shader) + { + NBL_LOG_ERROR("Cannot create IGPUShader for %p, Tessellation Shader feature not enabled!", ci.tesselationControlShader.shader); + return false; + } + + if (ci.tesselationEvaluationShader.shader) + { + NBL_LOG_ERROR("Cannot create IGPUShader for %p, Tessellation Shader feature not enabled!", ci.tesselationEvaluationShader.shader); + return false; + } + + if (ci.geometryShader.shader) + { + NBL_LOG_ERROR("Cannot create IGPUShader for %p, Geometry Shader feature not enabled!", ci.geometryShader.shader); + return false; + } + auto renderpass = ci.renderpass; if (!renderpass->wasCreatedBy(this)) { @@ -953,9 +970,19 @@ bool ILogicalDevice::createGraphicsPipelines( } } } + + SpirvTrimTask trimTask(m_spirvTrimmer.get(), m_logger); + trimTask.insertEntryPoint(ci.vertexShader, hlsl::ShaderStage::ESS_VERTEX); + trimTask.insertEntryPoint(ci.tesselationControlShader, hlsl::ShaderStage::ESS_TESSELLATION_CONTROL); + trimTask.insertEntryPoint(ci.tesselationEvaluationShader, hlsl::ShaderStage::ESS_TESSELLATION_EVALUATION); + trimTask.insertEntryPoint(ci.geometryShader, hlsl::ShaderStage::ESS_GEOMETRY); + trimTask.insertEntryPoint(ci.fragmentShader, hlsl::ShaderStage::ESS_FRAGMENT); - newParams[ix].shaders = std::span(outShaderSpecs, ci.getShaders().size()); - debloatShaders(*m_spirvDebloater.get(), ci.getShaders(), debloatedShaders, outShaderSpecs, m_logger); + newParams[ix].vertexShader = trimTask.trim(ci.vertexShader, trimmedShaders); + newParams[ix].tesselationControlShader = trimTask.trim(ci.tesselationControlShader, trimmedShaders); + newParams[ix].tesselationEvaluationShader = trimTask.trim(ci.tesselationEvaluationShader, trimmedShaders); + newParams[ix].geometryShader = trimTask.trim(ci.geometryShader, trimmedShaders); + newParams[ix].fragmentShader = trimTask.trim(ci.fragmentShader, trimmedShaders); } createGraphicsPipelines_impl(pipelineCache, newParams, output, specConstantValidation); @@ -980,10 +1007,7 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline core::smart_refctd_ptr* const output) { std::fill_n(output,params.size(),nullptr); - IGPURayTracingPipeline::SCreationParams::SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params,[this](const asset::IPipelineBase::SShaderSpecInfo& info)->bool - { - return true; - }); + SSpecializationValidationResult specConstantValidation = commonCreatePipelines(pipelineCache,params); if (!specConstantValidation) { NBL_LOG_ERROR("Invalid parameters were given"); @@ -1004,6 +1028,12 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline const bool 
skipAABBs = bool(param.flags & IGPURayTracingPipeline::SCreationParams::FLAGS::SKIP_AABBS); const bool skipBuiltin = bool(param.flags & IGPURayTracingPipeline::SCreationParams::FLAGS::SKIP_BUILT_IN_PRIMITIVES); + if (!features.rayTracingPipeline) + { + NBL_LOG_ERROR("Raytracing Pipeline feature not enabled!"); + return {}; + } + // https://docs.vulkan.org/spec/latest/chapters/pipelines.html#VUID-VkRayTracingPipelineCreateInfoKHR-rayTraversalPrimitiveCulling-03597 if (skipAABBs && skipBuiltin) { @@ -1028,15 +1058,28 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline } core::vector newParams(params.begin(), params.end()); - const auto shaderCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + core::vector> trimmedShaders; // vector to hold all the trimmed shaders, so the pointer from the new ShaderSpecInfo is not dangling + + const auto missGroupCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + { + return sum + param.shaderGroups.misses.size(); + }); + const auto hitGroupCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) + { + return sum + param.shaderGroups.hits.size(); + }); + const auto callableGroupCount = std::accumulate(params.begin(), params.end(), 0, [](uint32_t sum, auto& param) { - return sum + param.getShaders().size(); + return sum + param.shaderGroups.callables.size(); }); - core::vector> debloatedShaders; // vector to hold all the debloated shaders, so the pointer from the new ShaderSpecInfo is not dangling - debloatedShaders.reserve(shaderCount); - core::vector debloatedShaderSpecs(shaderCount); - auto outShaderSpecs = debloatedShaderSpecs.data(); + + core::vector trimmedMissSpecs(missGroupCount); + auto trimmedMissSpecData = trimmedMissSpecs.data(); + core::vector trimmedHitSpecs(hitGroupCount); + auto trimmedHitSpecData = trimmedHitSpecs.data(); + core::vector trimmedCallableSpecs(callableGroupCount); + auto trimmedCallableSpecData = trimmedCallableSpecs.data(); const auto& limits = getPhysicalDeviceLimits(); for (auto ix = 0u; ix < params.size(); ix++) @@ -1050,14 +1093,47 @@ bool ILogicalDevice::createRayTracingPipelines(IGPUPipelineCache* const pipeline NBL_LOG_ERROR("Invalid maxRecursionDepth. 
maxRecursionDepth(%u) exceed the limits(%u)", param.cached.maxRecursionDepth, limits.maxRayRecursionDepth); return false; } - if (param.getShaders().empty()) + + SpirvTrimTask trimTask(m_spirvTrimmer.get(), m_logger); + trimTask.insertEntryPoint(param.shaderGroups.raygen, hlsl::ShaderStage::ESS_RAYGEN); + for (const auto& miss : param.shaderGroups.misses) + trimTask.insertEntryPoint(miss, hlsl::ShaderStage::ESS_MISS); + for (const auto& hit : param.shaderGroups.hits) { - NBL_LOG_ERROR("Pipeline must have at least one shader."); - return false; + trimTask.insertEntryPoint(hit.closestHit, hlsl::ShaderStage::ESS_CLOSEST_HIT); + trimTask.insertEntryPoint(hit.anyHit, hlsl::ShaderStage::ESS_ANY_HIT); + trimTask.insertEntryPoint(hit.intersection, hlsl::ShaderStage::ESS_INTERSECTION); } + for (const auto& callable : param.shaderGroups.callables) + trimTask.insertEntryPoint(callable, hlsl::ShaderStage::ESS_CALLABLE); + + newParams[ix] = param; + newParams[ix].shaderGroups.raygen = trimTask.trim(param.shaderGroups.raygen, trimmedShaders); - newParams[ix].shaders = std::span(outShaderSpecs, param.getShaders().size()); - debloatShaders(*m_spirvDebloater.get(), param.getShaders(), debloatedShaders, outShaderSpecs, m_logger); + newParams[ix].shaderGroups.misses = trimmedMissSpecs; + for (const auto& miss: param.shaderGroups.misses) + { + *trimmedMissSpecData = trimTask.trim(miss, trimmedShaders); + trimmedMissSpecData++; + } + + newParams[ix].shaderGroups.hits = trimmedHitSpecs; + for (const auto& hit: param.shaderGroups.hits) + { + *trimmedHitSpecData = { + .closestHit = trimTask.trim(hit.closestHit, trimmedShaders), + .anyHit = trimTask.trim(hit.anyHit, trimmedShaders), + .intersection = trimTask.trim(hit.intersection, trimmedShaders), + }; + trimmedHitSpecData++; + } + + newParams[ix].shaderGroups.callables = trimmedCallableSpecs; + for (const auto& callable: param.shaderGroups.callables) + { + *trimmedCallableSpecData = trimTask.trim(callable, trimmedShaders); + trimmedCallableSpecData++; + } } createRayTracingPipelines_impl(pipelineCache, newParams,output,specConstantValidation); diff --git a/src/nbl/video/IQueue.cpp b/src/nbl/video/IQueue.cpp index e761b7a733..108f76183c 100644 --- a/src/nbl/video/IQueue.cpp +++ b/src/nbl/video/IQueue.cpp @@ -149,15 +149,66 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) auto outRes = m_resources->data(); for (const auto& sema : info.waitSemaphores) *(outRes++) = smart_ptr(sema.semaphore); + // track our own versions + core::unordered_map m_readTLASVersions; + // get the TLAS BLAS tracking info and assign a pending build version number + for (const auto& cb : info.commandBuffers) + for (const auto& var : cb.cmdbuf->m_TLASTrackingOps) + { + const IGPUTopLevelAccelerationStructure* src = nullptr; + switch (var.index()) + { + case 1: + src = std::get<1>(var).src; + break; + case 2: + src = std::get<2>(var).src; + break; + } + if (src) + m_readTLASVersions.insert({src,src->getPendingBuildVer()}); + } for (const auto& cb : info.commandBuffers) { *(outRes++) = smart_ptr(cb.cmdbuf); - // get the TLAS BLAS tracking info and assign a pending build version number - for (const auto& refSet : cb.cmdbuf->m_TLASToBLASReferenceSets) + for (const auto& var : cb.cmdbuf->m_TLASTrackingOps) + switch (var.index()) { - const auto tlas = refSet.first; - // in theory could assert no duplicate entries, but thats obvious - m_TLASToBLASReferenceSets[tlas] = { .m_BLASes = {refSet.second.begin(),refSet.second.end()}, .m_buildVer = 
tlas->registerNextBuildVer()}; + case 0: + { + const IGPUCommandBuffer::TLASTrackingWrite& op = std::get<0>(var); + + using iterator = decltype(op.src)::iterator; + m_readTLASVersions[op.dst] = m_TLASOverwrites[op.dst] = op.dst->pushTrackedBLASes({op.src.begin()},{op.src.end()}); + break; + } + case 1: + { + const IGPUCommandBuffer::TLASTrackingCopy& op = std::get<1>(var); + // not sure if even legal, but it would deadlock us + if (op.src==op.dst) + break; + const auto ver = m_readTLASVersions.find(op.src)->second; + // stop multiple threads messing with us + std::lock_guard lk(op.src->m_trackingLock); + const auto* pSrcBLASes = op.src->getPendingBuildTrackedBLASes(ver); + const std::span emptySpan = {}; + m_readTLASVersions[op.dst] = m_TLASOverwrites[op.dst] = pSrcBLASes ? op.dst->pushTrackedBLASes(pSrcBLASes->begin(),pSrcBLASes->end()):op.dst->pushTrackedBLASes(emptySpan.begin(),emptySpan.end()); + break; + } + case 2: + { + const IGPUCommandBuffer::TLASTrackingRead& op = std::get<2>(var); + const auto ver = m_readTLASVersions.find(op.src)->second; + uint32_t count = op.dst->size(); + op.src->getPendingBuildTrackedBLASes(&count,op.dst->data(),ver); + if (count>op.dst->size()) + cb.cmdbuf->getOriginDevice()->getLogger()->log("BLAS output array too small, should be %d, only wrote out %d BLAS references to destination",system::ILogger::ELL_ERROR,count,op.dst->size()); + break; + } + default: + assert(false); + break; } } // We don't hold the last signal semaphore, because the timeline does as an Event trigger. @@ -170,10 +221,10 @@ IQueue::DeferredSubmitCallback::DeferredSubmitCallback(const SSubmitInfo& info) IQueue::DeferredSubmitCallback& IQueue::DeferredSubmitCallback::operator=(DeferredSubmitCallback&& other) { - m_TLASToBLASReferenceSets = std::move(other.m_TLASToBLASReferenceSets); + m_TLASOverwrites = std::move(other.m_TLASOverwrites); m_resources = std::move(other.m_resources); m_callback = std::move(other.m_callback); - other.m_TLASToBLASReferenceSets = {}; + other.m_TLASOverwrites.clear(); other.m_resources = nullptr; other.m_callback = {}; return *this; @@ -182,13 +233,9 @@ IQueue::DeferredSubmitCallback& IQueue::DeferredSubmitCallback::operator=(Deferr // always exhaustive poll, because we need to get rid of resources ASAP void IQueue::DeferredSubmitCallback::operator()() { - // first update tracking info (needs resources alive) - for (const auto& refSet : m_TLASToBLASReferenceSets) - { - const auto tlas = refSet.first; - const auto& blases = refSet.second.m_BLASes; - tlas->setTrackedBLASes(blases.begin(),blases.end(),refSet.second.m_buildVer); - } + // all builds started before ours will now get overwritten (not exactly true, but without a better tracking system, this is the best we can do for now) + for (const auto& build : m_TLASOverwrites) + build.first->clearTrackedBLASes(build.second); // then free all resources m_resources = nullptr; // then execute the callback diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index d1615a4637..ad54409da4 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -410,7 +410,7 @@ class AssetVisitor : public CRTP } private: - // there is no `impl()` overload taking `ICPUTopLevelAccelerationStructure` same as there is no `ICPUmage` + // there is no `impl()` overload taking `ICPUBottomLevelAccelerationStructure` same as there is no `ICPUmage` inline bool impl(const instance_t& instance, const CAssetConverter::patch_t& userPatch) { const 
auto blasInstances = instance.asset->getInstances(); @@ -519,8 +519,8 @@ class AssetVisitor : public CRTP if (!layout || !descend(layout,{layout})) return false; const auto& specInfo = asset->getSpecInfo(); - const auto* shader = specInfo.shader; - if (!shader || !descend(shader,{shader},specInfo)) + const auto* shader = specInfo.shader.get(); + if (!shader || !descend(shader,{shader},specInfo, hlsl::ESS_COMPUTE)) return false; return true; } @@ -536,8 +536,8 @@ class AssetVisitor : public CRTP using stage_t = hlsl::ShaderStage; for (stage_t stage : {stage_t::ESS_VERTEX,stage_t::ESS_TESSELLATION_CONTROL,stage_t::ESS_TESSELLATION_EVALUATION,stage_t::ESS_GEOMETRY,stage_t::ESS_FRAGMENT}) { - const auto& specInfo = asset->getSpecInfo(stage); - const auto* shader = specInfo.shader; + const auto& specInfo = *asset->getSpecInfo(stage); + const auto* shader = specInfo.shader.get(); if (!shader) { if (stage==stage_t::ESS_VERTEX) // required @@ -545,7 +545,7 @@ class AssetVisitor : public CRTP CRTP::template nullOptional(); continue; } - if (!descend(shader,{shader},specInfo)) + if (!descend(shader,{shader}, specInfo, stage)) return false; } return true; @@ -570,8 +570,9 @@ class AssetVisitor : public CRTP const IDescriptorSetLayoutBase::CBindingRedirect::storage_range_index_t storageRangeIx(j); const auto binding = redirect.getBinding(storageRangeIx); const uint32_t count = redirect.getCount(storageRangeIx); - // this is where the descriptors have their flattened place in a unified array - const auto* infos = allInfos.data()+redirect.getStorageOffset(storageRangeIx).data; + // this is where the descriptors have their flattened place in a unified array + const auto storageBaseOffset = redirect.getStorageOffset(storageRangeIx); + const auto* infos = allInfos.data()+storageBaseOffset.data; for (uint32_t el=0u; el(untypedDesc); - if (!descend(tlas,{tlas},type,binding,el)) + if (!descend(tlas,{tlas},type,binding,el,storageBaseOffset)) return false; break; } @@ -1035,25 +1036,19 @@ class HashVisit : public CAssetConverter::CHashCache::hash_impl_base auto argTuple = std::tuple(extraArgs...); const auto& arg0 = std::get<0>(argTuple); // hash the spec info - if constexpr (std::is_same_v) + if constexpr (std::is_same_v) { + const auto stage = std::get<1>(argTuple); hasher << arg0.entryPoint; - hasher << arg0.stage; + assert(hlsl::bitCount(stage) == 1); + hasher << stage; hasher << arg0.requiredSubgroupSize; - switch (arg0.stage) + if (!arg0.entries.empty()) { - case hlsl::ShaderStage::ESS_COMPUTE: - hasher << arg0.requireFullSubgroups; - break; - default: - break; - } - if (arg0.entries) - { - for (const auto& specConstant : *arg0.entries) + for (const auto& specConstant : arg0.entries) { hasher << specConstant.first; - hasher.update(specConstant.second.data, specConstant.second.size); + hasher.update(specConstant.second.data(), specConstant.second.size()); } } } @@ -1108,6 +1103,8 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_thostBuild; hasher << lookup.patch->compactAfterBuild; // finally the contents + if (lookup.asset->getContentHash()==NoContentHash) + return false; hasher << lookup.asset->getContentHash(); return true; } @@ -1120,6 +1117,7 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_thostBuild; hasher << lookup.patch->compactAfterBuild; + hasher << (lookup.patch->isMotion ? 
lookup.patch->maxInstances:0u); const auto instances = asset->getInstances(); hasher << instances.size(); AssetVisitor> visitor = { @@ -1186,6 +1184,8 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t look creationFlags |= create_flags_t::ECF_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT; hasher << creationFlags; // finally the contents + if (lookup.asset->getContentHash()==NoContentHash) + return false; hasher << lookup.asset->getContentHash(); return true; } @@ -1289,6 +1289,8 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_tdata(),entry.first.meta->size()); } + if (lookup.asset->getContentHash()==NoContentHash) + return false; hasher << lookup.asset->getContentHash(); return true; } @@ -1303,6 +1305,8 @@ bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_tgetCachedCreationParams(); + hasher << params.requireFullSubgroups; return true; } bool CAssetConverter::CHashCache::hash_impl::operator()(lookup_t lookup) @@ -1611,8 +1615,7 @@ template<> class GetDependantVisit : public GetDependantVisitBase { public: - // because of zero access to the lifetime tracking between TLASes and BLASes, do nothing - //core::smart_refctd_ptr* const outBLASes; + CAssetConverter::SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t* instanceMap; protected: bool descend_impl( @@ -1624,7 +1627,7 @@ class GetDependantVisit : public GetDependant auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; - // outBLASes[instanceIndex] = std::move(depObj); + instanceMap->operator[](dep.asset) = std::move(depObj); return true; } }; @@ -1718,16 +1721,14 @@ class GetDependantVisit : public GetDependantVisitBase::value*/sizeof(IShader::E_SHADER_STAGE)*8> specInfo = {}; + ICPUPipelineBase::SShaderSpecInfo specInfo = {}; protected: bool descend_impl( @@ -1743,18 +1744,16 @@ class GetDependantVisit : public GetDependantVisitBase& user, const CAssetConverter::patch_t& userPatch, - const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const IPipelineBase::SShaderSpecInfo& inSpecInfo + const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const ICPUPipelineBase::SShaderSpecInfo& inSpecInfo, hlsl::ShaderStage stage ) { auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; - getSpecInfo(inSpecInfo.stage) = { - .shader = depObj.get(), + getSpecInfo() = ICPUPipelineBase::SShaderSpecInfo{ + .shader = depObj, .entryPoint = inSpecInfo.entryPoint, // warning: its a `string_view` now! 
- .stage = inSpecInfo.stage, .requiredSubgroupSize = inSpecInfo.requiredSubgroupSize, - .requireFullSubgroups = inSpecInfo.requireFullSubgroups, .entries = inSpecInfo.entries }; return true; @@ -1775,7 +1774,7 @@ class GetDependantVisit : public GetDependantVisitBase::value*/sizeof(IShader::E_SHADER_STAGE)*8> specInfo = {}; + std::array::value*/sizeof(IShader::E_SHADER_STAGE)*8> specInfo = {}; // optionals (done this way because inheritance chain with templated class hides protected methods) IGPURenderpass* renderpass = nullptr; @@ -1793,18 +1792,16 @@ class GetDependantVisit : public GetDependantVisitBase& user, const CAssetConverter::patch_t& userPatch, - const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const IPipelineBase::SShaderSpecInfo& inSpecInfo + const instance_t& dep, const CAssetConverter::patch_t& soloPatch, const ICPUPipelineBase::SShaderSpecInfo& inSpecInfo, hlsl::ShaderStage stage ) { auto depObj = getDependant(dep,soloPatch); if (!depObj) return false; - getSpecInfo(inSpecInfo.stage) = { - .shader = depObj.get(), + getSpecInfo(stage) = { + .shader = depObj, .entryPoint = inSpecInfo.entryPoint, // warning: its a `string_view` now! - .stage = inSpecInfo.stage, .requiredSubgroupSize = inSpecInfo.requiredSubgroupSize, - .requireFullSubgroups = 0, .entries = inSpecInfo.entries }; return true; @@ -1828,8 +1825,6 @@ class GetDependantVisit : public GetDependantVisitBase : public GetDependantVisitBase writes = {}; core::vector infos = {}; - core::vector deferredTLASWrites; + core::vector potentialTLASRewrites = {}; // has to be public because of aggregate init, but its only for internal usage! uint32_t lastBinding; uint32_t lastElement; @@ -1904,15 +1899,8 @@ class GetDependantVisit : public GetDependantVisitBase) - { - deferredTLASWrites.push_back({nullptr,binding.data,element,depObj}); - return true; - } // auto& outInfo = infos.emplace_back(); - outInfo.desc = std::move(depObj); // extra stuff auto argTuple = std::tuple(extraArgs...); if constexpr (std::is_same_v) @@ -1920,10 +1908,18 @@ class GetDependantVisit : public GetDependantVisitBase(argTuple); - outInfo.info.buffer.offset= std::get<0>(argTuple).offset; + outInfo.info.buffer.offset = std::get<0>(argTuple).offset; outInfo.info.buffer.size = std::get<0>(argTuple).size; } } + // mark potential TLAS rewrites (with compaction) so we don't have to scan entire descriptor set for potentially compacted TLASes + if constexpr (std::is_same_v) + if (depObj->getPendingBuildVer()==0) // means not built yet, so compactable by next `convert` run + { + auto storageOffset = std::get<0>(argTuple); + storageOffset.data += element; + potentialTLASRewrites.push_back(storageOffset); + } if constexpr (std::is_same_v) { outInfo.info.image.imageLayout = std::get<0>(argTuple); @@ -1934,25 +1930,12 @@ class GetDependantVisit : public GetDependantVisitBase -struct unique_conversion_t -{ - const AssetType* canonicalAsset = nullptr; - patch_index_t patchIndex = {}; - size_t firstCopyIx : 40 = 0u; - size_t copyCount : 24 = 1u; -}; - -// Map from ContentHash to canonical asset & patch and the list of uniqueCopyGroupIDs -template -using conversions_t = core::unordered_map>; - // Needed both for reservation and conversion class MetaDeviceMemoryAllocator final { @@ -1985,6 +1968,7 @@ class MetaDeviceMemoryAllocator final if ((memReqs.memoryTypeBits&memoryTypeConstraint)==0) { m_logger.log("Overconstrained the Memory Type Index bitmask %d with %d for 
%s",system::ILogger::ELL_ERROR,memReqs.memoryTypeBits,memoryTypeConstraint,gpuObj->getObjectDebugName()); + pGpuObj->value = nullptr; return false; } // @@ -2004,6 +1988,7 @@ class MetaDeviceMemoryAllocator final if (!allocation.isValid()) { m_logger.log("Failed to allocate and bind dedicated memory for %s",system::ILogger::ELL_ERROR,gpuObj->getObjectDebugName()); + pGpuObj->value = nullptr; return false; } } @@ -2244,6 +2229,244 @@ class MetaDeviceMemoryAllocator final core::map> allocationRequests; }; +// for dem ReBAR goodies +bool canHostWriteToMemoryRange(const IDeviceMemoryBacked::SMemoryBinding& binding, const size_t length) +{ + assert(binding.isValid()); + const auto* memory = binding.memory; + const auto& mappedRange = memory->getMappedRange(); + return memory->isCurrentlyMapped() && memory->getCurrentMappingAccess().hasFlags(IDeviceMemoryAllocation::EMCAF_WRITE) && mappedRange.offset<=binding.offset && binding.offset+length<=mappedRange.offset+mappedRange.length; +} + +// +template +struct unique_conversion_t +{ + const AssetType* canonicalAsset = nullptr; + patch_index_t patchIndex = {}; + size_t firstCopyIx : 40 = 0u; + size_t copyCount : 24 = 1u; +}; + +// +inline void setDebugName(const CAssetConverter* conv, IBackendObject* gpuObj, const core::blake3_hash_t& contentHash, const uint64_t uniqueCopyGroupID) +{ + std::ostringstream debugName; + debugName << "Created by Converter "; + debugName << std::hex; + debugName << conv; + debugName << " from Asset with hash "; + for (const auto& byte : contentHash.data) + debugName << uint32_t(byte) << " "; + debugName << "for Group " << uniqueCopyGroupID; + gpuObj->setObjectDebugName(debugName.str().c_str()); +} + +// Map from ContentHash to canonical asset & patch and the list of uniqueCopyGroupIDs +template +struct conversions_t +{ + public: + // Go through the dfsCache and work out each entry's content hashes, so that we can carry out unique conversions. + void gather(core::tuple_transform_t& dfsCaches, CAssetConverter::CHashCache* hashCache, const CAssetConverter::CCache* readCache) + { + auto& dfsCache = std::get>(dfsCaches); + dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + { + // compute the hash or look it up if it exists + // We mistrust every dependency such that the eject/update if needed. + // Its really important that the Deduplication gets performed Bottom-Up + auto& contentHash = created.contentHash; + PatchOverride patchOverride(*inputs,dfsCaches,instance.uniqueCopyGroupID); + contentHash = hashCache->hash( + {instance.asset,&created.patch}, + &patchOverride, + /*.mistrustLevel =*/ 1 + ); + // failed to hash all together (only possible reason is failure of `PatchGetter` to provide a valid patch) + if (contentHash==CAssetConverter::CHashCache::NoContentHash) + { + inputs->logger.log("Could not compute hash for asset %p in group %d, maybe an IPreHashed dependant's content hash is missing?",system::ILogger::ELL_ERROR,instance.asset,instance.uniqueCopyGroupID); + return; + } + const auto hashAsU64 = reinterpret_cast(contentHash.data); + { + inputs->logger.log("Asset (%p,%d) has hash %8llx%8llx%8llx%8llx",system::ILogger::ELL_DEBUG,instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3]); + } + // if we have a read cache, lets retry looking the item up! 
+ if (readCache) + { + // We can't look up "near misses" (supersets of patches) because they'd have different hashes + // and we can't afford to split hairs like finding overlapping buffer ranges, etc. + // Stuff like that would require a completely different hashing/lookup strategy (or multiple fake entries). + const auto found = readCache->find({contentHash,instance.uniqueCopyGroupID}); + if (found!=readCache->forwardMapEnd()) + { + created.gpuObj = found->second; + inputs->logger.log( + "Asset (%p,%d) with hash %8llx%8llx%8llx%8llx found its GPU Object in Read Cache",system::ILogger::ELL_DEBUG, + instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + return; + } + } + // The conversion request we insert needs an instance asset whose unconverted dependencies don't have missing content + // SUPER SIMPLIFICATION: because we hash and search for readCache items bottom up (BFS), we don't need a stack (DFS) here! + // Any dependant that's not getting a GPU object due to missing content or GPU cache object for its cache, will show up later during `getDependant` + // An additional optimization would be to improve the `PatchGetter` to check dependants (only deps) during hashing for missing dfs cache gpu Object (no read cache) and no conversion request. + auto* isPrehashed = dynamic_cast(instance.asset); + if (isPrehashed && isPrehashed->missingContent()) + { + inputs->logger.log( + "PreHashed Asset (%p,%d) with hash %8llx%8llx%8llx%8llx has missing content and no GPU Object in Read Cache!",system::ILogger::ELL_ERROR, + instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + return; + } + // then de-duplicate the conversions needed + const patch_index_t patchIx = {static_cast(std::distance(dfsCache.nodes.data(),&created))}; + auto [inSetIt,inserted] = contentHashToCanonical.emplace(contentHash,unique_conversion_t{.canonicalAsset=instance.asset,.patchIndex=patchIx}); + if (!inserted) + { + // If an element prevented insertion, the patch must be identical! + // Because the conversions don't care about groupIDs, the patches may be identical but not the same object in memory. 
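The copy-group mapping worked out next is a counting sort: every unique content hash records how many GPU copies it needs, an exclusive scan over those counts gives each request its first output slot, the fill pass bumps that slot as it places group IDs, and a second scan restores the offsets for later passes. A small sketch of the scan with illustrative names:

#include <cstddef>
#include <vector>

struct Request { std::size_t firstCopyIx = 0; std::size_t copyCount = 0; };

std::size_t exclusiveScan(std::vector<Request>& requests)
{
    std::size_t sum = 0;
    for (auto& r : requests)
    {
        r.firstCopyIx = sum; // where this request's copies start in the flat output array
        sum += r.copyCount;
    }
    return sum; // total number of GPU objects that need to be created
}

// Example: copy counts {2,1,3} map to firstCopyIx {0,2,3} and a total of 6 output slots.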
+ assert(inSetIt->second.patchIndex==patchIx || dfsCache.nodes[inSetIt->second.patchIndex.value].patch==dfsCache.nodes[patchIx.value].patch); + inSetIt->second.copyCount++; + } + } + ); + + // work out mapping of `conversionRequests` to multiple GPU objects and their copy groups via counting sort + { + // assign storage offsets via exclusive scan and put the `uniqueGroupID` mappings in sorted order + auto exclScanConvReqs = [&]()->size_t + { + size_t sum = 0; + for (auto& entry : contentHashToCanonical) + { + entry.second.firstCopyIx = sum; + sum += entry.second.copyCount; + } + return sum; + }; + gpuObjUniqueCopyGroupIDs.resize(exclScanConvReqs()); + // + dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + { + if (created.gpuObj) + return; + auto found = contentHashToCanonical.find(created.contentHash); + // may not find things because of unconverted dummy deps + if (found!=contentHashToCanonical.end()) + gpuObjUniqueCopyGroupIDs[found->second.firstCopyIx++] = instance.uniqueCopyGroupID; + else + { + inputs->logger.log( + "No conversion request made for Asset %p in group %d, it's impossible to convert.", + system::ILogger::ELL_ERROR,instance.asset,instance.uniqueCopyGroupID + ); + } + } + ); + // `{conversionRequests}.firstCopyIx` needs to be brought back down to exclusive scan form + exclScanConvReqs(); + } + + // we now know the size of our output array + gpuObjects.resize(gpuObjUniqueCopyGroupIDs.size()); + } + + // + template + void assign(const core::blake3_hash_t& contentHash, const size_t baseIx, const size_t copyIx, asset_cached_t::type&& gpuObj, const AssetType* asset=nullptr) + { + const auto hashAsU64 = reinterpret_cast(contentHash.data); + if constexpr (GPUObjectWhollyImmutable) // including any deps! 
+ if (copyIx==1) // Only warn once to reduce log spam + inputs->logger.log( + "Why are you creating multiple Objects for asset content %8llx%8llx%8llx%8llx, when they are a readonly GPU Object Type with no dependants!?", + system::ILogger::ELL_PERFORMANCE,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + // + if (!gpuObj) + { + inputs->logger.log( + "Failed to create GPU Object for asset content %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + return; + } + auto output = gpuObjects.data()+copyIx+baseIx; + output->value = std::move(gpuObj); + const uint64_t uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[copyIx+baseIx]; + if constexpr (std::is_same_v || std::is_same_v) + { + const auto constrainMask = inputs->constrainMemoryTypeBits(uniqueCopyGroupID,asset,contentHash,output->value.get()); + if (!deferredAllocator->request(output,constrainMask)) + return; + } + + if constexpr (!std::is_same_v) + { + // set debug names on everything + setDebugName(conv,output->get(),contentHash,uniqueCopyGroupID); + } + } + + // Since the dfsCache has the original asset pointers as keys, we map in reverse (multiple `instance_t` can map to the same unique content hash and GPU object) + void propagateToCaches(dfs_cache& dfsCache, CAssetConverter::SReserveResult::staging_cache_t& stagingCache) + { + assert(gpuObjUniqueCopyGroupIDs.empty()); + dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + { + // already found in read cache and not converted + if (created.gpuObj) + return; + + const auto uniqueCopyGroupID = instance.uniqueCopyGroupID; + const auto& contentHash = created.contentHash; + const auto hashAsU64 = reinterpret_cast(contentHash.data); + + auto found = contentHashToCanonical.find(contentHash); + // can happen if deps were unconverted dummies + if (found==contentHashToCanonical.end()) + { + if (contentHash!=CAssetConverter::CHashCache::NoContentHash) + inputs->logger.log( + "Could not find GPU Object for Asset %p in group %ull with Content Hash %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR, instance.asset, uniqueCopyGroupID, hashAsU64[0], hashAsU64[1], hashAsU64[2], hashAsU64[3] + ); + return; + } + // unhashables were not supposed to be added to conversion requests + assert(contentHash!=CAssetConverter::CHashCache::NoContentHash); + + const auto copyIx = found->second.firstCopyIx++; + auto& gpuObj = gpuObjects[copyIx]; + if (!gpuObj) + { + inputs->logger.log( + "Creation of GPU Object (or its dependents) for Content Hash %8llx%8llx%8llx%8llx Copy Index %d from Canonical Asset %p Failed.", + system::ILogger::ELL_ERROR, hashAsU64[0], hashAsU64[1], hashAsU64[2], hashAsU64[3], copyIx, found->second.canonicalAsset + ); + return; + } + // insert into staging cache + stagingCache.emplace(gpuObj.get(),CAssetConverter::SReserveResult::staging_cache_key{gpuObj.value,typename CAssetConverter::CCache::key_t(contentHash,uniqueCopyGroupID)}); + // propagate back to dfsCache + created.gpuObj = std::move(gpuObj); + } + ); + } + + const CAssetConverter* conv; + const CAssetConverter::SInputs* inputs; + MetaDeviceMemoryAllocator* deferredAllocator; + core::unordered_map> contentHashToCanonical; + core::vector gpuObjUniqueCopyGroupIDs; + core::vector> gpuObjects; +}; + // auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { @@ -2486,289 +2709,157 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // BLAS and TLAS creation is somewhat delayed by buffer creation and 
allocation struct DeferredASCreationParams { - asset_cached_t storage; - size_t scratchSize : 62 = 0; - size_t motionBlur : 1 = false; - size_t compactAfterBuild : 1 = false; -#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION - size_t inputSize = 0; - uint32_t maxInstanceCount = 0; -#endif + asset_cached_t storage = {}; + uint64_t scratchSize = 0; + uint64_t buildSize = 0; }; core::vector accelerationStructureParams[2]; // Deduplication, Creation and Propagation - auto dedupCreateProp = [&]()->void + auto dedupCreateProp = [&]()->conversions_t { - auto& dfsCache = std::get>(dfsCaches); // This map contains the assets by-hash, identical asset+patch hash the same. - conversions_t conversionRequests; + // It only has entries for GPU objects that need to be created + conversions_t conversionRequests = {this,&inputs,&deferredAllocator}; - // We now go through the dfsCache and work out each entry's content hashes, so that we can carry out unique conversions. + // const CCache* readCache = inputs.readCache ? (&std::get>(inputs.readCache->m_caches)):nullptr; - dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void + conversionRequests.gather(dfsCaches,retval.m_hashCache.get(),readCache); + + // + GetDependantVisitBase visitBase = { + .inputs = inputs, + .dfsCaches = dfsCaches + }; + + // Dispatch to correct creation of GPU objects + auto& dfsCache = std::get>(dfsCaches); + if constexpr (std::is_same_v) + { + for (auto& entry : conversionRequests.contentHashToCanonical) + for (auto i=0ull; i(entry.first,entry.second.firstCopyIx,i,device->createSampler(entry.second.canonicalAsset->getParams())); + } + if constexpr (std::is_same_v) + { + for (auto& entry : conversionRequests.contentHashToCanonical) + for (auto i=0ull; ihash( - {instance.asset,&created.patch}, - &patchOverride, - /*.mistrustLevel =*/ 1 - ); - // failed to hash all together (only possible reason is failure of `PatchGetter` to provide a valid patch) - if (contentHash==CHashCache::NoContentHash) - { - inputs.logger.log("Could not compute hash for asset %p in group %d, maybe an IPreHashed dependant's content hash is missing?",system::ILogger::ELL_ERROR,instance.asset,instance.uniqueCopyGroupID); - return; - } - const auto hashAsU64 = reinterpret_cast(contentHash.data); + const ICPUBuffer* asset = entry.second.canonicalAsset; + const auto& patch = dfsCache.nodes[entry.second.patchIndex.value].patch; + // + IGPUBuffer::SCreationParams params = {}; + params.size = asset->getSize(); + params.usage = patch.usage; + // concurrent ownership if any + const auto outIx = i+entry.second.firstCopyIx; + const auto uniqueCopyGroupID = conversionRequests.gpuObjUniqueCopyGroupIDs[outIx]; + const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,asset,patch); + params.queueFamilyIndexCount = queueFamilies.size(); + params.queueFamilyIndices = queueFamilies.data(); + // if creation successful, we will request some memory allocation to bind to, and if thats okay we preliminarily request a conversion + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createBuffer(std::move(params)),asset); + } + } + if constexpr (std::is_same_v || std::is_same_v) + { + using mem_prop_f = IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS; + const auto deviceBuildMemoryTypes = device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(mem_prop_f::EMPF_DEVICE_LOCAL_BIT); + const auto hostBuildMemoryTypes = 
device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(mem_prop_f::EMPF_DEVICE_LOCAL_BIT|mem_prop_f::EMPF_HOST_WRITABLE_BIT|mem_prop_f::EMPF_HOST_CACHED_BIT); + + constexpr bool IsTLAS = std::is_same_v; + accelerationStructureParams[IsTLAS].resize(conversionRequests.gpuObjects.size()); + for (auto& entry : conversionRequests.contentHashToCanonical) + for (auto i=0ull; ivoid { - inputs.logger.log("Asset (%p,%d) has hash %8llx%8llx%8llx%8llx",system::ILogger::ELL_DEBUG,instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3]); - } - // if we have a read cache, lets retry looking the item up! - if (readCache) + // account for fragmentation and misalignment + buildSize += hlsl::max(size,minScratchAllocSize)+hlsl::max(minScratchAllocSize,alignment)*2; + }; + ILogicalDevice::AccelerationStructureBuildSizes sizes = {}; + const auto hashAsU64 = reinterpret_cast(entry.first.data); { - // We can't look up "near misses" (supersets of patches) because they'd have different hashes - // and we can't afford to split hairs like finding overlapping buffer ranges, etc. - // Stuff like that would require a completely different hashing/lookup strategy (or multiple fake entries). - const auto found = readCache->find({contentHash,instance.uniqueCopyGroupID}); - if (found!=readCache->forwardMapEnd()) - { - created.gpuObj = found->second; - inputs.logger.log( - "Asset (%p,%d) with hash %8llx%8llx%8llx%8llx found its GPU Object in Read Cache",system::ILogger::ELL_DEBUG, - instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return; - } - } - // The conversion request we insert needs an instance asset whose unconverted dependencies don't have missing content - // SUPER SIMPLIFICATION: because we hash and search for readCache items bottom up (BFS), we don't need a stack (DFS) here! - // Any dependant that's not getting a GPU object due to missing content or GPU cache object for its cache, will show up later during `getDependant` - // An additional optimization would be to improve the `PatchGetter` to check dependants (only deps) during hashing for missing dfs cache gpu Object (no read cache) and no conversion request. - auto* isPrehashed = dynamic_cast(instance.asset); - if (isPrehashed && isPrehashed->missingContent()) - { - inputs.logger.log( - "PreHashed Asset (%p,%d) with hash %8llx%8llx%8llx%8llx has missing content and no GPU Object in Read Cache!",system::ILogger::ELL_ERROR, - instance.asset,instance.uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return; - } - // then de-duplicate the conversions needed - const patch_index_t patchIx = {static_cast(std::distance(dfsCache.nodes.data(),&created))}; - auto [inSetIt,inserted] = conversionRequests.emplace(contentHash,unique_conversion_t{.canonicalAsset=instance.asset,.patchIndex=patchIx}); - if (!inserted) - { - // If an element prevented insertion, the patch must be identical! - // Because the conversions don't care about groupIDs, the patches may be identical but not the same object in memory. 
- assert(inSetIt->second.patchIndex==patchIx || dfsCache.nodes[inSetIt->second.patchIndex.value].patch==dfsCache.nodes[patchIx.value].patch); - inSetIt->second.copyCount++; - } - } - ); - - // work out mapping of `conversionRequests` to multiple GPU objects and their copy groups via counting sort - const auto gpuObjUniqueCopyGroupIDs = [&]()->core::vector - { - core::vector retval; - // now assign storage offsets via exclusive scan and put the `uniqueGroupID` mappings in sorted order - auto exclScanConvReqs = [&]()->size_t - { - size_t sum = 0; - for (auto& entry : conversionRequests) - { - entry.second.firstCopyIx = sum; - sum += entry.second.copyCount; - } - return sum; - }; - retval.resize(exclScanConvReqs()); - // - dfsCache.for_each([&inputs,&retval,&conversionRequests](const instance_t& instance, dfs_cache::created_t& created)->void - { - if (created.gpuObj) - return; - auto found = conversionRequests.find(created.contentHash); - // may not find things because of unconverted dummy deps - if (found!=conversionRequests.end()) - retval[found->second.firstCopyIx++] = instance.uniqueCopyGroupID; - else - { - inputs.logger.log( - "No conversion request made for Asset %p in group %d, its impossible to convert.", - system::ILogger::ELL_ERROR,instance.asset,instance.uniqueCopyGroupID - ); - } - } - ); - // `{conversionRequests}.firstCopyIx` needs to be brought back down to exclusive scan form - exclScanConvReqs(); - return retval; - }(); - - core::vector> gpuObjects(gpuObjUniqueCopyGroupIDs.size()); - // Only warn once to reduce log spam - auto assign = [&](const core::blake3_hash_t& contentHash, const size_t baseIx, const size_t copyIx, asset_cached_t::type&& gpuObj)->bool - { - const auto hashAsU64 = reinterpret_cast(contentHash.data); - if constexpr (GPUObjectWhollyImmutable) // including any deps! 
- if (copyIx==1) - inputs.logger.log( - "Why are you creating multiple Objects for asset content %8llx%8llx%8llx%8llx, when they are a readonly GPU Object Type with no dependants!?", - system::ILogger::ELL_PERFORMANCE,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - // - if (!gpuObj) - { - inputs.logger.log( - "Failed to create GPU Object for asset content %8llx%8llx%8llx%8llx", - system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return false; - } - gpuObjects[copyIx+baseIx].value = std::move(gpuObj); - return true; - }; - - GetDependantVisitBase visitBase = { - .inputs = inputs, - .dfsCaches = dfsCaches - }; - // Dispatch to correct creation of GPU objects - if constexpr (std::is_same_v) - { - for (auto& entry : conversionRequests) - for (auto i=0ull; i(entry.first,entry.second.firstCopyIx,i,device->createSampler(entry.second.canonicalAsset->getParams())); - } - if constexpr (std::is_same_v) - { - for (auto& entry : conversionRequests) - for (auto i=0ull; igetSize(); - params.usage = patch.usage; - // concurrent ownership if any - const auto outIx = i+entry.second.firstCopyIx; - const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; - const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,entry.second.canonicalAsset,patch); - params.queueFamilyIndexCount = queueFamilies.size(); - params.queueFamilyIndices = queueFamilies.data(); - // if creation successful, we will upload - assign(entry.first,entry.second.firstCopyIx,i,device->createBuffer(std::move(params))); - } - } - if constexpr (std::is_same_v || std::is_same_v) - { - using mem_prop_f = IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS; - const auto deviceBuildMemoryTypes = device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(mem_prop_f::EMPF_DEVICE_LOCAL_BIT); - const auto hostBuildMemoryTypes = device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(mem_prop_f::EMPF_DEVICE_LOCAL_BIT|mem_prop_f::EMPF_HOST_WRITABLE_BIT|mem_prop_f::EMPF_HOST_CACHED_BIT); - - constexpr bool IsTLAS = std::is_same_v; - accelerationStructureParams[IsTLAS].resize(gpuObjects.size()); - for (auto& entry : conversionRequests) - for (auto i=0ull; iusesMotion(); - ILogicalDevice::AccelerationStructureBuildSizes sizes = {}; -#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION - // we will need to temporarily store the build input buffers somewhere - size_t inputSize = 0; - { - const auto buildFlags = patch.getBuildFlags(as); if constexpr (IsTLAS) { - AssetVisitor> visitor = { - {visitBase}, - {asset,uniqueCopyGroupID}, - patch - }; - if (!visitor()) - continue; + // TLAS can't check for the BLASes existing yet, because they haven't had their backing buffers allocated yet const auto instanceCount = as->getInstances().size(); sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,instanceCount); - inputSize = (motionBlur ? sizeof(IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance):sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance))*instanceCount; + // all instances need to be aligned to 16 bytes so alignment irrelevant (everything can be tightly packed) and implicit + const uint64_t worstCaseInstanceSize = motionBlur ? 
IGPUTopLevelAccelerationStructure::DevicePolymorphicInstance::LargestUnionMemberSize:sizeof(IGPUTopLevelAccelerationStructure::DeviceStaticInstance); + // worst case approximation is fine here + incrementBuildSize(worstCaseInstanceSize*instanceCount,16); + incrementBuildSize(sizeof(uint64_t)*instanceCount,alignof(uint64_t)); } else { - const uint32_t* pMaxPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); - // the code here is not pretty, but DRY-ing is of this is for later + const uint32_t* pPrimitiveCounts = as->getGeometryPrimitiveCounts().data(); if (buildFlags.hasFlags(ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT)) { const auto geoms = as->getAABBGeometries(); - if (patch.hostBuild) - { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() - }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); - } - else - { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() - }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); - // TODO: check if the strides need to be aligned to 4 bytes for AABBs - for (const auto& geom : geoms) - if (const auto aabbCount=*(pMaxPrimitiveCounts++); aabbCount) - inputSize = core::roundUp(inputSize,sizeof(float))+aabbCount*geom.stride; - } + sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,geoms,pPrimitiveCounts); + for (const auto& geom : geoms) + if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) + incrementBuildSize(aabbCount*geom.stride,alignof(float)); } else { - core::map allocationsPerStride; const auto geoms = as->getTriangleGeometries(); - if (patch.hostBuild) - { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() - }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); - } - else + sizes = device->getAccelerationStructureBuildSizes(patch.hostBuild,buildFlags,motionBlur,geoms,pPrimitiveCounts); + for (const auto& geom : geoms) + if (const auto triCount=*(pPrimitiveCounts++); triCount) { - const std::span> cpuGeoms = { - reinterpret_cast*>(geoms.data()),geoms.size() - }; - sizes = device->getAccelerationStructureBuildSizes(buildFlags,motionBlur,cpuGeoms,pMaxPrimitiveCounts); - // TODO: check if the strides need to be aligned to 4 bytes for AABBs - for (const auto& geom : geoms) - if (const auto triCount=*(pMaxPrimitiveCounts++); triCount) + auto size = geom.vertexStride*(geom.vertexData[1] ? 
2:1)*(geom.maxVertex+1); + uint16_t alignment = hlsl::max(0x1u<(alignof(float),alignment); + } + uint16_t indexSize = 0; + switch (geom.indexType) + { + case E_INDEX_TYPE::EIT_16BIT: + indexSize = sizeof(uint16_t); + break; + case E_INDEX_TYPE::EIT_32BIT: + indexSize = sizeof(uint32_t); + break; + default: + break; + } + if (indexSize) + { + size = core::alignUp(size,indexSize)+triCount*3*indexSize; + alignment = hlsl::max(indexSize,alignment); } + //inputs.logger.log("%p Triangle Data Size %d Align %d",system::ILogger::ELL_DEBUG,as,size,alignment); + incrementBuildSize(size,alignment); } - for (const auto& entry : allocationsPerStride) - inputSize = core::roundUp(inputSize,entry.first)+entry.first*entry.second; } } } - if (!sizes) + if (buildSize==0 || sizes.buildScratchSize==0) + { + inputs.logger.log( + "Build Size Input is 0 or failed the call to `ILogicalDevice::getAccelerationStructureBuildSizes` for Acceleration Structure %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); continue; -#endif + } + // + incrementBuildSize(sizes.buildScratchSize,device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment); + //inputs.logger.log("%p Scratch Size %d Combined %d",system::ILogger::ELL_DEBUG,as,sizes.buildScratchSize,buildSize); + // we need to save the buffer in a side-channel for later auto& out = accelerationStructureParams[IsTLAS][entry.second.firstCopyIx+i]; // this is where it gets a bit weird, we need to create a buffer to back the acceleration structure @@ -2778,23 +2869,24 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult params.size = core::roundUp(sizes.accelerationStructureSize,MinASBufferAlignment); params.usage = IGPUBuffer::E_USAGE_FLAGS::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT|IGPUBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT; // concurrent ownership if any - const auto outIx = i + entry.second.firstCopyIx; - const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,as,patch); params.queueFamilyIndexCount = queueFamilies.size(); params.queueFamilyIndices = queueFamilies.data(); out.storage.value = device->createBuffer(std::move(params)); + if (out.storage) + { + nbl::video::setDebugName(this,out.storage.value.get(),entry.first,uniqueCopyGroupID); + if (!deferredAllocator.request(&out.storage,patch.hostBuild ? hostBuildMemoryTypes:deviceBuildMemoryTypes)) + continue; + } } out.scratchSize = sizes.buildScratchSize; - out.motionBlur = motionBlur; - out.compactAfterBuild = patch.compactAfterBuild; - if (out.storage && !deferredAllocator.request(&out.storage,patch.hostBuild ? 
hostBuildMemoryTypes:deviceBuildMemoryTypes)) - out.storage.value = nullptr; + out.buildSize = buildSize; } } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; i SReserveResult } // concurrent ownership if any const auto outIx = i+entry.second.firstCopyIx; - const auto uniqueCopyGroupID = gpuObjUniqueCopyGroupIDs[outIx]; + const auto uniqueCopyGroupID = conversionRequests.gpuObjUniqueCopyGroupIDs[outIx]; const auto queueFamilies = inputs.getSharedOwnershipQueueFamilies(uniqueCopyGroupID,asset,patch); params.queueFamilyIndexCount = queueFamilies.size(); params.queueFamilyIndices = queueFamilies.data(); // gpu image specifics params.tiling = static_cast(patch.linearTiling); params.preinitialized = false; - // if creation successful, we check what queues we need if uploading - if (assign(entry.first,entry.second.firstCopyIx,i,device->createImage(std::move(params))) && !asset->getRegions().empty()) - { - // for now until host_image_copy - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - // Best effort guess, without actually looking at all regions - // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 - if (isDepthOrStencilFormat(patch.format) && (patch.usageFlags|patch.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; - // only if we upload some data can we recompute the mips - if (patch.recomputeMips) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; - } + // if creation successful, we will request some memory allocation to bind to + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createImage(std::move(params)),asset); } } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUBufferView* asset = entry.second.canonicalAsset; const auto& patch = dfsCache.nodes[entry.second.patchIndex.value].patch; for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -2900,13 +2981,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (!visitor()) continue; // no format promotion for buffer views - assign(entry.first,entry.second.firstCopyIx,i,device->createBufferView(visitor.underlying,asset->getFormat())); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createBufferView(visitor.underlying,asset->getFormat())); } } } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUImageView* asset = entry.second.canonicalAsset; const auto& cpuParams = asset->getCreationParameters(); @@ -2914,7 +2995,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -2943,7 +3024,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // if underlying image had mip-chain extended then we extend our own if (imageParams.mipLevels!=visitor.oldMipCount) params.subresourceRange.levelCount = imageParams.mipLevels-params.subresourceRange.baseMipLevel; - assign(entry.first,entry.second.firstCopyIx,i,device->createImageView(std::move(params))); + 
conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createImageView(std::move(params))); } } } @@ -2955,22 +3036,18 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult .writeCache = inputs.writeShaderCache }; - // no one depend on the converted IShaders so we need to hold a smart ptr into them somewhere. - // This is to prevent m_stagingCache to hold a dangling pointer into IShader - retval.m_shaders.reserve(gpuObjUniqueCopyGroupIDs.size()); - - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) for (auto i=0ull; icompileShader(createParams); retval.m_shaders.push_back(shader); - assign(entry.first,entry.second.firstCopyIx,i,std::move(shader)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(shader)); } } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUDescriptorSetLayout* asset = entry.second.canonicalAsset; // there is no patching possible for this asset @@ -3019,7 +3096,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { { @@ -3031,7 +3108,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }; if (!visitor()) continue; - assign(entry.first,entry.second.firstCopyIx,i,device->createDescriptorSetLayout(bindings)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,device->createDescriptorSetLayout(bindings)); } } } @@ -3039,7 +3116,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { core::vector pcRanges; pcRanges.reserve(CSPIRVIntrospector::MaxPushConstantsSize); - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUPipelineLayout* asset = entry.second.canonicalAsset; const auto& patch = dfsCache.nodes[entry.second.patchIndex.value].patch; @@ -3074,7 +3151,7 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3083,13 +3160,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (!visitor()) continue; auto layout = device->createPipelineLayout(pcRanges,std::move(visitor.dsLayouts[0]),std::move(visitor.dsLayouts[1]),std::move(visitor.dsLayouts[2]),std::move(visitor.dsLayouts[3])); - assign(entry.first,entry.second.firstCopyIx,i,std::move(layout)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(layout)); } } } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUPipelineCache* asset = entry.second.canonicalAsset; // there is no patching possible for this asset @@ -3097,20 +3174,20 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // since we don't have dependants we don't care about our group ID // we create threadsafe pipeline caches, because we have no idea how they may be used - assign.template operator()(entry.first,entry.second.firstCopyIx,i,device->createPipelineCache(asset,false)); + conversionRequests.template assign(entry.first,entry.second.firstCopyIx,i,device->createPipelineCache(asset,false)); } } } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUComputePipeline* asset = entry.second.canonicalAsset; // there 
is no patching possible for this asset for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3120,21 +3197,22 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult continue; // ILogicalDevice::createComputePipelines is rather aggressive on the spec constant validation, so we create one pipeline at a time core::smart_refctd_ptr ppln; + IGPUPipelineBase::SShaderEntryMap entryMap; { // no derivatives, special flags, etc. IGPUComputePipeline::SCreationParams params = {}; params.layout = visitor.layout; // while there are patches possible for shaders, the only patch which can happen here is changing a stage from UNKNOWN to COMPUTE - params.shader = visitor.getSpecInfo(IShader::E_SHADER_STAGE::ESS_COMPUTE); + params.shader = IGPUPipelineBase::SShaderSpecInfo::create(visitor.getSpecInfo(), &entryMap); device->createComputePipelines(inputs.pipelineCache,{¶ms,1},&ppln); } - assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); } } } if constexpr (std::is_same_v) { - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPURenderpass* asset = entry.second.canonicalAsset; // there is no patching possible for this asset @@ -3142,22 +3220,20 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // since we don't have dependants we don't care about our group ID // we create threadsafe pipeline caches, because we have no idea how they may be used - assign.template operator()(entry.first,entry.second.firstCopyIx,i,device->createRenderpass(asset->getCreationParameters())); + conversionRequests.template assign(entry.first,entry.second.firstCopyIx,i,device->createRenderpass(asset->getCreationParameters())); } } } if constexpr (std::is_same_v) { - core::vector tmpSpecInfo; - tmpSpecInfo.reserve(5); - for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUGraphicsPipeline* asset = entry.second.canonicalAsset; // there is no patching possible for this asset for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3170,24 +3246,28 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult { // no derivatives, special flags, etc. 
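The per-stage entry maps introduced in this hunk presumably keep the specialization-constant data referenced by the created spec infos alive until the pipeline is actually built. A rough raw-Vulkan illustration of that flattening (an assumed layout, not the IGPUPipelineBase::SShaderEntryMap type):

#include <vulkan/vulkan.h>
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

struct FlattenedSpecInfo
{
    std::vector<VkSpecializationMapEntry> entries;
    std::vector<std::byte> data;
    VkSpecializationInfo info{};
};

// turn a map of constant ID -> raw bytes into the structure pipeline creation consumes;
// the vectors must stay alive until the vkCreate*Pipelines call that reads them returns
FlattenedSpecInfo flatten(const std::map<std::uint32_t,std::vector<std::byte>>& constants)
{
    FlattenedSpecInfo out;
    for (const auto& [id,bytes] : constants)
    {
        out.entries.push_back({id,static_cast<std::uint32_t>(out.data.size()),bytes.size()});
        out.data.insert(out.data.end(),bytes.begin(),bytes.end());
    }
    out.info = {
        .mapEntryCount = static_cast<std::uint32_t>(out.entries.size()),
        .pMapEntries = out.entries.data(),
        .dataSize = out.data.size(),
        .pData = out.data.data()
    };
    return out;
}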
IGPUGraphicsPipeline::SCreationParams params = {}; + using SShaderEntryMap = IGPUPipelineBase::SShaderEntryMap; + SShaderEntryMap vertexEntryMap; + SShaderEntryMap tesselationControlEntryMap; + SShaderEntryMap tesselationEvaluationEntryMap; + SShaderEntryMap geometryEntryMap; + SShaderEntryMap fragmentEntryMap; bool depNotFound = false; { params.layout = visitor.layout; params.renderpass = visitor.renderpass; // while there are patches possible for shaders, the only patch which can happen here is changing a stage from UNKNOWN to match the slot here - tmpSpecInfo.clear(); using stage_t = hlsl::ShaderStage; - for (stage_t stage : {stage_t::ESS_VERTEX,stage_t::ESS_TESSELLATION_CONTROL,stage_t::ESS_TESSELLATION_EVALUATION,stage_t::ESS_GEOMETRY,stage_t::ESS_FRAGMENT}) - { - auto& info = visitor.getSpecInfo(stage); - if (info.shader) - tmpSpecInfo.push_back(std::move(info)); - } - params.shaders = tmpSpecInfo; + using GPUShaderSpecInfo = IGPUPipelineBase::SShaderSpecInfo; + params.vertexShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_VERTEX), &vertexEntryMap); + params.tesselationControlShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_CONTROL), &tesselationControlEntryMap); + params.tesselationEvaluationShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_TESSELLATION_EVALUATION), &tesselationEvaluationEntryMap); + params.geometryShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_GEOMETRY), &geometryEntryMap); + params.fragmentShader = GPUShaderSpecInfo::create(visitor.getSpecInfo(hlsl::ESS_FRAGMENT), &fragmentEntryMap); } params.cached = asset->getCachedCreationParams(); device->createGraphicsPipelines(inputs.pipelineCache,{¶ms,1},&ppln); - assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(ppln)); } } } @@ -3198,13 +3278,13 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult // Descriptor Pools have large up-front slots reserved for all descriptor types, if we were to merge // multiple descriptor sets to be allocated from one pool, dropping any set wouldn't result in the // reclamation of the memory used, it would at most (with the FREE pool create flag) return to pool. 
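Given the pooling rationale spelled out above, each descriptor set gets its own exactly-sized pool so dropping the set releases all of its descriptor memory at once. A rough raw-Vulkan equivalent (not the Nabla IGPUDescriptorPool wrapper), assuming the pool sizes were already tallied per descriptor type from the set layout:

#include <vulkan/vulkan.h>
#include <cstdint>

// one pool per descriptor set: reclaiming memory is then just destroying (or resetting) the pool
VkDescriptorPool createPoolForSingleSet(VkDevice device, const VkDescriptorPoolSize* sizes, std::uint32_t sizeCount)
{
    VkDescriptorPoolCreateInfo info = {};
    info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
    info.maxSets = 1;               // exactly one set will ever be allocated from this pool
    info.poolSizeCount = sizeCount; // counts per descriptor type, derived from the layout
    info.pPoolSizes = sizes;
    VkDescriptorPool pool = VK_NULL_HANDLE;
    return vkCreateDescriptorPool(device,&info,nullptr,&pool)==VK_SUCCESS ? pool : VK_NULL_HANDLE;
}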
- for (auto& entry : conversionRequests) + for (auto& entry : conversionRequests.contentHashToCanonical) { const ICPUDescriptorSet* asset = entry.second.canonicalAsset; for (auto i=0ull; i> visitor = { {visitBase}, {asset,uniqueCopyGroupID}, @@ -3229,196 +3309,153 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult ds = nullptr; } else - retval.m_deferredTLASDescriptorWrites.insert(visitor.deferredTLASWrites.begin(),visitor.deferredTLASWrites.end()); + for (const auto storageIx : visitor.potentialTLASRewrites) + retval.m_potentialTLASRewrites.insert({ds.get(),storageIx}); } else inputs.logger.log("Failed to create Descriptor Pool suited for Layout %s",system::ILogger::ELL_ERROR,layout->getObjectDebugName()); - assign(entry.first,entry.second.firstCopyIx,i,std::move(ds)); + conversionRequests.assign(entry.first,entry.second.firstCopyIx,i,std::move(ds)); } } } - // Propagate the results back, since the dfsCache has the original asset pointers as keys, we map in reverse - // This gets deferred till AFTER the Buffer Memory Allocations and Binding for Acceleration Structures - if constexpr (!std::is_same_v && !std::is_same_v) - dfsCache.for_each([&](const instance_t& instance, dfs_cache::created_t& created)->void - { - auto& stagingCache = std::get>(retval.m_stagingCaches); - // already found in read cache and not converted - if (created.gpuObj) - return; - - const auto& contentHash = created.contentHash; - auto found = conversionRequests.find(contentHash); - - const auto uniqueCopyGroupID = instance.uniqueCopyGroupID; - - const auto hashAsU64 = reinterpret_cast(contentHash.data); - // can happen if deps were unconverted dummies - if (found==conversionRequests.end()) - { - if (contentHash!=CHashCache::NoContentHash) - inputs.logger.log( - "Could not find GPU Object for Asset %p in group %ull with Content Hash %8llx%8llx%8llx%8llx", - system::ILogger::ELL_ERROR,instance.asset,uniqueCopyGroupID,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] - ); - return; - } - // unhashables were not supposed to be added to conversion requests - assert(contentHash!=CHashCache::NoContentHash); - - const auto copyIx = found->second.firstCopyIx++; - // the counting sort was stable - assert(uniqueCopyGroupID==gpuObjUniqueCopyGroupIDs[copyIx]); - - auto& gpuObj = gpuObjects[copyIx]; - if (!gpuObj) - { - inputs.logger.log( - "Conversion for Content Hash %8llx%8llx%8llx%8llx Copy Index %d from Canonical Asset %p Failed.", - system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3],copyIx,found->second.canonicalAsset - ); - return; - } - // set debug names on everything! 
- { - std::ostringstream debugName; - debugName << "Created by Converter "; - debugName << std::hex; - debugName << this; - debugName << " from Asset with hash "; - for (const auto& byte : contentHash.data) - debugName << uint32_t(byte) << " "; - debugName << "for Group " << uniqueCopyGroupID; - - // IShader is ethereal not really a persistent gpu object - if constexpr (std::is_base_of_v) - gpuObj.get()->setObjectDebugName(debugName.str().c_str()); - } - // insert into staging cache - stagingCache.emplace(gpuObj.get(),typename CCache::key_t(contentHash,uniqueCopyGroupID)); - // propagate back to dfsCache - created.gpuObj = std::move(gpuObj); - // record if a device memory allocation will be needed - if constexpr (std::is_base_of_v::video_t>) - { - const auto constrainMask = inputs.constrainMemoryTypeBits(uniqueCopyGroupID,instance.asset,contentHash,created.gpuObj.get()); - if (!deferredAllocator.request(&created.gpuObj,constrainMask)) - { - created.gpuObj.value = nullptr; - return; - } - } - // - if constexpr (std::is_same_v) - retval.m_bufferConversions.emplace_back(SReserveResult::SConvReqBuffer{core::smart_refctd_ptr(instance.asset),created.gpuObj.get()}); - if constexpr (std::is_same_v) - { - const uint16_t recomputeMips = created.patch.recomputeMips; - retval.m_imageConversions.emplace_back(SReserveResult::SConversionRequestBase{core::smart_refctd_ptr(instance.asset),created.gpuObj.get()},recomputeMips); - } -// TODO: BLAS and TLAS requests - } - ); - + // clear what we don't need + if constexpr (!std::is_base_of_v) + conversionRequests.gpuObjUniqueCopyGroupIDs.clear(); + // This gets deferred till AFTER the Buffer Memory Allocations and Binding + if constexpr (!std::is_base_of_v && !std::is_base_of_v::video_t>) + { + conversionRequests.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); + return {}; + } + return conversionRequests; }; - // The order of these calls is super important to go BOTTOM UP in terms of hashing and conversion dependants. - // Both so we can hash in O(Depth) and not O(Depth^2) but also so we have all the possible dependants ready. - // If two Asset chains are independent then we order them from most catastrophic failure to least. 
- dedupCreateProp.template operator()(); - dedupCreateProp.template operator()(); - dedupCreateProp.template operator()(); - dedupCreateProp.template operator()(); - // now allocate the memory for buffers and images - deferredAllocator.finalize(); - - // can remove buffers from conversion requests which can be written to directly - { - core::vector flushRanges; - flushRanges.reserve(retval.m_bufferConversions.size()); - std::erase_if(retval.m_bufferConversions,[&flushRanges](const SReserveResult::SConvReqBuffer& conv)->bool - { - const auto boundMemory = conv.gpuObj->getBoundMemory(); - auto* const memory = boundMemory.memory; - if (!boundMemory.memory->isMappable()) - return false; - const size_t size = conv.gpuObj->getSize(); - const IDeviceMemoryAllocation::MemoryRange range = {boundMemory.offset,size}; - // slightly inefficient but oh well - void* dst = memory->map(range,IDeviceMemoryAllocation::EMCAF_WRITE); - memcpy(dst,conv.canonical->getPointer(),size); - if (boundMemory.memory->haveToMakeVisible()) - flushRanges.emplace_back(memory,range.offset,range.length,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); - return true; - } - ); - if (!flushRanges.empty()) - device->flushMappedMemoryRanges(flushRanges); - if (!retval.m_bufferConversions.empty()) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; - } - // Deal with Deferred Creation of Acceleration structures + // scope so the conversion requests go our of scope early { - for (auto asLevel=0; asLevel<2; asLevel++) + // The order of these calls is super important to go BOTTOM UP in terms of hashing and conversion dependants. + // Both so we can hash in O(Depth) and not O(Depth^2) but also so we have all the possible dependants ready. + // If two Asset chains are independent then we order them from most catastrophic failure to least. + auto bufferConversions = dedupCreateProp.template operator()(); + auto blasConversions = dedupCreateProp.template operator()(); + auto tlasConversions = dedupCreateProp.template operator()(); + auto imageConversions = dedupCreateProp.template operator()(); + // now allocate the memory for buffers and images + deferredAllocator.finalize(); + + // enqueue successfully created buffers for conversion + for (auto& entry : bufferConversions.contentHashToCanonical) + for (auto i=0ull; i(entry.second.canonicalAsset)}); + assert(inserted); + } + bufferConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); + // Deal with Deferred Creation of Acceleration structures { - // each of these stages must have a barrier inbetween - size_t scratchSizeFullParallelBuild = 0; - size_t scratchSizeFullParallelCompact = 0; - // we collect that stats AFTER making sure that the BLAS / TLAS can actually be created - for (const auto& deferredParams : accelerationStructureParams[asLevel]) + auto createAccelerationStructures = [&](conversions_t& requests)->void { - // buffer failed to create/allocate - if (!deferredParams.storage) - continue; -#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION - IGPUAccelerationStructure::SCreationParams baseParams; - { - auto* buf = deferredParams.storage.get(); - const auto bufSz = buf->getSize(); - using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; - baseParams = { - .bufferRange = {.offset=0,.size=bufSz,.buffer=smart_refctd_ptr(buf)}, - .flags = deferredParams.motionBlur ? 
create_f::MOTION_BIT:create_f::NONE - }; - } - smart_refctd_ptr as; - if (asLevel) - { - as = device->createBottomLevelAccelerationStructure({baseParams,deferredParams.maxInstanceCount}); - } + constexpr bool IsTLAS = std::is_same_v; + // + std::conditional_t* pConversions; + if constexpr (IsTLAS) + pConversions = retval.m_tlasConversions; else + pConversions = retval.m_blasConversions; + // we enqueue the conversions AFTER making sure that the BLAS / TLAS can actually be created + for (auto& entry : requests.contentHashToCanonical) + for (auto i=0ull; icreateTopLevelAccelerationStructure({baseParams,deferredParams.maxInstanceCount}); + const auto reqIx = entry.second.firstCopyIx+i; + if (const auto& deferredParams=accelerationStructureParams[IsTLAS][reqIx]; deferredParams.storage) + { + const auto* canonical = entry.second.canonicalAsset; + const auto& dfsNode = std::get>(dfsCaches).nodes[entry.second.patchIndex.value]; + const auto& patch = dfsNode.patch; + // create the AS + const auto bufSz = deferredParams.storage.get()->getSize(); + IGPUAccelerationStructure::SCreationParams baseParams; + { + using create_f = IGPUAccelerationStructure::SCreationParams::FLAGS; + baseParams = { + .bufferRange = {.offset=0,.size=bufSz,.buffer=deferredParams.storage.value}, + .flags = patch.isMotion ? create_f::MOTION_BIT:create_f::NONE + }; + } + smart_refctd_ptr::video_t> as; + CAssetConverter::SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t blasInstanceMap; + if constexpr (IsTLAS) + { + // check if the BLASes we want to use for the instances were successfully allocated and created + AssetVisitor> visitor = { + {inputs,dfsCaches,&blasInstanceMap}, + {canonical,requests.gpuObjUniqueCopyGroupIDs[reqIx]}, + patch + }; + if (!visitor()) + { + const auto hashAsU64 = reinterpret_cast(entry.first.data); + inputs.logger.log( + "Failed to find all GPU Bottom Level Acceleration Structures needed to build TLAS %8llx%8llx%8llx%8llx", + system::ILogger::ELL_ERROR,hashAsU64[0],hashAsU64[1],hashAsU64[2],hashAsU64[3] + ); + continue; + } + as = device->createTopLevelAccelerationStructure({std::move(baseParams),patch.maxInstances}); + } + else + as = device->createBottomLevelAccelerationStructure(std::move(baseParams)); + if (!as) + { + inputs.logger.log("Failed to Create Acceleration Structure.",system::ILogger::ELL_ERROR); + continue; + } + // file the request for conversion + auto& request = pConversions[patch.hostBuild][as.get()]; + request.canonical = smart_refctd_ptr(canonical); + request.scratchSize = deferredParams.scratchSize; + request.compact = patch.compactAfterBuild; + request.buildFlags = static_cast(patch.getBuildFlags(canonical).value); + request.buildSize = deferredParams.buildSize; + if constexpr (IsTLAS) + request.instanceMap = std::move(blasInstanceMap); + requests.assign(entry.first,entry.second.firstCopyIx,i,std::move(as)); + } } - // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build -// TODO: compute with alignment - const auto buildSize = deferredParams.inputSize+deferredParams.scratchSize; - // sizes for building 1-by-1 vs parallel, note that - retval.m_minASBuildScratchSize = core::max(buildSize,retval.m_minASBuildScratchSize); - scratchSizeFullParallelBuild += buildSize; - // triangles, AABBs or Instance Transforms will need to be supplied from VRAM -#endif + requests.gpuObjUniqueCopyGroupIDs.clear(); + }; + createAccelerationStructures.template operator()(blasConversions); + 
blasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); + createAccelerationStructures.template operator()(tlasConversions); + tlasConversions.propagateToCaches(std::get>(dfsCaches),std::get>(retval.m_stagingCaches)); + } + // enqueue successfully created images with data to upload for conversion + auto& dfsCacheImages = std::get>(dfsCaches); + for (auto& entry : imageConversions.contentHashToCanonical) + for (auto i=0ull; igetRegions().empty()) + { + const bool recomputeMips = dfsCacheImages.nodes[entry.second.patchIndex.value].patch.recomputeMips; + auto [where,inserted] = retval.m_imageConversions.insert({gpuImg.get(),SReserveResult::SConvReqImage{core::smart_refctd_ptr(cpuImg),recomputeMips}}); + assert(inserted); } - // -// retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild,retval.m_maxASBuildScratchSize); } - // - if (retval.willDeviceASBuild()) - retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + imageConversions.propagateToCaches(dfsCacheImages,std::get>(retval.m_stagingCaches)); } - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); - dedupCreateProp.operator()(); -// dedupCreateProp.operator()(); - + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); + dedupCreateProp.template operator()(); +// dedupCreateProp.template operator()(); } // write out results @@ -3445,12 +3482,14 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult if (const auto& gpuObj=found.gpuObj; gpuObj) { results[i] = gpuObj; +#ifdef _NBL_DEBUG // if something with this content hash is in the stagingCache, then it must match the `found->gpuObj` if (auto finalCacheIt=stagingCache.find(gpuObj.get()); finalCacheIt!=stagingCache.end()) { - const bool matches = finalCacheIt->second==typename CCache::key_t(found.contentHash,uniqueCopyGroupID); + const bool matches = finalCacheIt->second.cacheKey==typename CCache::key_t(found.contentHash,uniqueCopyGroupID); assert(matches); } +#endif } else inputs.logger.log("No GPU Object could be found or created for Root Asset %p in group %d",system::ILogger::ELL_ERROR,asset,uniqueCopyGroupID); @@ -3458,32 +3497,219 @@ auto CAssetConverter::reserve(const SInputs& inputs) -> SReserveResult }; core::for_each_in_tuple(inputs.assets,finalize); - retval.m_converter = core::smart_refctd_ptr(this); - retval.m_logger = system::logger_opt_smart_ptr(core::smart_refctd_ptr(inputs.logger.get())); - return retval; -} - -// -ISemaphore::future_t CAssetConverter::convert_impl(SReserveResult& reservations, SConvertParams& params) -{ - ISemaphore::future_t retval = IQueue::RESULT::OTHER_ERROR; - system::logger_opt_ptr logger = reservations.m_logger.get().get(); - if (!reservations.m_converter) + // A failed conversion can cause dangling GPU object pointers, and needless work for objects which will die soon after, so prune with a Top-Down pass anything thats not reachable from a root { - 
logger.log("Cannot call convert on an unsuccessful reserve result! Or are you attempting to do a double run of `convert` ?",system::ILogger::ELL_ERROR); - return retval; - } - assert(reservations.m_converter.get()==this); - auto device = m_params.device; + // we use a genious trick, if someone else is using the GPU object, the refcount must obviously be greater than 1 + auto pruneStaging = [&]()->void + { + auto& stagingCache = std::get>(retval.m_stagingCaches); + phmap::erase_if(stagingCache,[&retval](const auto& entry)->bool + { + if (entry.first->getReferenceCount()==1) + { + // I know what I'm doing, the hashmap is being annoying not letting you look up with const pointer key a non const pointer hashmap + auto* gpuObj = const_cast::video_t*>(entry.first); + if constexpr (std::is_same_v) + retval.m_bufferConversions.erase(gpuObj); + if constexpr (std::is_same_v) + for (auto i=0; i<2; i++) + retval.m_blasConversions[i].erase(gpuObj); + if constexpr (std::is_same_v) + for (auto i=0; i<2; i++) + retval.m_tlasConversions[i].erase(gpuObj); + if constexpr (std::is_same_v) + retval.m_imageConversions.erase(gpuObj); + // TODO: erase from `retval.m_gpuObjects` as well + return true; + } + // still referenced, keep it around + return false; + } + ); + }; + // The order these are called is paramount, the Higher Level User needs to die to let go of dependants and make our Garbage Collection work +// pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + pruneStaging.template operator()(); + } + + // only now get the queue flags + { + using q_fam_f = IQueue::FAMILY_FLAGS; + // acceleration structures, get scratch size + auto computeAccelerationStructureScratchSizes = [device,&retval]()->void + { + constexpr bool IsTLAS = std::is_same_v; + const auto& limits = device->getPhysicalDevice()->getLimits(); + const auto minScratchAlignment = limits.minAccelerationStructureScratchOffsetAlignment; + // index 0 is device build, 1 is host build + size_t scratchSizeFullParallelBuild[2] = {0,0}; + // + const std::conditional_t* pConversions; + if constexpr (IsTLAS) + pConversions = retval.m_tlasConversions; + else + pConversions = retval.m_blasConversions; + // we collect the stats AFTER making sure only needed TLAS and BLAS will be built + for (auto i=0; i<2; i++) + for (auto req : pConversions[i]) + { + const auto buildSize = req.second.buildSize; + // sizes for building 1-by-1 vs parallel, note that BLAS and TLAS can't be built concurrently + retval.m_minASBuildScratchSize[i] = core::max(retval.m_minASBuildScratchSize[i],buildSize); + scratchSizeFullParallelBuild[i] = core::alignUp(scratchSizeFullParallelBuild[i],minScratchAlignment)+buildSize; + // note that in order to compact an AS you need to allocate a buffer range whose size is known only after the build + if (req.second.compact) + { + const auto asSize = req.first->getCreationParams().bufferRange.size; + assert(core::is_aligned_to(asSize,256)); + retval.m_compactedASMaxMemory += asSize; + } + } + // TLAS and BLAS can't build concurrently + 
retval.m_maxASBuildScratchSize[0] = core::max(scratchSizeFullParallelBuild[0],retval.m_maxASBuildScratchSize[0]); + retval.m_maxASBuildScratchSize[1] = core::max(scratchSizeFullParallelBuild[1],retval.m_maxASBuildScratchSize[1]); + }; + computeAccelerationStructureScratchSizes.template operator()(); + computeAccelerationStructureScratchSizes.template operator()(); + if (retval.willDeviceASBuild() || retval.willCompactAS()) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + // images are trickier, we can't finish iterating until all possible flags are there + for (auto it=retval.m_imageConversions.begin(); !retval.m_queueFlags.hasFlags(q_fam_f::TRANSFER_BIT|q_fam_f::COMPUTE_BIT|q_fam_f::GRAPHICS_BIT) && it!=retval.m_imageConversions.end(); it++) + { + const auto boundMemory = it->first->getBoundMemory(); + assert(boundMemory.isValid()); + // Note: with `host_image_copy` this will get conditional + { + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + // Best effort guess, without actually looking at all regions + const auto& params = it->first->getCreationParameters(); + // https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/vkCmdCopyBufferToImage.html#VUID-vkCmdCopyBufferToImage-commandBuffer-07739 + if (isDepthOrStencilFormat(params.format) && (params.depthUsage|params.stencilUsage).hasFlags(IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT)) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::GRAPHICS_BIT; + if (it->second.recomputeMips) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + } + } + // buffer conversions + for (auto it=retval.m_bufferConversions.begin(); !retval.m_queueFlags.hasFlags(q_fam_f::TRANSFER_BIT) && it!=retval.m_bufferConversions.end(); it++) + { + const auto boundMemory = it->first->getBoundMemory(); + assert(boundMemory.isValid()); + if (!canHostWriteToMemoryRange(boundMemory,it->first->getSize())) + retval.m_queueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + } + } + + retval.m_converter = core::smart_refctd_ptr(this); + retval.m_logger = system::logger_opt_smart_ptr(core::smart_refctd_ptr(inputs.logger.get())); + return retval; +} + +// +ISemaphore::future_t CAssetConverter::convert_impl(SReserveResult& reservations, SConvertParams& params) +{ + ISemaphore::future_t retval = IQueue::RESULT::OTHER_ERROR; + system::logger_opt_ptr logger = reservations.m_logger.get().get(); + if (!reservations.m_converter) + { + logger.log("Cannot call convert on an unsuccessful reserve result! 
Or are you attempting to do a double run of `convert` ?",system::ILogger::ELL_ERROR); + return retval; + } + assert(reservations.m_converter.get()==this); + auto device = m_params.device; + + auto hostBufferXferIt = reservations.m_bufferConversions.begin(); + core::vector memoryHostFlushRanges; + memoryHostFlushRanges.reserve(reservations.m_bufferConversions.size()); + auto hostUploadBuffers = [&](auto&& pred)->void + { + for (; hostBufferXferIt!=reservations.m_bufferConversions.end() && pred(); hostBufferXferIt++) + { + IGPUBuffer* buff = hostBufferXferIt->first; + const size_t size = buff->getSize(); + const auto boundMemory = buff->getBoundMemory(); + if (!canHostWriteToMemoryRange(boundMemory,size)) + continue; + auto* const memory = boundMemory.memory; + const IDeviceMemoryAllocation::MemoryRange range = {boundMemory.offset,size}; + memcpy(reinterpret_cast(memory->getMappedPointer())+range.offset,hostBufferXferIt->second->getPointer(),size); + // let go of canonical asset (may free RAM) + hostBufferXferIt->second = nullptr; + if (memory->haveToMakeVisible()) + memoryHostFlushRanges.emplace_back(memory,range.offset,range.length,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); + } + if (!memoryHostFlushRanges.empty()) + { + device->flushMappedMemoryRanges(memoryHostFlushRanges); + memoryHostFlushRanges.clear(); + } + }; + + // wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users) + core::unordered_map outputReverseMap; + core::for_each_in_tuple(reservations.m_gpuObjects,[&outputReverseMap](const auto& gpuObjects)->void + { + uint32_t i = 0; + for (const auto& gpuObj : gpuObjects) + outputReverseMap[gpuObj.value.get()] = i++; + } + ); + auto markFailure = [&reservations,&outputReverseMap,logger](const char* message, smart_refctd_ptr* canonical, typename SReserveResult::staging_cache_t::mapped_type* cacheNode)->void + { + // wipe the smart pointer to the canonical, make sure we release that memory ASAP if no other user is around + *canonical = nullptr; + // also drop the smart pointer from the output array so failures release memory quickly + const auto foundIx = outputReverseMap.find(cacheNode->gpuRef.get()); + if (foundIx!=outputReverseMap.end()) + { + auto& resultOutput = std::get>(reservations.m_gpuObjects); + resultOutput[foundIx->second].value = nullptr; + outputReverseMap.erase(foundIx); + } + logger.log("%s failed for \"%s\"",system::ILogger::ELL_ERROR,message,cacheNode->gpuRef->getObjectDebugName()); + // drop smart pointer + cacheNode->gpuRef = nullptr; + }; + + // want to check if deps successfully exist + struct SMissingDependent + { + // This only checks if whether we had to convert and failed, but the dependent might be in readCache of one or more converters, so if in doubt assume its okay + inline operator bool() const {return wasInStaging && gotWiped;} - // compacted TLASes need to be substituted in cache and Descriptor Sets + bool wasInStaging; + bool gotWiped; + }; + auto missingDependent = [&reservations](const typename asset_traits::video_t* dep)->SMissingDependent + { + const auto& stagingCache = std::get>(reservations.m_stagingCaches); + const auto found = stagingCache.find(dep); + SMissingDependent retval = {.wasInStaging=found!=stagingCache.end()}; + retval.gotWiped = retval.wasInStaging && !found->second.gpuRef; + return retval; + }; + + // Descriptor Sets need their TLAS descriptors substituted if they've been compacted core::unordered_map> compactedTLASMap; // Anything to do? 
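For reference, a minimal sketch of the host-upload path that `hostUploadBuffers` above takes, with hypothetical stand-in types in place of `IDeviceMemoryAllocation` and the mapped-range structs; the point is only the ordering of memcpy into the mapped pointer followed by a conditional flush of non-coherent memory.

    // Illustrative sketch (hypothetical stand-ins): copy into the mapped pointer at the
    // buffer's bound offset, record a flush range only when the memory is not host-coherent.
    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    struct MappedAllocation            // stand-in for the device memory allocation
    {
        uint8_t* mappedPtr;            // assumed already mapped for writing
        bool     hostCoherent;         // "have to make visible" == !hostCoherent
    };
    struct FlushRange { MappedAllocation* memory; std::size_t offset; std::size_t length; };

    void hostUpload(MappedAllocation& memory, const std::size_t boundOffset,
                    const void* src, const std::size_t size, std::vector<FlushRange>& flushRanges)
    {
        std::memcpy(memory.mappedPtr + boundOffset, src, size);
        if (!memory.hostCoherent)      // non-coherent memory needs an explicit flush before GPU reads
            flushRanges.push_back({&memory, boundOffset, size});
        // the caller batches flushRanges into a single flush call afterwards
    }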
- auto reqQueueFlags = reservations.m_queueFlags; - if (reqQueueFlags.value!=IQueue::FAMILY_FLAGS::NONE) + if (reservations.m_queueFlags.value!=IQueue::FAMILY_FLAGS::NONE) { // whether we actually get around to doing that depends on validity and success of transfers - const bool shouldDoSomeCompute = reqQueueFlags.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT); + const bool shouldDoSomeCompute = reservations.m_queueFlags.hasFlags(IQueue::FAMILY_FLAGS::COMPUTE_BIT); auto invalidIntended = [device,logger](const IQueue::FAMILY_FLAGS flag, const SIntendedSubmitInfo* intended)->bool { if (!intended || !intended->valid()) @@ -3522,13 +3748,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return retval; } using buffer_usage_f = IGPUBuffer::E_USAGE_FLAGS; - constexpr buffer_usage_f asBuildInputFlags = buffer_usage_f::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|buffer_usage_f::EUF_TRANSFER_DST_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; - // we may use the staging buffer directly to skip an extra copy on small enough geometries - if (!params.utilities->getDefaultUpStreamingBuffer()->getBuffer()->getCreationParams().usage.hasFlags(asBuildInputFlags)) - { - logger.log("An Acceleration Structure will be built on Device but Default UpStreaming Buffer from IUtilities doesn't have required usage flags!",system::ILogger::ELL_ERROR); - return retval; - } + constexpr buffer_usage_f asBuildInputFlags = buffer_usage_f::EUF_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; constexpr buffer_usage_f asBuildScratchFlags = buffer_usage_f::EUF_STORAGE_BUFFER_BIT|buffer_usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT; auto* scratchBuffer = params.scratchForDeviceASBuild->getBuffer(); const auto& scratchParams = scratchBuffer->getCachedCreationParams(); @@ -3550,12 +3770,14 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul logger.log("Acceleration Structure Scratch Device Memory Allocator not large enough!",system::ILogger::ELL_ERROR); return retval; } + // this alignment is probably bigger than required by any Build Input const auto minScratchAlignment = device->getPhysicalDevice()->getLimits().minAccelerationStructureScratchOffsetAlignment; if (addrAlloc.max_alignment()(scratchBuffer->getBoundMemory().memory->getMappedPointer()); // Need to use Transfer Queue and copy via staging buffer @@ -3569,25 +3791,40 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul const auto transferFamily = params.transfer->queue->getFamilyIndex(); // But don't want to have to do QFOTs between Transfer and Queue Families then if (transferFamily!=computeFamily) - if (!scratchParams.canBeUsedByQueueFamily(transferFamily)) + if (!scratchParams.isConcurrentSharing() || !scratchParams.canBeUsedByQueueFamily(transferFamily)) { logger.log("Acceleration Structure Scratch Device Memory Allocator not mapped and not concurrently share-able by Transfer Family %d!",system::ILogger::ELL_ERROR,transferFamily); return retval; } - reqQueueFlags |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; + if (!scratchBuffer->getCreationParams().usage.hasFlags(buffer_usage_f::EUF_TRANSFER_DST_BIT)) + { + logger.log("Acceleration Structure Scratch Device Memory Allocator not mapped and doesn't the transfer destination usage flag!",system::ILogger::ELL_ERROR); + return retval; + } + // Right now we copy from staging to scratch, but in the future we may use the staging buffer directly to skip an extra copy on small enough geometries + if 
(!params.utilities->getDefaultUpStreamingBuffer()->getBuffer()->getCreationParams().usage.hasFlags(asBuildInputFlags|buffer_usage_f::EUF_TRANSFER_SRC_BIT)) + { + logger.log("An Acceleration Structure will be built on Device but Default UpStreaming Buffer from IUtilities doesn't have required usage flags!", system::ILogger::ELL_ERROR); + return retval; + } } } // the elusive and exotic host builds - if (reservations.willHostASBuild() && !params.scratchForHostASBuild) + if (reservations.willHostASBuild()) { - logger.log("An Acceleration Structure will be built on the Host but no Scratch Memory Allocator provided!", system::ILogger::ELL_ERROR); - return retval; + if (!params.scratchForHostASBuild) + { + logger.log("An Acceleration Structure will be built on the Host but no Scratch Memory Allocator provided!", system::ILogger::ELL_ERROR); + return retval; + } + // TODO: check everything else when we actually support host builds } // and compacting - if (reservations.willCompactAS() && !params.compactedASAllocator) + if (reservations.willCompactAS()) { - logger.log("An Acceleration Structure will be compacted but no Device Memory Allocator provided!", system::ILogger::ELL_ERROR); - return retval; + if (!params.compactedASAllocator) + logger.log("Acceleration Structures will be compacted using the ILogicalDevice as the memory allocator!", system::ILogger::ELL_WARNING); + // note that can't check the compacted AS allocator being large enough against `reservations.m_compactedASMaxMemory` } // @@ -3627,40 +3864,6 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return retval; } - // - auto findInStaging = [&reservations](const typename asset_traits::video_t* gpuObj)->core::blake3_hash_t* - { - auto& stagingCache = std::get>(reservations.m_stagingCaches); - const auto found = stagingCache.find(const_cast::video_t*>(gpuObj)); - assert(found!=stagingCache.end()); - return const_cast(&found->second.value); - }; - // wipe gpu item in staging cache (this may drop it as well if it was made for only a root asset == no users) - core::unordered_map outputReverseMap; - core::for_each_in_tuple(reservations.m_gpuObjects,[&outputReverseMap](const auto& gpuObjects)->void - { - uint32_t i = 0; - for (const auto& gpuObj : gpuObjects) - outputReverseMap[gpuObj.value.get()] = i++; - } - ); - auto markFailureInStaging = [&reservations,&outputReverseMap,logger](const char* message, smart_refctd_ptr& canonical, const typename asset_traits::video_t* gpuObj, core::blake3_hash_t* hash)->void - { - // wipe the smart pointer to the canonical, make sure we release that memory ASAP if no other user is around - canonical = nullptr; - logger.log("%s failed for \"%s\"",system::ILogger::ELL_ERROR,message,gpuObj->getObjectDebugName()); - // change the content hash on the reverse map to a NoContentHash - *hash = CHashCache::NoContentHash; - // also drop the smart pointer from the output array so failures release memory quickly - const auto foundIx = outputReverseMap.find(gpuObj); - if (foundIx!=outputReverseMap.end()) - { - auto& resultOutput = std::get>(reservations.m_gpuObjects); - resultOutput[foundIx->second].value = nullptr; - outputReverseMap.erase(foundIx); - } - }; - // core::bitflag submitsNeeded = IQueue::FAMILY_FLAGS::NONE; @@ -3712,6 +3915,15 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // some state so we don't need to look later auto xferCmdBuf = shouldDoSomeTransfer ? 
params.transfer->getCommandBufferForRecording():nullptr; + // + auto findInStaging = [&reservations](const typename asset_traits::video_t* gpuObj)->auto + { + auto& stagingCache = std::get>(reservations.m_stagingCaches); + const auto found = stagingCache.find(gpuObj); + assert(found!=stagingCache.end()); + return found; + }; + using buffer_mem_barrier_t = IGPUCommandBuffer::SBufferMemoryBarrier; // upload Buffers auto& buffersToUpload = reservations.m_bufferConversions; @@ -3719,38 +3931,38 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul core::vector finalReleases; finalReleases.reserve(buffersToUpload.size()); // do the uploads - if (!buffersToUpload.empty()) + if (!buffersToUpload.empty() && xferCmdBuf) { xferCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Upload Buffers START"); xferCmdBuf->cmdbuf->endDebugMarker(); } for (auto& item : buffersToUpload) { - auto* buffer = item.gpuObj; - const SBufferRange range = { - .offset = 0, - .size = item.gpuObj->getCreationParams().size, - .buffer = core::smart_refctd_ptr(buffer) - }; - auto pFoundHash = findInStaging.template operator()(buffer); + auto* buffer = item.first; + const size_t size = buffer->getCreationParams().size; + // host will upload + if (canHostWriteToMemoryRange(buffer->getBoundMemory(),size)) + continue; + auto pFound = &findInStaging.template operator()(buffer)->second; // - const auto ownerQueueFamily = checkOwnership(buffer,params.getFinalOwnerQueueFamily(buffer,*pFoundHash),transferFamily); + const auto ownerQueueFamily = checkOwnership(buffer,params.getFinalOwnerQueueFamily(buffer,pFound->cacheKey.value),transferFamily); if (ownerQueueFamily==QueueFamilyInvalid) { - markFailureInStaging("invalid Final Queue Family given by user callback",item.canonical,buffer,pFoundHash); + markFailure("invalid Final Queue Family given by user callback",&item.second,pFound); continue; } // do the upload - const bool success = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,item.canonical->getPointer()); + const SBufferRange range = {.offset=0,.size=size,.buffer=core::smart_refctd_ptr(buffer)}; + const bool success = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,item.second->getPointer()); // current recording buffer may have changed xferCmdBuf = params.transfer->getCommandBufferForRecording(); if (!success) { - markFailureInStaging("Data Upload",item.canonical,buffer,pFoundHash); + markFailure("Data Upload",&item.second,pFound); continue; } // let go of canonical asset (may free RAM) - item.canonical = nullptr; + item.second = nullptr; submitsNeeded |= IQueue::FAMILY_FLAGS::TRANSFER_BIT; // enqueue ownership release if necessary if (ownerQueueFamily!=IQueue::FamilyIgnored) @@ -3767,12 +3979,11 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul .range = range }); } - if (!buffersToUpload.empty()) + if (!buffersToUpload.empty() && xferCmdBuf) { xferCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Upload Buffers END"); xferCmdBuf->cmdbuf->endDebugMarker(); } - buffersToUpload.clear(); // release ownership if (!finalReleases.empty()) pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers=finalReleases},"Ownership Releases of Buffers Failed"); @@ -3782,7 +3993,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // whenever transfer needs to do a submit overflow because it ran out of memory for streaming, we can already submit the recorded compute shader dispatches auto computeCmdBuf = shouldDoSomeCompute ? 
params.compute->getCommandBufferForRecording():nullptr; - auto drainCompute = [¶ms,&computeCmdBuf](const std::span extraSignal={})->auto + auto drainCompute = [¶ms,shouldDoSomeTransfer,&computeCmdBuf](const std::span extraSignal={})->auto { if (!computeCmdBuf || computeCmdBuf->cmdbuf->empty()) return IQueue::RESULT::SUCCESS; @@ -3790,15 +4001,18 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto& waitSemaphoreSpan = params.compute->waitSemaphores; std::unique_ptr patchedWaits; // the transfer scratch semaphore value, is from the last submit, not the future value we're enqueing all the deferred memory releases with - if (waitSemaphoreSpan.empty()) - waitSemaphoreSpan = {¶ms.transfer->scratchSemaphore,1}; - else + if (shouldDoSomeTransfer) { - const auto origCount = waitSemaphoreSpan.size(); - patchedWaits.reset(new IQueue::SSubmitInfo::SSemaphoreInfo[origCount+1]); - std::copy(waitSemaphoreSpan.begin(),waitSemaphoreSpan.end(),patchedWaits.get()); - patchedWaits[origCount] = params.transfer->scratchSemaphore; - waitSemaphoreSpan = {patchedWaits.get(),origCount+1}; + if (waitSemaphoreSpan.empty()) + waitSemaphoreSpan = {¶ms.transfer->scratchSemaphore,1}; + else + { + const auto origCount = waitSemaphoreSpan.size(); + patchedWaits.reset(new IQueue::SSubmitInfo::SSemaphoreInfo[origCount+1]); + std::copy(waitSemaphoreSpan.begin(),waitSemaphoreSpan.end(),patchedWaits.get()); + patchedWaits[origCount] = params.transfer->scratchSemaphore; + waitSemaphoreSpan = {patchedWaits.get(),origCount+1}; + } } // don't worry about resetting old `waitSemaphores` because they get cleared to an empty span after overflow submit IQueue::RESULT res = params.compute->submit(computeCmdBuf,extraSignal); @@ -3810,15 +4024,20 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return IQueue::RESULT::OTHER_ERROR; return res; }; - // compose our overflow callback on top of what's already there, only if we need to ofc - auto origXferStallCallback = params.transfer->overflowCallback; - if (shouldDoSomeCompute) - params.transfer->overflowCallback = [&origXferStallCallback,&drainCompute](const ISemaphore::SWaitInfo& tillScratchResettable)->void + + // We want to be doing Host operations while stalled for GPU, compose our overflow callback on top of what's already there, only if we need to ofc + std::function origXferStallCallback; + if (shouldDoSomeTransfer) + { + origXferStallCallback = std::move(params.transfer->overflowCallback); + params.transfer->overflowCallback = [device,&hostUploadBuffers,&origXferStallCallback,&drainCompute](const ISemaphore::SWaitInfo& tillScratchResettable)->void { drainCompute(); if (origXferStallCallback) origXferStallCallback(tillScratchResettable); + hostUploadBuffers([device,&tillScratchResettable]()->bool{return device->waitForSemaphores({&tillScratchResettable,1},false,0)==ISemaphore::WAIT_RESULT::TIMEOUT;}); }; + } // when overflowing compute resources, we need to submit the Xfer before submitting Compute auto drainBoth = [¶ms,&xferCmdBuf,&drainCompute](const std::span extraSignal={})->auto { @@ -3893,7 +4112,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul return true; }; - // because of the layout transitions + // because of the layout transitions (TODO: conditional when host_image_copy gets implemented) params.transfer->scratchSemaphore.stageMask |= PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; // TODO:: Shall we rewrite? e.g. 
we upload everything first, extra submit for QFOT pipeline barrier & transition in overflow callback, then record compute commands, and submit them, plus their final QFOTs // Lets analyze sync cases: @@ -3910,9 +4129,9 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul for (auto& item : imagesToUpload) { // basiscs - const auto* cpuImg = item.canonical.get(); - auto* image = item.gpuObj; - auto pFoundHash = findInStaging.template operator()(image); + auto& cpuImg = item.second.canonical; + auto* image = item.first; + auto pFound = &findInStaging.template operator()(image)->second; // get params const auto& creationParams = image->getCreationParameters(); const auto format = creationParams.format; @@ -3930,7 +4149,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul }); IGPUImageView::E_TYPE viewType = IGPUImageView::E_TYPE::ET_2D_ARRAY; // create Mipmapping source Image View, allocate its place in the descriptor set and write it - if (item.recomputeMips) + if (item.second.recomputeMips) { switch (creationParams.type) { @@ -3962,7 +4181,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } if (!quickWriteDescriptor(SrcMipBinding,srcIx,std::move(srcView))) { - markFailureInStaging("Source Mip Level Descriptor Write",item.canonical,image,pFoundHash); + markFailure("Source Mip Level Descriptor Write",&cpuImg,pFound); continue; } } @@ -3971,7 +4190,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul { // Transfer and Compute barriers get recorded for image individually (see the TODO why its horrible) // so we only need to worry about QFOTs for current image if they even exist - if (item.recomputeMips && !transferBarriers.empty()) + if (item.second.recomputeMips && !transferBarriers.empty()) { // so now we need a immeidate QFOT Release cause we already recorded some compute mipmapping for current image if (pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=transferBarriers},"Recording QFOT Release from Transfer Queue Family after overflow failed")) @@ -3983,7 +4202,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul } else { - markFailureInStaging("Image QFOT Pipeline Barrier",item.canonical,image,pFoundHash); + markFailure("Image QFOT Pipeline Barrier",&cpuImg,pFound); return false; } return true; @@ -3999,6 +4218,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul computeBarriers.clear(); const bool concurrentSharing = image->getCachedCreationParams().isConcurrentSharing(); uint8_t lvl = 0; + const auto recomputeMipMask = item.second.recomputeMips; bool _prevRecompute = false; for (; lvl CAssetConverter::convert_impl(SReserveResul // if any op, it will always be a release (Except acquisition of first source mip in compute) barrier.ownershipOp = ownership_op_t::RELEASE; // if we're recomputing this mip level - const bool recomputeMip = lvl && (item.recomputeMips&(0x1u<<(lvl-1))); + const bool recomputeMip = lvl && (recomputeMipMask&(0x1u<<(lvl-1))); // query final layout from callback - const auto finalLayout = params.getFinalLayout(image,*pFoundHash,lvl); + const auto finalLayout = params.getFinalLayout(image,pFound->cacheKey.value,lvl); // get region data for upload auto regions = cpuImg->getRegions(lvl); // basic error checks @@ -4042,7 +4262,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul logger.log("What are you doing requesting layout UNDEFINED for mip level % of image %s after Upload or Mip 
Recomputation!?",system::ILogger::ELL_ERROR,lvl,image->getObjectDebugName()); break; } - const auto suggestedFinalOwner = params.getFinalOwnerQueueFamily(image,*pFoundHash,lvl); + const auto suggestedFinalOwner = params.getFinalOwnerQueueFamily(image,pFound->cacheKey.value,lvl); // if we'll recompute the mipmap, then do the layout transition on the compute queue (there's one less potential QFOT) if (recomputeMip) { @@ -4228,7 +4448,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; // whether next mip will need to read from this one to recompute itself - const bool sourceForNextMipCompute = item.recomputeMips&(0x1u<general transition tmp.newLayout = sourceForNextMipCompute ? layout_t::GENERAL : layout_t::TRANSFER_DST_OPTIMAL; // fire off the pipeline barrier so we can start uploading right away @@ -4297,18 +4517,18 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul // failed in the for-loop if (lvl != creationParams.mipLevels) { - markFailureInStaging("Compute Mip Mapping",item.canonical,image,pFoundHash); + markFailure("Compute Mip Mapping",&cpuImg,pFound); continue; } // let go of canonical asset (may free RAM) - item.canonical = nullptr; + cpuImg = nullptr; } // here we only record barriers that do final layout transitions and release ownership to final queue family if (!transferBarriers.empty()) { if (!pipelineBarrier(xferCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=transferBarriers},"Final Pipeline Barrier recording to Transfer Command Buffer failed")) { - markFailureInStaging("Image Data Upload Pipeline Barrier",item.canonical,image,pFoundHash); + markFailure("Image Data Upload Pipeline Barrier",&cpuImg,pFound); continue; } // even if no uploads performed, we do layout transitions on empty images from Xfer Queue @@ -4320,7 +4540,7 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul dsAlloc->multi_deallocate(SrcMipBinding,1,&srcIx,params.compute->getFutureScratchSemaphore()); if (!pipelineBarrier(computeCmdBuf,{.memBarriers={},.bufBarriers={},.imgBarriers=computeBarriers},"Final Pipeline Barrier recording to Compute Command Buffer failed")) { - markFailureInStaging("Compute Mip Mapping Pipeline Barrier",item.canonical,image,pFoundHash); + markFailure("Compute Mip Mapping Pipeline Barrier",&cpuImg,pFound); continue; } } @@ -4344,258 +4564,302 @@ ISemaphore::future_t CAssetConverter::convert_impl(SReserveResul auto& tlasesToBuild = reservations.m_tlasConversions[0]; const auto blasCount = blasesToBuild.size(); const auto tlasCount = tlasesToBuild.size(); - const auto maxASCount = hlsl::max(tlasCount,blasCount); ownershipTransfers.reserve(blasCount+tlasCount); - auto* scratchBuffer = params.scratchForDeviceASBuild->getBuffer(); - core::vector flushRanges; - const bool manualFlush = scratchBuffer->getBoundMemory().memory->haveToMakeVisible(); - if (manualFlush) // BLAS builds do max 3 writes each TLAS builds do max 2 writes each - flushRanges.reserve(hlsl::max(blasCount*3,tlasCount*2)); // Right now we build all BLAS first, then all TLAS // (didn't fancy horrible concurrency managment taking compactions into account) auto queryPool = device->createQueryPool({.queryCount=hlsl::max(blasCount,tlasCount),.queryType=IQueryPool::ACCELERATION_STRUCTURE_COMPACTED_SIZE}); - const asset::SMemoryBarrier readGeometryOrInstanceInASBuildBarrier = { - // the last use of the source BLAS could have been a build or a compaction 
- .srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, - .srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT, - .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, - .dstAccessMask = ACCESS_FLAGS::STORAGE_READ_BIT - }; - // lambdas! - auto streamDataToScratch = [&](const size_t offset, const size_t size,IUtilities::IUpstreamingDataProducer& callback) -> bool - { - if (deviceASBuildScratchPtr) - { - callback(deviceASBuildScratchPtr+offset,0ull,size); - if (manualFlush) - flushRanges.emplace_back(scratchBuffer->getBoundMemory().memory,offset,size,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); - return true; - } - else if (const SBufferRange range={.offset=offset,.size=size,.buffer=smart_refctd_ptr(scratchBuffer)}; params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,callback)) - return true; - else - return false; - }; - // - auto recordBuildCommandsBase = [&](auto& buildInfos, auto& rangeInfos)->void - { - if (buildInfos.empty()) - return; - // Lets analyze sync cases: - // - Mapped Host write = no barrier, flush & optional submit sufficient - // - Single Queue = Global Memory Barrier - // - Two distinct Queues = no barrier, semaphore signal-wait is sufficient - // - Two distinct Queue Families Exclusive Sharing mode = QFOT necessary but we require concurrent sharing on the scratch buffer ! - bool success = !uniQueue || !deviceASBuildScratchPtr || pipelineBarrier(computeCmdBuf,{.memBarriers={&readGeometryOrInstanceInASBuildBarrier,1}},"Pipeline Barriers of Acceleration Structure backing Buffers failed!"); - // - success = success && computeCmdBuf->cmdbuf->buildAccelerationStructures({buildInfos},rangeInfos.data()); - if (!success) - for (const auto& info : buildInfos) - { - const auto pFoundHash = findInStaging.template operator()(info.dstAS); - smart_refctd_ptr dummy; // already null at this point - markFailureInStaging("AS Build Command Recording",dummy,info.dstAS,pFoundHash); - } - buildInfos.clear(); - rangeInfos.clear(); - }; - - // Not messing around with listing AS backing buffers individually, ergonomics of that are null - const asset::SMemoryBarrier readASInASCompactBarrier = { - .srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, - .srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT, - .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT, - .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT - }; - - // Device BLAS builds - if (blasCount) - { - core::vector compactions; - // build - { - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build BLASes START"); - computeCmdBuf->cmdbuf->endDebugMarker(); -#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION - constexpr auto GeometryIsAABBFlag = ICPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; - - core::vector buildInfos; buildInfos.reserve(blasCount); - core::vector rangeInfo; rangeInfo.reserve(blasCount); - core::vector> triangles; - core::vector> aabbs; - { - size_t totalTriGeoCount = 0; - size_t totalAABBGeoCount = 0; - for (auto& item : blasToBuild) - { - const size_t geoCount = item.canonical->getGeometryCount(); - if (item.canonical->getBuildFlags().hasFlags(GeometryIsAABBFlag)) - totalAABBGeoCount += geoCount; - else - totalTriGeoCount += geoCount; - } - triangles.reserve(totalTriGeoCount); - triangles.reserve(totalAABBGeoCount); - } - for (auto& item : blasToBuild) + // leftover for TLAS builds + using compacted_blas_map_t = unordered_map>; + compacted_blas_map_t compactedBLASMap; + bool 
failedBLASBarrier = false; + // returns a map of compacted Acceleration Structures + auto buildAndCompactASes = [&](auto& asesToBuild)->unordered_map> { - auto* as = item.gpuObj; - auto pFoundHash = findInStaging.template operator()(as); - if (item.asBuildParams.host) - { - auto dOp = device->createDeferredOperation(); - // - if (!device->buildAccelerationStructure(dOp.get(),info,range)) - { - markFailureInStaging("BLAS Build Command Recording",item.canonical,gpuObj,pFoundHash); - continue; - } - } - else - { - auto& buildInfo = buildInfo.emplace_back({ - .buildFlags = item.buildFlags, - .geometryCount = item.canonical->getGeometryCount(), - // this is not an update - .srcAS = nullptr, - .dstAS = as.get() - }); - if (item.canonical->getBuildFlags().hasFlags(GeometryIsAABBFlag)) - buildInfo.aabbs = nullptr; - else - buildInfo.triangles = nullptr; - computeCmdBuf->cmdbuf->buildAccelerationStructures(buildInfo,rangeInfo); - } - } -#endif - if (!compactions.empty()) - { - // submit cause host needs to read the queries - drainCompute(); - } - // want to launch the BLAS builds in a separate submit, so the scratch semaphore can signal and free the scratch so more is available for TLAS builds - else if (tlasCount) - drainCompute(); - blasesToBuild.clear(); - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build BLASes END"); - computeCmdBuf->cmdbuf->endDebugMarker(); - } - // compact - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact BLASes START"); - computeCmdBuf->cmdbuf->endDebugMarker(); - { - // the already compacted BLASes need to be written into the TLASes using them, want to swap them out ASAP -//reservations.m_blasBuildMap[canonical].gpuBLAS = compacted; - } - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact BLASes END"); - computeCmdBuf->cmdbuf->endDebugMarker(); - } + const auto asCount = asesToBuild.size(); + if (asCount==0) + return {}; + + constexpr bool IsTLAS = std::is_same_v; + using CPUAccelerationStructure = std::conditional_t; - // Device TLAS builds - if (tlasCount) - { - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build TLASes START"); - computeCmdBuf->cmdbuf->endDebugMarker(); - // A single pipeline barrier to ensure BLASes build before TLASes is needed - const asset::SMemoryBarrier readBLASInTLASBuildBarrier = { - // the last use of the source BLAS could have been a build or a compaction - .srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT, - .srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT, - .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, - .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT - }; - // either we built no BLASes (remember we could retrieve already built ones from cache) or we barrier for the previous compactions or builds - const bool failedBLASBarrier = blasCount && !pipelineBarrier(computeCmdBuf,{.memBarriers={&readBLASInTLASBuildBarrier,1}},"Failed to sync BLAS with TLAS build!"); - // TLAS compactions to do later core::vector compactions; // 0xffFFffFFu when not releasing ownership, otherwise index into `ownershipTransfers` where the ownership release for the old buffer was core::vector compactedOwnershipReleaseIndices; - compactions.reserve(tlasCount); - compactedOwnershipReleaseIndices.reserve(tlasCount); + compactions.reserve(asCount); + compactedOwnershipReleaseIndices.reserve(asCount); // build { - // - core::vector buildInfos; - buildInfos.reserve(tlasCount); - 
core::vector rangeInfos; - rangeInfos.reserve(tlasCount); - core::vector> trackedBLASes; - trackedBLASes.reserve(maxASCount); - auto recordBuildCommands = [&]()->void + auto* scratchBuffer = params.scratchForDeviceASBuild->getBuffer(); + core::vector flushRanges; + const bool manualFlush = scratchBuffer->getBoundMemory().memory->haveToMakeVisible(); + if (deviceASBuildScratchPtr && manualFlush) // TLAS builds do max 2 writes each and BLAS do much more anyway + flushRanges.reserve(asCount*2); + // lambdas! + auto streamDataToScratch = [&](const size_t offset, const size_t size,IUtilities::IUpstreamingDataProducer& callback) -> bool { - // rewrite the trackedBLASes pointers - for (auto& info : buildInfos) + if (deviceASBuildScratchPtr) { - const auto offset = info.trackedBLASes.data(); - const auto correctPtr = trackedBLASes.data()+reinterpret_cast(offset); - info.trackedBLASes = {reinterpret_cast(correctPtr),info.trackedBLASes.size()}; + callback(deviceASBuildScratchPtr+offset,0ull,size); + if (manualFlush) + flushRanges.emplace_back(scratchBuffer->getBoundMemory().memory,offset,size,ILogicalDevice::MappedMemoryRange::align_non_coherent_tag); + return true; + } + else + { + const SBufferRange range={.offset=offset,.size=size,.buffer=smart_refctd_ptr(scratchBuffer)}; + const bool retval = params.utilities->updateBufferRangeViaStagingBuffer(*params.transfer,range,callback); + // current recording buffer may have changed + xferCmdBuf = params.transfer->getCommandBufferForRecording(); + return retval; } - recordBuildCommandsBase(buildInfos,rangeInfos); - trackedBLASes.clear(); }; // + core::vector buildInfos; + buildInfos.reserve(asCount); + using build_range_info_t = std::conditional_t; + core::vector rangeInfos; + rangeInfos.reserve(asCount); using scratch_allocator_t = std::remove_reference_t; using addr_t = typename scratch_allocator_t::size_type; + core::vector allocOffsets; + allocOffsets.reserve(asCount); + core::vector allocSizes; + allocSizes.reserve(asCount); + // BLAS and TLAS specific things + core::vector geometryRangeInfo; + core::vector> triangles; + core::vector> aabbs; + core::vector> trackedBLASes; + if constexpr (IsTLAS) + trackedBLASes.reserve(asCount); + else // would have to count total geometries in BLASes to initialize properly, and we probably don't want to over-reserve + { + geometryRangeInfo.reserve(asCount); + triangles.reserve(asCount); + aabbs.reserve(asCount); + } + // + core::vector alignments; + alignments.reserve(asCount*2); + constexpr auto GeometryIsAABBFlag = IGPUBottomLevelAccelerationStructure::BUILD_FLAGS::GEOMETRY_TYPE_IS_AABB_BIT; + auto recordBuildCommands = [&]()->void + { + bool success = !buildInfos.empty(); + // Lets analyze sync cases: + // - Mapped Host write = no barrier, flush & optional submit sufficient + // - Single Queue = Global Memory Barrier + // - Two distinct Queues = no barrier, semaphore signal-wait is sufficient + // - Two distinct Queue Families Exclusive Sharing mode = QFOT necessary but we require concurrent sharing on the scratch buffer ! 
+ if (success) + { + const asset::SMemoryBarrier readGeometryOrInstanceInASBuildBarrier = { + // the last use of the source BLAS could have been a build or a compaction + .srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + .srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT, + .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, + .dstAccessMask = ACCESS_FLAGS::STORAGE_READ_BIT + }; + success = !uniQueue || deviceASBuildScratchPtr || pipelineBarrier(computeCmdBuf,{.memBarriers={&readGeometryOrInstanceInASBuildBarrier,1}},"Pipeline Barriers of Acceleration Structure backing Buffers failed!"); + } + // + constexpr bool IsTLAS = std::is_same_v; + if (success) + { + // rewrite the based pointers + if constexpr (IsTLAS) + for (auto& info : buildInfos) + { + const auto offset = info.trackedBLASes.data(); + const auto correctPtr = trackedBLASes.data()+reinterpret_cast(offset); + info.trackedBLASes = {reinterpret_cast(correctPtr),info.trackedBLASes.size()}; + } + else + { + for (auto& info : buildInfos) + { + if (info.buildFlags.hasFlags(GeometryIsAABBFlag)) + info.aabbs = aabbs.data()+reinterpret_cast(info.aabbs); + else + info.triangles = triangles.data()+reinterpret_cast(info.triangles); + } + for (auto& rangeInfo : rangeInfos) + rangeInfo = geometryRangeInfo.data()+reinterpret_cast(rangeInfo); + } + success = computeCmdBuf->cmdbuf->buildAccelerationStructures({buildInfos},rangeInfos.data()); + } + // account for the in-progress allocation (we may be called from an overflow submit) + const auto oldAllocCount = allocOffsets.size()-alignments.size(); + if (success) + { + submitsNeeded |= IQueue::FAMILY_FLAGS::COMPUTE_BIT; + // queue up a deferred allocation + if (oldAllocCount) + params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data(),params.compute->getFutureScratchSemaphore()); + } + else + { + // release right away + if (oldAllocCount) + params.scratchForDeviceASBuild->multi_deallocate(oldAllocCount,allocOffsets.data(),allocSizes.data()); + for (const auto& info : buildInfos) + { + const auto stagingFound = findInStaging.template operator()(info.dstAS); + smart_refctd_ptr dummy; // already null at this point + markFailure("AS Build Command Recording",&dummy,&stagingFound->second); + } + } + allocOffsets.erase(allocOffsets.begin(),allocOffsets.begin()+oldAllocCount); + allocSizes.erase(allocSizes.begin(),allocSizes.begin()+oldAllocCount); + buildInfos.clear(); + rangeInfos.clear(); + if constexpr (IsTLAS) + trackedBLASes.clear(); + else + { + geometryRangeInfo.clear(); + triangles.clear(); + aabbs.clear(); + } + }; + + computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build Acceleration Structures START"); + computeCmdBuf->cmdbuf->endDebugMarker(); const auto& limits = physDev->getLimits(); - core::unordered_set> dedupBLASesUsed; - dedupBLASesUsed.reserve(reservations.m_blasBuildMap.size()); - for (auto& tlasToBuild : tlasesToBuild) + for (auto& asToBuild : asesToBuild) { - dedupBLASesUsed.clear(); - const auto as = tlasToBuild.gpuObj; - const auto pFoundHash = findInStaging.template operator()(as); + auto& canonical = asToBuild.second.canonical; + const auto as = asToBuild.first; + const auto pFound = &findInStaging.template operator()(as)->second; const auto& backingRange = as->getCreationParams().bufferRange; // checking ownership for the future on old buffer, but compacted will be made with same sharing creation parameters - const auto finalOwnerQueueFamily = 
checkOwnership(backingRange.buffer.get(),params.getFinalOwnerQueueFamily(as,*pFoundHash),computeFamily); + const auto finalOwnerQueueFamily = checkOwnership(backingRange.buffer.get(),params.getFinalOwnerQueueFamily(as,pFound->cacheKey.value),computeFamily); if (finalOwnerQueueFamily==QueueFamilyInvalid) { - markFailureInStaging("invalid Final Queue Family given by user callback",tlasToBuild.canonical,as,pFoundHash); + markFailure("invalid Final Queue Family given by user callback",&canonical,pFound); continue; } - const auto instances = tlasToBuild.canonical->getInstances(); - const auto instanceCount = static_cast(instances.size()); - size_t instanceDataSize = 0; - // gather total input size and check dependants exist - for (const auto& instance : instances) + // clean up the allocation if we fail to make it to the end of loop for whatever reason + alignments.clear(); + auto allocCount = 0; + auto deallocSrc = core::makeRAIIExiter([¶ms,&allocOffsets,&allocSizes,&alignments,&allocCount]()->void + { + const auto beginIx = allocSizes.size()-allocCount; + // if got to end of loop queue up the release of memory, otherwise release right away + if (allocCount) + params.scratchForDeviceASBuild->multi_deallocate(allocCount,allocOffsets.data()+beginIx,allocSizes.data()+beginIx); + allocOffsets.resize(beginIx); + allocSizes.resize(beginIx); + alignments.clear(); + } + ); + allocSizes.push_back(asToBuild.second.scratchSize); + alignments.push_back(limits.minAccelerationStructureScratchOffsetAlignment); + const bitflag buildFlags = asToBuild.second.getBuildFlags(); + if constexpr (IsTLAS) { - // failed BLAS builds erase themselves from this map, so this checks if some BLAS used but which had to be built failed the build - const auto found = reservations.m_blasBuildMap.find(instance.getBase().blas.get()); - if (found==reservations.m_blasBuildMap.end() || failedBLASBarrier && found->second.buildDuringConvertCall) + const auto instances = canonical->getInstances(); + // gather total input size and check dependants exist + size_t instanceDataSize = 0; + bool dependsOnBLASBuilds = false; + const auto& instanceMap = asToBuild.second.instanceMap; + for (const auto& instance : instances) { - instanceDataSize = 0; - break; + auto found = instanceMap.find(instance.getBase().blas.get()); + assert(instanceMap.end()!=found); + const auto depInfo = missingDependent.template operator()(found->second.get()); + if (depInfo) + { + instanceDataSize = 0; + break; + } + if (depInfo.wasInStaging) + dependsOnBLASBuilds = true; + instanceDataSize += ITopLevelAccelerationStructure::getInstanceSize(instance.getType()); + } + // problem with building some Dependent BLASes + if (failedBLASBarrier && dependsOnBLASBuilds) + { + markFailure("building BLASes which current TLAS build wants to instance",&canonical,pFound); + continue; + } + // problem with finding the dependents (BLASes) + if (instanceDataSize==0) + { + markFailure("finding valid Dependant GPU BLASes for TLAS build",&canonical,pFound); + continue; + } + allocSizes.push_back(instanceDataSize); + alignments.push_back(16); + if (as->usesMotion()) + { + allocSizes.push_back(sizeof(void*)*instances.size()); + alignments.push_back(alignof(uint64_t)); } - instanceDataSize += ITopLevelAccelerationStructure::getInstanceSize(instance.getType()); } - // problem with finding the dependents (BLASes) - if (instanceDataSize==0) + else { - markFailureInStaging("finding valid Dependant GPU BLASes for TLAS build",tlasToBuild.canonical,as,pFoundHash); - continue; + const uint32_t* 
pPrimitiveCounts = canonical->getGeometryPrimitiveCounts().data(); + if (buildFlags.hasFlags(GeometryIsAABBFlag)) + { + for (const auto& geom : canonical->getAABBGeometries()) + if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) + { + allocSizes.push_back(aabbCount*geom.stride); + alignments.push_back(alignof(float)); + } + } + else + { + for (const auto& geom : canonical->getTriangleGeometries()) + if (const auto triCount=*(pPrimitiveCounts++); triCount) + { + auto size = geom.vertexStride*(geom.vertexData[1] ? 2:1)*(geom.maxVertex+1); + uint16_t alignment = hlsl::max(0x1u<(alignof(float),alignment); + } + uint16_t indexSize = 0u; + switch (geom.indexType) + { + case E_INDEX_TYPE::EIT_16BIT: + indexSize = alignof(uint16_t); + break; + case E_INDEX_TYPE::EIT_32BIT: + indexSize = alignof(uint32_t); + break; + default: + break; + } + if (indexSize) + { + size = core::alignUp(size,indexSize)+triCount*3*indexSize; + alignment = hlsl::max(indexSize,alignment); + } + allocSizes.push_back(size); + alignments.push_back(alignment); + const auto tmp = asToBuild.second.scratchSize; + //logger.log("%p Triangle Data Size %d Align %d Scratch Size %d",system::ILogger::ELL_DEBUG,canonical.get(),size,alignment,tmp); + } + } } - // allocate scratch and build inputs - constexpr uint32_t MaxAllocCount = 3; - addr_t offsets[MaxAllocCount] = {scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value,scratch_allocator_t::invalid_value}; - const addr_t sizes[MaxAllocCount] = {tlasToBuild.scratchSize,instanceDataSize,sizeof(void*)*instanceCount}; + allocOffsets.resize(allocSizes.size(),scratch_allocator_t::invalid_value); + // allocate out scratch or submit overflow, if fail then flush and keep trying till space is made + auto* offsets = allocOffsets.data()+allocOffsets.size()-alignments.size(); + const auto* sizes = allocSizes.data()+allocSizes.size()-alignments.size(); + //logger.log("%p Combined Size %d",system::ILogger::ELL_DEBUG,canonical.get(),std::accumulate(sizes,sizes+alignments.size(),0)); + for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(alignments.size(),offsets,sizes,alignments.data())!=0; t++) { - const addr_t alignments[MaxAllocCount] = {limits.minAccelerationStructureScratchOffsetAlignment,16,8}; -/* TODO: move to reserve phase - prevent CPU hangs by making sure allocator big enough to service us -{ -addr_t worstSize = sizes[0]; -for (auto i=1u; iminScratchSize) - minScratchSize = worstSize; -}*/ - const auto AllocCount = as->usesMotion() ? 
2:3; - // if fail then flush and keep trying till space is made - for (uint32_t t=0; params.scratchForDeviceASBuild->multi_allocate(AllocCount,&offsets[0],&sizes[0],&alignments[0])!=0u; t++) if (t==1) // don't flush right away cause allocator not defragmented yet { recordBuildCommands(); + // the submit overflow deallocates old offsets and erases them from the temp arrays, pointer changes + offsets = allocOffsets.data(); + sizes = allocSizes.data(); // if writing to scratch directly, flush the writes if (!flushRanges.empty()) { @@ -4604,13 +4868,28 @@ if (worstSize>minScratchSize) } drainCompute(); } - // queue up a deferred allocation - params.scratchForDeviceASBuild->multi_deallocate(AllocCount,&offsets[0],&sizes[0],params.compute->getFutureScratchSemaphore()); + // we may be preventing ourselves from allocating memory, with one successful allocation still being alive and fragmenting our allocator + params.scratchForDeviceASBuild->multi_deallocate(alignments.size(),offsets,sizes); + std::fill_n(offsets,alignments.size(),scratch_allocator_t::invalid_value); } - // stream the instance/geometry input in + // now upon a failure, our allocations will need to be deallocated + allocCount = alignments.size(); + // prepare build infos + typename AccelerationStructure::DeviceBuildInfo buildInfo; + buildInfo.scratch = {.offset=offsets[0],.buffer=smart_refctd_ptr(scratchBuffer)}; + buildInfo.buildFlags = buildFlags; + buildInfo.dstAS = as; + // abortion backup + bool success = true; + const auto geometryRangeInfoOffset = geometryRangeInfo.size(); + const auto trianglesOffset = triangles.size(); + const auto aabbsOffset = aabbs.size(); + const size_t trackedBLASesOffset = trackedBLASes.size(); + if constexpr (IsTLAS) { - bool success = true; -// TODO: make sure the overflow submit work callback is doing some CPU work + const auto instances = canonical->getInstances(); + const auto instanceCount = static_cast(instances.size()); + // stream the instance/geometry input in { struct FillInstances : IUtilities::IUpstreamingDataProducer { @@ -4620,35 +4899,39 @@ if (worstSize>minScratchSize) assert(offsetInRange%16==0); uint32_t bytesWritten = 0; - while (true) + while (instanceIndex=blockSize) - return bytesWritten; - auto found = blasBuildMap->find(instance.getBase().blas.get()); - assert(found!=blasBuildMap->end()); - const auto& blas = found->second.gpuBLAS; - dst = IGPUTopLevelAccelerationStructure::writeInstance(dst,instance,blas.get()->getReferenceForDeviceOperations()); - dedupBLASesUsed->emplace(blas); - if (--found->second.remainingUsages == 0) - blasBuildMap->erase(found); + if (newWritten>blockSize) + break; + auto found = instanceMap->find(instance.getBase().blas.get()); + auto blas = found->second.get(); + if (auto found=compactedBLASMap->find(blas); found!=compactedBLASMap->end()) + blas = found->second.get(); + trackedBLASes->emplace_back(blas); + dst = IGPUTopLevelAccelerationStructure::writeInstance(dst,instance,blas->getReferenceForDeviceOperations()); bytesWritten = newWritten; } + return bytesWritten; } - SReserveResult::cpu_to_gpu_blas_map_t* blasBuildMap; - core::unordered_set>* dedupBLASesUsed; + const compacted_blas_map_t* compactedBLASMap; + core::vector>* trackedBLASes; + SReserveResult::SConvReqTLAS::cpu_to_gpu_blas_map_t* instanceMap; std::span instances; uint32_t instanceIndex = 0; }; FillInstances fillInstances; - fillInstances.blasBuildMap = &reservations.m_blasBuildMap; - fillInstances.dedupBLASesUsed = &dedupBLASesUsed; + fillInstances.compactedBLASMap = &compactedBLASMap; 
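// [Editor's illustrative sketch — not part of this patch] The FillInstances producer above appears to
// follow IUtilities::IUpstreamingDataProducer's chunked-upload contract: the utility repeatedly hands
// the producer a mapped block of the scratch buffer, the producer writes as many whole items as fit,
// returns the byte count, and resumes from its saved index on the next block. A stand-alone analogue
// of that contract, using only the C++ standard library and hypothetical names (not Nabla API), could
// look like the following; it is meant as a reading aid, not as lines to splice into this function:
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

struct ChunkedItemProducer
{
    // Writes whole items into `dst` until `blockSize` would be exceeded and returns the bytes written;
    // `nextItem` persists across calls so the next mapped block continues where this one stopped.
    uint32_t operator()(void* dst, uint32_t blockSize)
    {
        uint32_t written = 0;
        auto* out = static_cast<unsigned char*>(dst);
        while (nextItem < items.size())
        {
            constexpr uint32_t itemSize = sizeof(uint64_t);
            if (written + itemSize > blockSize)
                break; // not enough room left, resume on the next block
            std::memcpy(out + written, &items[nextItem], itemSize);
            written += itemSize;
            ++nextItem;
        }
        return written;
    }

    std::vector<uint64_t> items;
    std::size_t nextItem = 0;
};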
+ fillInstances.trackedBLASes = &trackedBLASes; + fillInstances.instanceMap = &asToBuild.second.instanceMap; fillInstances.instances = instances; success = streamDataToScratch(offsets[1],sizes[1],fillInstances); + // provoke refcounting bugs right away + asToBuild.second.instanceMap.clear(); } if (success && as->usesMotion()) { @@ -4678,43 +4961,130 @@ if (worstSize>minScratchSize) fillInstancePointers.instanceAddress = scratchBuffer->getDeviceAddress()+offsets[1]; success = streamDataToScratch(offsets[2],sizes[2],fillInstancePointers); } - // current recording buffer may have changed - xferCmdBuf = params.transfer->getCommandBufferForRecording(); - if (!success) + // + buildInfo.instanceDataTypeEncodedInPointersLSB = as->usesMotion(); + // note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones + buildInfo.instanceData = {.offset=offsets[as->usesMotion() ? 2:1],.buffer=smart_refctd_ptr(scratchBuffer)}; + // be based cause vectors can grow + using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**; + buildInfo.trackedBLASes = {reinterpret_cast(trackedBLASesOffset),trackedBLASes.size()-trackedBLASesOffset}; + // no special extra byte offset into the instance buffer + rangeInfos.emplace_back(instanceCount,0u); + } + else + { + buildInfo.geometryCount = canonical->getGeometryCount(); + const auto* offsetIt = offsets+1; + const auto* sizeIt = sizes+1; + const auto primitiveCounts = canonical->getGeometryPrimitiveCounts(); + for (const auto count : primitiveCounts) + geometryRangeInfo.push_back({ + .primitiveCount = count, + .primitiveByteOffset = 0, + .firstVertex = 0, + .transformByteOffset = 0 + }); + const uint32_t* pPrimitiveCounts = primitiveCounts.data(); + IUtilities::CMemcpyUpstreamingDataProducer memcpyCallback; + if (buildFlags.hasFlags(GeometryIsAABBFlag)) { - markFailureInStaging("Uploading Instance Data for TLAS build failed",tlasToBuild.canonical,as,pFoundHash); - continue; + for (const auto& geom : canonical->getAABBGeometries()) + if (const auto aabbCount=*(pPrimitiveCounts++); aabbCount) + { + auto offset = *(offsetIt++); + memcpyCallback.data = reinterpret_cast(geom.data.buffer->getPointer())+geom.data.offset; + if (!streamDataToScratch(offset,*(sizeIt++),memcpyCallback)) + break; + aabbs.push_back({ + .data = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}, + .stride = geom.stride, + .geometryFlags = geom.geometryFlags + }); + } + buildInfo.aabbs = reinterpret_cast* const&>(aabbsOffset); + } + else + { + for (const auto& geom : canonical->getTriangleGeometries()) + if (const auto triCount=*(pPrimitiveCounts++); triCount) + { + auto& outGeom = triangles.emplace_back(); + const auto origSize = *(sizeIt++); + const auto origOffset = *(offsetIt++); + auto offset = origOffset; + auto size = geom.vertexStride*(geom.maxVertex+1); + for (auto i=0; i<2; i++) + if (geom.vertexData[i]) // could assert that it must be true for i==0 + { + outGeom.vertexData[i] = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + memcpyCallback.data = reinterpret_cast(geom.vertexData[i].buffer->getPointer())+geom.vertexData[i].offset; + if (!streamDataToScratch(offset,size,memcpyCallback)) + break; + offset += size; + } + if (geom.hasTransform()) + { + offset = core::alignUp(offset,alignof(float)); + outGeom.transform = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + memcpyCallback.data = &geom.transform; + if 
(!streamDataToScratch(offset,sizeof(geom.transform),memcpyCallback)) + break; + offset += sizeof(geom.transform); + } + switch (geom.indexType) + { + case E_INDEX_TYPE::EIT_16BIT: [[fallthrough]]; + case E_INDEX_TYPE::EIT_32BIT: + { + const auto alignment = geom.indexType==E_INDEX_TYPE::EIT_16BIT ? alignof(uint16_t):alignof(uint32_t); + offset = core::alignUp(offset,alignment); + outGeom.indexData = {.offset=offset,.buffer=smart_refctd_ptr(scratchBuffer)}; + size = triCount*3*alignment; + memcpyCallback.data = reinterpret_cast(geom.indexData.buffer->getPointer())+geom.indexData.offset; + success = streamDataToScratch(offset,size,memcpyCallback); + offset += size; + break; + } + default: + break; + } + assert(offset-origOffset<=origSize); + if (!success) + break; + outGeom.maxVertex = geom.maxVertex; + outGeom.vertexStride = geom.vertexStride; + outGeom.vertexFormat = geom.vertexFormat; + outGeom.indexType = geom.indexType; + outGeom.geometryFlags = geom.geometryFlags; + } + buildInfo.triangles = reinterpret_cast* const&>(trianglesOffset); } - // let go of canonical asset (may free RAM) - tlasToBuild.canonical = nullptr; + success = pPrimitiveCounts==primitiveCounts.data()+primitiveCounts.size(); + rangeInfos.push_back(reinterpret_cast(geometryRangeInfoOffset)); } - // prepare build infos - auto& buildInfo = buildInfos.emplace_back(); - buildInfo.scratch = {.offset=offsets[0],.buffer=smart_refctd_ptr(scratchBuffer)}; - buildInfo.buildFlags = tlasToBuild.getBuildFlags(); - buildInfo.instanceDataTypeEncodedInPointersLSB = as->usesMotion(); - buildInfo.dstAS = as; - // note we don't build directly from staging, because only very small inputs could come from there and they'd impede the transfer efficiency of the larger ones - buildInfo.instanceData = {.offset=offsets[as->usesMotion() ? 
2:1],.buffer=smart_refctd_ptr(scratchBuffer)}; - // be based cause vectors can grow + if (!success) { - const auto offset = trackedBLASes.size(); - using p_p_BLAS_t = const IGPUBottomLevelAccelerationStructure**; - buildInfo.trackedBLASes = {reinterpret_cast(offset),dedupBLASesUsed.size()}; - for (auto& blas : dedupBLASesUsed) - trackedBLASes.emplace_back(std::move(blas)); - + rangeInfos.resize(buildInfos.size()); + geometryRangeInfo.resize(geometryRangeInfoOffset); + triangles.resize(trianglesOffset); + aabbs.resize(aabbsOffset); + trackedBLASes.resize(trackedBLASesOffset); + markFailure("Uploading Input Data for Accleration Structure build failed",&canonical,pFound); + continue; } - // no special extra byte offset into the instance buffer - rangeInfos.emplace_back(instanceCount,0u); + buildInfos.emplace_back(std::move(buildInfo)); + allocCount = 0; + // let go of canonical asset (may free RAM) + canonical = nullptr; // - const bool willCompact = tlasToBuild.compact(); + const bool willCompact = asToBuild.second.compact; if (willCompact) compactions.push_back(as); // enqueue ownership release if necessary if (finalOwnerQueueFamily!=IQueue::FamilyIgnored) { - compactedOwnershipReleaseIndices.push_back(ownershipTransfers.size()); + if (willCompact) + compactedOwnershipReleaseIndices.push_back(ownershipTransfers.size()); ownershipTransfers.push_back({ .barrier = { .dep = { @@ -4728,136 +5098,186 @@ if (worstSize>minScratchSize) .range = backingRange }); } - else + else if (willCompact) compactedOwnershipReleaseIndices.push_back(~0u); } - reservations.m_blasBuildMap.clear(); // finish the last batch recordBuildCommands(); + computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Build Acceleration Structures END"); + computeCmdBuf->cmdbuf->endDebugMarker(); + // provoke refcounting bugs + asesToBuild.clear(); + // flush all ranged before potential submit if (!flushRanges.empty()) { device->flushMappedMemoryRanges(flushRanges); flushRanges.clear(); } - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact TLASes END"); - computeCmdBuf->cmdbuf->endDebugMarker(); } - tlasesToBuild.clear(); - // compact - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact TLASes START"); - computeCmdBuf->cmdbuf->endDebugMarker(); - // compact needs to wait for Build then record queries + + // Not messing around with listing AS backing buffers individually, ergonomics of that are null + const asset::SMemoryBarrier readASInASCompactBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, + .srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT, + // TODO: do queries or query retrieval have a stage? 
+ .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT, + .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT + }; if (!compactions.empty() && pipelineBarrier(computeCmdBuf,{.memBarriers={&readASInASCompactBarrier,1}},"Failed to sync Acceleration Structure builds with compactions!") && computeCmdBuf->cmdbuf->resetQueryPool(queryPool.get(),0,compactions.size()) && computeCmdBuf->cmdbuf->writeAccelerationStructureProperties(compactions,IQueryPool::TYPE::ACCELERATION_STRUCTURE_COMPACTED_SIZE,queryPool.get(),0) ) { - // submit cause host needs to read the queries - drainCompute(); + // clean AS builds, pipeline barrier, query reset and writes need to get executed before we start waiting on the results + drainBoth(); // get queries core::vector sizes(compactions.size()); - if (device->getQueryPoolResults( - queryPool.get(),0,compactions.size(),sizes.data(),sizeof(size_t), - bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT)|IQueryPool::RESULTS_FLAGS::_64_BIT - )) + if (!device->getQueryPoolResults(queryPool.get(),0,compactions.size(),sizes.data(),sizeof(size_t),bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT)|IQueryPool::RESULTS_FLAGS::_64_BIT)) { - auto logFail = [logger](const char* msg, const IGPUAccelerationStructure* as)->void - { - logger.log("Failed to %s for \"%s\"", system::ILogger::ELL_ERROR,as->getObjectDebugName()); - }; - // TODO: normally we'd iteratively record as many compactions as we can, but we don't have a mechanism to release already compacted TLASes, we'd need to defer the writing of the TLAS to the Descriptor Set till later - // create and allocate backing buffers for compacted TLASes - core::vector> backingBuffers(compactions.size()); + logger.log("Failed to Query %sLevelAccelerationStructure compacted sizes, skipping compaction!",system::ILogger::ELL_ERROR,IsTLAS ? "Top":"Bottom"); + return {}; + } + // + auto logFail = [logger](const char* msg, const IGPUAccelerationStructure* as)->void + { + logger.log("Failed to %s for \"%s\"",system::ILogger::ELL_ERROR,msg,as->getObjectDebugName()); + }; + // try to allocate memory for + core::vector> backingBuffers(compactions.size()); + { + MetaDeviceMemoryAllocator deferredAllocator(params.compactedASAllocator ? 
params.compactedASAllocator:device,logger); + // create + for (size_t i=0; i(compactions[i]); + assert(as); + // silently skip if not worth it + if (!params.confirmCompact(sizes[i],as)) + { + logger.log("Compaction not confirmed for \"%s\" would be compacted size is %d, original %d.",system::ILogger::ELL_DEBUG,as->getObjectDebugName(),sizes[i],as->getCreationParams().bufferRange.size); + continue; + } + // create backing buffer and request an allocation for it { - const auto* as = static_cast(compactions[i]); - assert(as); - // silently skip if not worth it - if (!params.confirmCompact(sizes[i],as)) + const auto* oldBuffer = as->getCreationParams().bufferRange.buffer.get(); + assert(oldBuffer); + // This is a Spec limit/rpomise we don't even expose it + constexpr size_t MinASBufferAlignment = 256u; + using usage_f = IGPUBuffer::E_USAGE_FLAGS; + IGPUBuffer::SCreationParams creationParams = { {.size=core::roundUp(sizes[i],MinASBufferAlignment),.usage=usage_f::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT|usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT},{}}; + // same sharing setup as the previous AS buffer + creationParams.queueFamilyIndexCount = oldBuffer->getCachedCreationParams().queueFamilyIndexCount; + creationParams.queueFamilyIndices = oldBuffer->getCachedCreationParams().queueFamilyIndices; + auto buf = device->createBuffer(std::move(creationParams)); + if (!buf) + { + logFail("create Buffer backing the Compacted Acceleration Structure",as); continue; - smart_refctd_ptr buff; + } + auto bufReqs = buf->getMemoryReqs(); + backingBuffers[i].value = std::move(buf); + // allocate new memory - definitely don't want to be raytracing from across the PCIE slot + if (!deferredAllocator.request(backingBuffers.data()+i,physDev->getDeviceLocalMemoryTypeBits())) { - const auto* oldBuffer = as->getCreationParams().bufferRange.buffer.get(); - assert(oldBuffer); - // - constexpr size_t MinASBufferAlignment = 256u; - using usage_f = IGPUBuffer::E_USAGE_FLAGS; - IGPUBuffer::SCreationParams creationParams = { {.size=core::roundUp(sizes[i],MinASBufferAlignment),.usage = usage_f::EUF_ACCELERATION_STRUCTURE_STORAGE_BIT|usage_f::EUF_SHADER_DEVICE_ADDRESS_BIT},{}}; - creationParams.queueFamilyIndexCount = oldBuffer->getCachedCreationParams().queueFamilyIndexCount; - creationParams.queueFamilyIndices = oldBuffer->getCachedCreationParams().queueFamilyIndices; - auto buf = device->createBuffer(std::move(creationParams)); - if (!buf) - { - logFail("create Buffer backing the Compacted Acceleration Structure",as); - continue; - } - // allocate new memory - auto bufReqs = buff->getMemoryReqs(); - // definitely don't want to be raytracing from across the PCIE slot - if (!deferredAllocator.request(backingBuffers.data()+i,physDev->getDeviceLocalMemoryTypeBits())) - { - logFail("request of a Memory Allocation for the Buffer backing the Compacted Acceleration Structure",as); - continue; - } - backingBuffers[i].value = std::move(buf); + logFail("request of a Memory Allocation for the Buffer backing the Compacted Acceleration Structure",as); + continue; } } - // allocate memory for the buffers - deferredAllocator.finalize(); } + // allocate memory for the buffers + deferredAllocator.finalize(); + unordered_map> retval; + retval.reserve(compactions.size()); // recreate Acceleration Structures for (size_t i=0; i(compactions[i]); + const auto* srcAS = static_cast(compactions[i]); auto& backingBuffer = backingBuffers[i].value; if (!backingBuffer->getBoundMemory().isValid()) { - logFail("allocate Memory for the Buffer backing the Compacted 
Acceleration Structure",as); - continue; // reason to end a batch, see the TODO above + logFail("allocate Memory for the Buffer backing the Compacted Acceleration Structure",srcAS); + continue; + } + smart_refctd_ptr compactedAS; + { + typename AccelerationStructure::SCreationParams creationParams = {srcAS->getCreationParams()}; + creationParams.bufferRange = {.offset=0,.size=sizes[i],.buffer=std::move(backingBuffer)}; + if constexpr (IsTLAS) + { + creationParams.maxInstanceCount = srcAS->getMaxInstanceCount(); + compactedAS = device->createTopLevelAccelerationStructure(std::move(creationParams)); + } + else + compactedAS = device->createBottomLevelAccelerationStructure(std::move(creationParams)); } - IGPUTopLevelAccelerationStructure::SCreationParams creationParams = {as->getCreationParams()}; - creationParams.bufferRange = {.offset=0,.size=sizes[i],.buffer=std::move(backingBuffer)}; - creationParams.maxInstanceCount = as->getMaxInstanceCount(); - auto compactedAS = device->createTopLevelAccelerationStructure(std::move(creationParams)); if (!compactedAS) { - logFail("create the Compacted Acceleration Structure",as); + logFail("create the Compacted Acceleration Structure",srcAS); continue; } // set the debug name { - std::string debugName = as->getObjectDebugName(); + std::string debugName = srcAS->getObjectDebugName(); debugName += " compacted"; compactedAS->setObjectDebugName(debugName.c_str()); } // record compaction - if (!computeCmdBuf->cmdbuf->copyAccelerationStructure({.src=as,.dst=compactedAS.get(),.mode=IGPUAccelerationStructure::COPY_MODE::COMPACT})) + if (!computeCmdBuf->cmdbuf->copyAccelerationStructure({.src=srcAS,.dst=compactedAS.get(),.compact=true})) { logFail("record Acceleration Structure compaction",compactedAS.get()); continue; } - // modify the ownership release + // modify the ownership release to be for the final compacted AS if (const auto ix=compactedOwnershipReleaseIndices[i]; ixgetCreationParams().bufferRange; // swap out the conversion result - const auto foundIx = outputReverseMap.find(as); + const auto foundIx = outputReverseMap.find(srcAS); if (foundIx!=outputReverseMap.end()) { - auto& resultOutput = std::get>(reservations.m_gpuObjects); + auto& resultOutput = std::get>(reservations.m_gpuObjects); resultOutput[foundIx->second].value = compactedAS; } + // overwrite staging cache + auto pFound = findInStaging.template operator()(srcAS); + pFound->second.gpuRef = compactedAS; // insert into compaction map - compactedTLASMap[as] = std::move(compactedAS); + retval[srcAS] = std::move(compactedAS); } + return retval; } + computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact Acceleration Structures START"); + computeCmdBuf->cmdbuf->endDebugMarker(); + computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact Acceleration Structures END"); + computeCmdBuf->cmdbuf->endDebugMarker(); + } + return {}; + }; + + // compacted BLASes need to be substituted in cache and TLAS Build Inputs + compactedBLASMap = buildAndCompactASes.template operator()(blasesToBuild); + // Device TLAS builds + if (tlasCount) + { + // either we built no BLASes (remember we could retrieve already built ones from cache) + if (blasCount) + { + // Or we barrier for the previous compactions or builds (a single pipeline barrier to ensure BLASes build before TLASes is needed) + const asset::SMemoryBarrier readBLASInTLASBuildBarrier = { + // the last use of the source BLAS could have been a build or a compaction + .srcStageMask = 
PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT|PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_COPY_BIT, + .srcAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_WRITE_BIT, + .dstStageMask = PIPELINE_STAGE_FLAGS::ACCELERATION_STRUCTURE_BUILD_BIT, + .dstAccessMask = ACCESS_FLAGS::ACCELERATION_STRUCTURE_READ_BIT + }; + // submit because we want to launch BLAS builds in a separate submit, so the scratch semaphore can signal and free the scratch and more is available for TLAS builds + if (pipelineBarrier(computeCmdBuf,{.memBarriers={&readBLASInTLASBuildBarrier,1}},"Failed to sync BLAS with TLAS build!")) + drainBoth(); + else + failedBLASBarrier = true; } - computeCmdBuf->cmdbuf->beginDebugMarker("Asset Converter Compact TLASes END"); - computeCmdBuf->cmdbuf->endDebugMarker(); + compactedTLASMap = buildAndCompactASes.template operator()(tlasesToBuild); } // release ownership @@ -4880,7 +5300,7 @@ if (worstSize>minScratchSize) retval.set({params.transfer->scratchSemaphore.semaphore,params.transfer->scratchSemaphore.value}); } // reset original callback - params.transfer->overflowCallback = origXferStallCallback; + params.transfer->overflowCallback = std::move(origXferStallCallback); // Its too dangerous to leave an Intended Transfer Submit hanging around that needs to be submitted for Compute to make forward progress outside of this utility, // and doing transfer-signals-after-compute-wait timeline sema tricks are not and option because: @@ -4898,187 +5318,147 @@ if (worstSize>minScratchSize) } } + // finish host tasks if not done yet + hostUploadBuffers([]()->bool{return true;}); + // in the future we'll also finish host image copies - // Descriptor Sets need their TLAS descriptors substituted if they've been compacted - // want to check if deps successfully exist - auto missingDependent = [&reservations](const typename asset_traits::video_t* dep)->bool - { - auto& stagingCache = std::get>(reservations.m_stagingCaches); - auto found = stagingCache.find(const_cast::video_t*>(dep)); - // this only checks if whether we had to convert and failed - if (found!=stagingCache.end() && found->second.value==CHashCache::NoContentHash) - return true; - // but the dependent might be in readCache of one or more converters, so if in doubt assume its okay - return false; - }; - // insert items into cache if overflows handled fine and commandbuffers ready to be recorded - auto mergeCache = [&]()->void + // check dependents before inserting into cache + if (reservations.m_queueFlags.value!=IQueue::FAMILY_FLAGS::NONE) { - auto& stagingCache = std::get>(reservations.m_stagingCaches); - auto& cache = std::get>(m_caches); - cache.m_forwardMap.reserve(cache.m_forwardMap.size()+stagingCache.size()); - cache.m_reverseMap.reserve(cache.m_reverseMap.size()+stagingCache.size()); - constexpr bool IsTLAS = std::is_same_v; - for (auto& item : stagingCache) - if (item.second.value!=CHashCache::NoContentHash) // didn't get wiped + auto checkDependents = [&]()->void { - // rescan all the GPU objects and find out if they depend on anything that failed, if so add to failure set - bool depsMissing = false; - // only go over types we could actually break via missing upload/build (i.e. 
pipelines are unbreakable) - if constexpr (IsTLAS) - { - // A built TLAS cannot be queried about the BLASes it contains, so just trust the pre-TLAS-build input validation did its job - } - - if constexpr (std::is_same_v) - depsMissing = missingDependent.template operator()(item.first->getUnderlyingBuffer()); - if constexpr (std::is_same_v) - depsMissing = missingDependent.template operator()(item.first->getCreationParameters().image.get()); - if constexpr (std::is_same_v) - { - const IGPUDescriptorSetLayout* layout = item.first->getLayout(); - // check samplers - { - const auto count = layout->getTotalMutableCombinedSamplerCount(); - const auto* samplers = item.first->getAllMutableCombinedSamplers(); - for (auto i=0u; !depsMissing && i(samplers[i].get()); - } - for (auto i=0u; !depsMissing && i(asset::IDescriptor::E_TYPE::ET_COUNT); i++) + auto& stagingCache = std::get>(reservations.m_stagingCaches); + phmap::erase_if(stagingCache,[&](auto& item)->bool { - const auto type = static_cast(i); - const auto count = layout->getTotalDescriptorCount(type); - auto* psDescriptors = item.first->getAllDescriptors(type); - if (!psDescriptors) - continue; - for (auto i=0u; !depsMissing && i) + depsMissing = missingDependent.template operator()(pGpuObj->getUnderlyingBuffer()); + if constexpr (std::is_same_v) + depsMissing = missingDependent.template operator()(pGpuObj->getCreationParameters().image.get()); + if constexpr (std::is_same_v) { - auto* untypedDesc = psDescriptors[i].get(); - if (untypedDesc) - switch (asset::IDescriptor::GetTypeCategory(type)) + const IGPUDescriptorSetLayout* layout = pGpuObj->getLayout(); + // check samplers { - case asset::IDescriptor::EC_BUFFER: - depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); - break; - case asset::IDescriptor::EC_SAMPLER: - depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); - break; - case asset::IDescriptor::EC_IMAGE: - depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); - break; - case asset::IDescriptor::EC_BUFFER_VIEW: - depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); - break; - case asset::IDescriptor::EC_ACCELERATION_STRUCTURE: + const auto count = layout->getTotalMutableCombinedSamplerCount(); + const auto* samplers = pGpuObj->getAllMutableCombinedSamplers(); + for (auto i=0u; !depsMissing && i(samplers[i].get()); + } + for (auto i=0u; !depsMissing && i(asset::IDescriptor::E_TYPE::ET_COUNT); i++) + { + const auto type = static_cast(i); + const auto count = layout->getTotalDescriptorCount(type); + auto* psDescriptors = pGpuObj->getAllDescriptors(type); + if (!psDescriptors) + continue; + for (auto i=0u; !depsMissing && i(untypedDesc); - // successfully written a TLAS into the binding, nothing to check - if (tlas) - break; - // we have a null TLAS in the binding, and we have to check if we were supposed to have one in it - using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; - const redirect_t& redirect = layout->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); - const auto bindingRange = redirect.findBindingStorageIndex(redirect_t::storage_offset_t(i)); - const auto firstElementOffset = redirect.getStorageOffset(bindingRange).data; - const auto foundWrite = reservations.m_deferredTLASDescriptorWrites.find({ - .dstSet = item.first, - .binding = redirect.getBinding(bindingRange).data, - .arrayElement = i-firstElementOffset - }); - // was scheduled to write some TLAS to this binding, but TLAS is now null - 
depsMissing = foundWrite!=reservations.m_deferredTLASDescriptorWrites.end() && !foundWrite->tlas; - break; + auto* untypedDesc = psDescriptors[i].get(); + if (untypedDesc) + switch (asset::IDescriptor::GetTypeCategory(type)) + { + case asset::IDescriptor::EC_BUFFER: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + case asset::IDescriptor::EC_SAMPLER: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + case asset::IDescriptor::EC_IMAGE: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + case asset::IDescriptor::EC_BUFFER_VIEW: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + case asset::IDescriptor::EC_ACCELERATION_STRUCTURE: + depsMissing = missingDependent.template operator()(static_cast(untypedDesc)); + break; + default: + assert(false); + depsMissing = true; + break; + } } - default: - assert(false); - depsMissing = true; - break; } } + if (depsMissing) + { + smart_refctd_ptr dummy; + // I know what I'm doing (breaking the promise of the `erase_if` to not mutate the inputs) + markFailure("because conversion of a dependant failed!",&dummy,&item.second); + } + return depsMissing; } - } - auto* pGpuObj = item.first; - if (depsMissing) - { - const auto* hashAsU64 = reinterpret_cast(item.second.value.data); - logger.log("GPU Obj %s not writing to final cache because conversion of a dependant failed!", system::ILogger::ELL_ERROR, getLoggingLabel(*pGpuObj)); - // wipe self, to let users know - item.second.value = {}; - continue; - } - // The BLASes don't need to do this, because no-one checks for them as dependents and we can substitute the `item.first` in the staging cache right away - // For TLASes we need to write the compacted TLAS and not the intermediate build to the Cache - if constexpr (IsTLAS) + ); + }; + // Bottom up, only go over types we could actually break via missing upload/build (i.e. 
pipelines are unbreakable) + // A built TLAS cannot be queried about the BLASes it contains, so just trust the pre-TLAS-build input validation did its job + checkDependents.template operator()(); + checkDependents.template operator()(); + checkDependents.template operator()(); +// mergeCache.template operator()(); + // overwrite the compacted TLASes in Descriptor Sets + if (auto& tlasRewriteSet=reservations.m_potentialTLASRewrites; !tlasRewriteSet.empty()) + { + core::vector writes; + writes.reserve(tlasRewriteSet.size()); + core::vector infos(tlasRewriteSet.size()); + auto* pInfo = infos.data(); + for (auto& entry : tlasRewriteSet) { - auto found = compactedTLASMap.find(pGpuObj); - if (found!=compactedTLASMap.end()) - pGpuObj = found->second.get(); - + auto* const dstSet = entry.dstSet; + // we need to check if the descriptor set itself didn't get deleted in the meantime + if (missingDependent.template operator()(dstSet)) + continue; + // rewtrieve the binding from the TLAS + const auto* const tlas = static_cast(dstSet->getAllDescriptors(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE)[entry.storageOffset.data].get()); + assert(tlas); + // only rewrite if successfully compacted + if (const auto foundCompacted=compactedTLASMap.find(tlas); foundCompacted!=compactedTLASMap.end()) + { + pInfo->desc = foundCompacted->second; + using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect; + const redirect_t& redirect = dstSet->getLayout()->getDescriptorRedirect(IDescriptor::E_TYPE::ET_ACCELERATION_STRUCTURE); + const auto bindingRange = redirect.findBindingStorageIndex(entry.storageOffset); + const auto firstElementOffset = redirect.getStorageOffset(bindingRange); + writes.push_back({ + .dstSet = dstSet, + .binding = redirect.getBinding(bindingRange).data, + .arrayElement = entry.storageOffset.data-firstElementOffset.data, + .count = 1, + .info = pInfo++ + }); + } } - // We have success now, but ask callback if we write to the new cache. - if (!params.writeCache(item.second)) // TODO: let the user know the pointer to the GPU Object too? 
- continue; - asset_cached_t cached; - cached.value = core::smart_refctd_ptr::video_t>(pGpuObj); - cache.m_reverseMap.emplace(pGpuObj,item.second); - cache.m_forwardMap.emplace(item.second,std::move(cached)); + // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) + if (!writes.empty() && !device->updateDescriptorSets(writes,{})) + logger.log("Failed to write one of the compacted TLASes into a Descriptor Set, all Descriptor Sets will still use non-compacted TLASes",system::ILogger::ELL_ERROR); } - }; - // again, need to go bottom up so we can check dependencies being successes - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - mergeCache.template operator()(); - // write the TLASes into Descriptor Set finally - if (auto& tlasWriteMap=reservations.m_deferredTLASDescriptorWrites; !tlasWriteMap.empty()) + } + + // insert items into cache if overflows handled fine and commandbuffers ready to be recorded + core::for_each_in_tuple(reservations.m_stagingCaches,[&](SReserveResult::staging_cache_t& stagingCache)->void { - core::vector writes; - writes.reserve(tlasWriteMap.size()); - core::vector infos(writes.size()); - auto* pInfo = infos.data(); - for (auto& inWrite : tlasWriteMap) + auto& cache = std::get>(m_caches); + cache.m_forwardMap.reserve(cache.m_forwardMap.size()+stagingCache.size()); + cache.m_reverseMap.reserve(cache.m_reverseMap.size()+stagingCache.size()); + for (auto& item : stagingCache) + if (item.second.gpuRef) // not wiped { - // I know what I'm doing, this member has no influence on the set key hash - auto& tlas = const_cast&>(inWrite.tlas); - assert(tlas); - if (missingDependent.template operator()(tlas.get())) - { - tlas = nullptr; + // We have success now, but ask callback if we write to the new cache. + if (!params.writeCache(item.second.cacheKey)) // TODO: let the user know the pointer to the GPU Object too? 
continue; - } - if (const auto foundCompacted=compactedTLASMap.find(tlas.get()); foundCompacted!=compactedTLASMap.end()) - tlas = foundCompacted->second; - pInfo->desc = tlas; - writes.push_back({ - .dstSet = inWrite.dstSet, - .binding = inWrite.binding, - .arrayElement = inWrite.arrayElement, - .count = 1, - .info = pInfo++ - }); + asset_cached_t cached; + cached.value = std::move(item.second.gpuRef); + cache.m_reverseMap.emplace(item.first,item.second.cacheKey); + cache.m_forwardMap.emplace(item.second.cacheKey,std::move(cached)); } - // not strictly necessary, just provoking refcounting bugs right away if they exist - compactedTLASMap.clear(); - // if the descriptor write fails, we make the Descriptor Sets behave as-if the TLAS build failed (dep is missing) - if (!writes.empty() && !device->updateDescriptorSets(writes,{})) - for (auto& inWrite : tlasWriteMap) - const_cast&>(inWrite.tlas) = nullptr; - } - mergeCache.template operator()(); - // needed for the IGPUDescriptorSets to check if TLAS exists/was written, can be released now - reservations.m_deferredTLASDescriptorWrites.clear(); -// mergeCache.template operator()(); + // provoke refcounting bugs ASAP + stagingCache.clear(); + }); // no submit was necessary, so should signal the extra semaphores from the host if (!retval.blocking()) diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp index 4c3bbaa03c..924c337cbe 100644 --- a/src/nbl/video/utilities/CComputeBlit.cpp +++ b/src/nbl/video/utilities/CComputeBlit.cpp @@ -39,7 +39,7 @@ auto CComputeBlit::createAndCachePipelines(const SPipelinesCreateInfo& info) -> const auto sharedMemoryPerInvocation = core::max(singlePixelStorage*4,info.sharedMemoryPerInvocation); retval.sharedMemorySize = sharedMemoryPerInvocation*retval.workgroupSize; - const auto* layout = info.layout; + auto* layout = info.layout; // const auto common = [&]()->std::string @@ -66,7 +66,7 @@ struct ConstevalParameters }(); auto createPipeline = [&limits,layout,&common](const char* mainPath)->smart_refctd_ptr { - auto shader = make_smart_refctd_ptr( + auto shader = make_smart_refctd_ptr( (common+"\n#include \""+mainPath+"\"\n").c_str(), IShader::E_CONTENT_TYPE::ECT_HLSL, mainPath @@ -77,14 +77,16 @@ struct ConstevalParameters source->setContentHash(source->computeContentHash()); } - ICPUComputePipeline::SCreationParams params = {}; - params.layout = layout; - params.shader.entryPoint = "main"; - params.shader.shader = shader.get(); - params.shader.requiredSubgroupSize = static_cast(findMSB(limits.maxSubgroupSize)); - // needed for the prefix and reductions to work - params.shader.requireFullSubgroups = true; - return ICPUComputePipeline::create(params); + auto pipeline = ICPUComputePipeline::create(layout); + pipeline->getSpecInfo() = { + .shader = shader, + .entryPoint = "main", + .requiredSubgroupSize = static_cast(findMSB(limits.maxSubgroupSize)), + }; + pipeline->getCachedCreationParams() = { + .requireFullSubgroups = true, + }; + return pipeline; }; // create blit pipeline cpuPplns[0] = createPipeline("nbl/builtin/hlsl/blit/default_blit.comp.hlsl"); diff --git a/tools/nsc/CMakeLists.txt b/tools/nsc/CMakeLists.txt index 1582e9ecd6..bcdcbca531 100644 --- a/tools/nsc/CMakeLists.txt +++ b/tools/nsc/CMakeLists.txt @@ -6,8 +6,11 @@ set(GODBOLT_BINARY_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/compiler-explorer") set(GODBOLT_BINARY_PRETEST_DIRECTORY "${GODBOLT_BINARY_DIRECTORY}/pre-test") set(NBL_NSC_COMPILE_DIRECTORY "${GODBOLT_BINARY_PRETEST_DIRECTORY}/.compile/$") 
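# [Editor's illustrative sketch — not part of this patch] The CMakeLists changes below pre-create the
# preinstall and Docker context directories at configure time and later probe the Docker host (see the
# PROMOTE_PROCESS_ISOLATION function further down) to decide whether Windows containers can run with
# process isolation instead of Hyper-V. A minimal stand-alone version of that probe, with a
# hypothetical image tag, is sketched here purely for illustration:
find_program(DOCKER_EXE NAMES docker REQUIRED)
execute_process(
    COMMAND "${DOCKER_EXE}" run --rm --isolation process --entrypoint cmd example/base-image:tag /c exit 0
    RESULT_VARIABLE PROCESS_ISOLATION_PROBE
    OUTPUT_QUIET ERROR_QUIET
)
if(PROCESS_ISOLATION_PROBE EQUAL 0)
    message(STATUS "Process isolation available on this host")
else()
    message(STATUS "Process isolation unavailable, the host kernel is older than the base image")
endif()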
set(NBL_NSC_PREINSTALL_DIRECTORY "${GODBOLT_BINARY_PRETEST_DIRECTORY}/.preinstall") +make_directory("${NBL_NSC_PREINSTALL_DIRECTORY}") set(NBL_DOCKER_CT_NSC_VOLUME_SOURCE "${GODBOLT_BINARY_DIRECTORY}/install") +set(NBL_DOCKER_CTX_DIR "${GODBOLT_BINARY_DIRECTORY}/.ctx") +make_directory("${NBL_DOCKER_CTX_DIR}") set(NBL_DOCKER_INSTALL_BAT_FILENAME install-production.bat) set(NBL_DOCKER_CT_NSC_INSTALL_BAT "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}/${NBL_DOCKER_INSTALL_BAT_FILENAME}") @@ -56,424 +59,304 @@ add_test(NAME NBL_NSC_DUMP_BUILD_INFO_TEST if(NBL_ENABLE_DOCKER_INTEGRATION) -find_program(DOCKER_EXE - NAMES docker - REQUIRED -) - -find_program(SPIRV_DIS_EXE - NAMES spirv-dis - HINTS "$ENV{VULKAN_SDK_INSTALL_DIRECTORY}/Bin" - HINTS "$ENV{VK_SDK_PATH}/Bin" - HINTS "$ENV{VULKAN_SDK}/Bin" - REQUIRED -) +find_program(DOCKER_EXE NAMES docker REQUIRED) +set(BASE_IMAGE ghcr.io/devsh-graphics-programming/compiler-explorer-docker:nano-2022) +set(CORE_IMAGE mcr.microsoft.com/windows/servercore:ltsc2022) + +function(PROMOTE_PROCESS_ISOLATION BASE VAR) + set(${VAR} True) + + macro(INSPECT IMAGE) + execute_process(COMMAND "${DOCKER_EXE}" inspect --format={{.OsVersion}} ${IMAGE} + RESULT_VARIABLE INSPECTION_OK + OUTPUT_VARIABLE TARGET_KERNEL + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + endmacro() + + macro(TO_PROCESS IMAGE TARGET_KERNEL) + execute_process(COMMAND "${DOCKER_EXE}" run --rm --isolation process --entrypoint cmd ${BASE} /K + RESULT_VARIABLE PROCESS_ISOLATION_OK + OUTPUT_QUIET ERROR_QUIET + ) + + if(${PROCESS_ISOLATION_OK} EQUAL 0) + message(STATUS "Promoting \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation") + else() + set(${VAR} False) + message(STATUS "Cannot promote \"${IMAGE}\" [${TARGET_KERNEL}] to process isolation, requires falling back to HyperV. Please update your docker host OS.") + endif() + endmacro() + + INSPECT(${BASE}) + + if(${INSPECTION_OK} EQUAL 0) + TO_PROCESS(${BASE} ${TARGET_KERNEL}) + else() + message(STATUS "\"${BASE}\" not found in local registry, pulling...") + execute_process(COMMAND "${DOCKER_EXE}" pull ${BASE}) + + INSPECT(${BASE}) + TO_PROCESS(${BASE} ${TARGET_KERNEL}) + endif() + + set(${VAR} ${${VAR}} PARENT_SCOPE) +endfunction() -cmake_path(GET Vulkan_INCLUDE_DIR PARENT_PATH VULKAN_SDK_INSTALL_DIRECTORY) -get_filename_component(VULKAN_SDK_VERSION "${VULKAN_SDK_INSTALL_DIRECTORY}" NAME) +PROMOTE_PROCESS_ISOLATION(${BASE_IMAGE} USE_PROCESS_ISOLATION) -if(NOT EXISTS "${VULKAN_SDK_INSTALL_DIRECTORY}") - message(FATAL_ERROR "Internal error, VULKAN_SDK_INSTALL_DIRECTORY doesn't exist") +if(NOT USE_PROCESS_ISOLATION) + # NOTE: we would need to use GET_RUNTIME_DEPENDENCIES which uses objdump + # https://cmake.org/cmake/help/latest/command/file.html#get-runtime-dependencies + # to collect *all* missing deps and copy (FROM at least server core) to destination nano + # image, it will fail currently if we fully isolate it with VM due to lack of certain DLLs + # BUT it means violating EULA, hence we are not going to support it, also (**) + message(FATAL_ERROR "HyperV is NOT supported! 
Update your OS!") endif() -find_program(CTEST_EXE - NAMES ctest - REQUIRED -) +function(GET_LABEL BASE_IMAGE LABEL VAR) + set(FORMAT "{{ index .Config.Labels \"${LABEL}\" }}") + execute_process(COMMAND ${DOCKER_EXE} inspect --format=${FORMAT} ${BASE_IMAGE} + OUTPUT_VARIABLE OUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_VARIABLE ERR + RESULT_VARIABLE RES + ) -set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}/hlsl.local.properties.cmake") + if(NOT RES EQUAL 0) + message(WARNING "Could not get \"${LABEL}\" label from \"${BASE_IMAGE}\" image, it doesn't exist!") + endif() -set(NBL_DOCKER_CT_NSC_VOLUME_TARGET "C:\\\\nsc\\\\install") -string(GENEX_STRIP "${NBL_PACKAGE_RUNTIME_EXE_DIR_PATH}" NBL_RELATIVE_ENTRY) -set(NSC_RELEASE_BUILD_INFO "${NBL_NSC_PREINSTALL_DIRECTORY}/${NBL_RELATIVE_ENTRY}/${NBL_NSC_BUILD_INFO_FILENAME}") -set(NSC_RELWITHDEBINFO_BUILD_INFO "${NBL_NSC_PREINSTALL_DIRECTORY}/relwithdebinfo/${NBL_RELATIVE_ENTRY}/${NBL_NSC_BUILD_INFO_FILENAME}") -set(NSC_DEBUG_BUILD_INFO "${NBL_NSC_PREINSTALL_DIRECTORY}/debug/${NBL_RELATIVE_ENTRY}/${NBL_NSC_BUILD_INFO_FILENAME}") -cmake_path(NATIVE_PATH NSC_RELEASE_BUILD_INFO NORMALIZE NSC_RELEASE_BUILD_INFO) -cmake_path(NATIVE_PATH NSC_RELWITHDEBINFO_BUILD_INFO NORMALIZE NSC_RELWITHDEBINFO_BUILD_INFO) -cmake_path(NATIVE_PATH NSC_DEBUG_BUILD_INFO NORMALIZE NSC_DEBUG_BUILD_INFO) - -set(NBL_INSTALL_DIRECTORY "${NBL_DOCKER_CT_NSC_VOLUME_TARGET}") -cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_TARGET NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_TARGET) - -set(NBL_BUILD_INFO_POSTPROCESS_COMMAND - "${CMAKE_COMMAND}" - "-DNBL_EXECUTABLE_PATH=${NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH}" - "-DNBL_BUILD_INFO=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" - "-DNBL_OUTPUT_FILE=${NBL_NSC_PREINSTALL_TARGET_BUILD_INFO}" - "-DNBL_OUTPUT_EXE_OVERRIDE=$" # as in CT, it's *not* host exe location! 
- -P "${NBL_ROOT_PATH}/cmake/scripts/nbl/nablaBuildInfo.cmake" -) + set(${VAR} "${OUT}" PARENT_SCOPE) +endfunction() -cmake_path(GET SPIRV_DIS_EXE PARENT_PATH VULKAN_SDK_BIN_DIRECTORY) -cmake_path(NATIVE_PATH VULKAN_SDK_BIN_DIRECTORY NORMALIZE VULKAN_SDK_BIN_DIRECTORY) -cmake_path(GET SPIRV_DIS_EXE FILENAME SPIRV_DIS_EXE) -set(CT_SPIRV_DIS_EXE "C:\\vulkan\\${VULKAN_SDK_VERSION}\\bin\\${SPIRV_DIS_EXE}") -cmake_path(NATIVE_PATH CT_SPIRV_DIS_EXE NORMALIZE CT_SPIRV_DIS_EXE) - -set(NBL_CE_GENERATE_CONFIG_COMMAND - "${CMAKE_COMMAND}" - "-DSPIRV_DIS_EXE=${CT_SPIRV_DIS_EXE}" - "-DNSC_RELEASE_BUILD_INFO=${NSC_RELEASE_BUILD_INFO}" - "-DNSC_RELWITHDEBINFO_BUILD_INFO=${NSC_RELWITHDEBINFO_BUILD_INFO}" - "-DNSC_DEBUG_BUILD_INFO=${NSC_DEBUG_BUILD_INFO}" - "-DOUTPUT_CONFIG_FILE=${NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT}" - -P "${CMAKE_CURRENT_SOURCE_DIR}/ce-generate-config.cmake" -) +GET_LABEL(${BASE_IMAGE} org.opencontainers.image.title ORG_LABEL_TITLE) +GET_LABEL(${BASE_IMAGE} org.opencontainers.image.source ORG_LABEL_SOURCE) +GET_LABEL(${BASE_IMAGE} org.opencontainers.image.description ORG_LABEL_DESCRIPTION) -set(NBL_DOCKER_CE_COMPOSE_BASE "${NBL_ROOT_PATH}/docker/compiler-explorer/compose.yml") -cmake_path(NATIVE_PATH NBL_DOCKER_CE_COMPOSE_BASE NORMALIZE NBL_DOCKER_CE_COMPOSE_BASE) -set(NBL_DOCKER_CE_COMPOSE_TARGET "${GODBOLT_BINARY_DIRECTORY}/.dev-compose.yml") +find_program(CTEST_EXE NAMES ctest REQUIRED) +find_file(DXIL_DLL NAMES dxil.dll HINTS "$ENV{CMAKE_WINDOWS_KITS_10_DIR}/Redist/D3D/x64" "C:/Program Files (x86)/Windows Kits/10/Redist/D3D/x64" REQUIRED) -include(InstallRequiredSystemLibraries) +set(ICU_GLOBALIZATION_DIR C:\\Windows\\Globalization\\ICU) +find_file(UCRTBASED_DLL NAMES ucrtbased.dll HINTS ${UCRTBASED_DLL_DIR} REQUIRED) -string(REPLACE "v" "VC" TARGET_DCRT ${CMAKE_VS_PLATFORM_TOOLSET}) -set(DEBUG_CRT_RELATIVE debug_nonredist/x64/Microsoft.${TARGET_DCRT}.DebugCRT) -set(DEBUG_CRT_DIRECTORY_SOURCE "${MSVC_REDIST_DIR}/${DEBUG_CRT_RELATIVE}") -cmake_path(NATIVE_PATH MSVC_REDIST_DIR NORMALIZE NBL_REDIST_DIR) +find_program(SPIRV_DIS_EXE NAMES spirv-dis HINTS "${VULKAN_SDK}/Bin" REQUIRED) +cmake_path(GET SPIRV_DIS_EXE PARENT_PATH SPIRV_DIS_DIR) +cmake_path(NATIVE_PATH SPIRV_DIS_DIR NORMALIZE SPIRV_DIS_DIR) -if(NOT EXISTS "${DEBUG_CRT_DIRECTORY_SOURCE}") - message(FATAL_ERROR "DEBUG_CRT_DIRECTORY_SOURCE = \"${DEBUG_CRT_DIRECTORY_SOURCE}\" doesn't exist!") +if(MSVC_REDIST_BASE) # fallback to our toolset + set(MSVC_REDIST_DIR "${MSVC_REDIST_BASE}") +else() + include(InstallRequiredSystemLibraries) + if(NOT MSVC_REDIST_DIR) + message(FATAL_ERROR "Could not find MSVC_REDIST_DIR, define yourself!") + endif() endif() -set(DEBUG_CRT_DIRECTORY_TARGET "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}/.nonredist") -file(MAKE_DIRECTORY "${DEBUG_CRT_DIRECTORY_TARGET}") -file(GLOB CRT_FILES "${DEBUG_CRT_DIRECTORY_SOURCE}/*") +cmake_path(NATIVE_PATH MSVC_REDIST_DIR NORMALIZE TOOLSET_REDIST_PATH) -find_file(UCRTBASED_DLL_PATH - NAMES ucrtbased.dll - REQUIRED +file(GLOB_RECURSE VC_MODULES LIST_DIRECTORIES false + "${TOOLSET_REDIST_PATH}/x64/*.CRT/*.dll" + "${TOOLSET_REDIST_PATH}/debug_nonredist/x64/*.DebugCRT/*.dll" ) -# TODO: (***) ---> THIS GOES TO /docker to CMakeLists.txt file! 
- -set(BASE_IMAGE mcr.microsoft.com/windows/servercore:ltsc2022-amd64) # NOTE: HARDCODED CURRENTLY - -string(TOLOWER "dr.devsh.eu/nabla/cmake-host-dev-env/${CMAKE_SYSTEM_NAME}/package/vulkan:latest" DOCKER_VULKAN_TAG) -string(TOLOWER "dr.devsh.eu/nabla/cmake-host-dev-env/${CMAKE_SYSTEM_NAME}/toolset/redist/${CMAKE_CXX_COMPILER_ID}/crt:latest" DOCKER_CRT_TAG) -string(TOLOWER "dr.devsh.eu/nabla/cmake-host-dev-env/${CMAKE_SYSTEM_NAME}/build/${CMAKE_CXX_COMPILER_ID}/devel-compiler-explorer-nsc:latest" DOCKER_DEVEL_TAG) - -cmake_path(NATIVE_PATH MSVC_REDIST_DIR NORMALIZE TOOLSET_REDIST_PATH) -get_filename_component(REDIST_CRT_TOOLSET_VERSION "${TOOLSET_REDIST_PATH}" NAME) +if(NOT VC_MODULES) + message(FATAL_ERROR "Failed to GLOB for VC Redist modules!") +endif() -function(GEN_DOCKER_CONTENT _CTX_ _OUTPUT_DIRECTORY_ _EXTRA_DOCKERFILE_CONTENT_ _DOCKER_IGNORE_CONTENT_ _S_NAME_ _CT_NAME_ _IMAGE_NAME_ _WITH_BUILD_) +make_directory("${NBL_DOCKER_CTX_DIR}/Runtimes") +make_directory("${NBL_DOCKER_CTX_DIR}/Nabla") +execute_process( + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${DXIL_DLL}" "${NBL_DOCKER_CTX_DIR}/Runtimes" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${UCRTBASED_DLL}" "${NBL_DOCKER_CTX_DIR}/Runtimes" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${SPIRV_DIS_EXE}" "${NBL_DOCKER_CTX_DIR}/Runtimes" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different ${VC_MODULES} "${NBL_DOCKER_CTX_DIR}/Runtimes" +) -set(_OUTPUT_D_PATH_ "${_OUTPUT_DIRECTORY_}/Dockerfile") -set(_OUTPUT_C_PATH_ "${_OUTPUT_DIRECTORY_}/compose.yml") +set(CT_RUNTIMES C:/runtimes) +cmake_path(NATIVE_PATH CT_RUNTIMES NORMALIZE CT_RUNTIMES) -string(CONFIGURE "${_EXTRA_DOCKERFILE_CONTENT_}" _EXTRA_DOCKERFILE_CONTENT_EVAL_ @ONLY) -string(CONFIGURE "${_DOCKER_IGNORE_CONTENT_}" _DOCKER_IGNORE_CONTENT_EVAL_ @ONLY) +set(NBL_DOCKER_CT_NSC_VOLUME_TARGET "${CT_RUNTIMES}/Nabla") +cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_SOURCE NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_SOURCE) +cmake_path(NATIVE_PATH NBL_DOCKER_CT_NSC_VOLUME_TARGET NORMALIZE NBL_DOCKER_CT_NSC_VOLUME_TARGET) +cmake_path(NATIVE_PATH NBL_NSC_PREINSTALL_DIRECTORY NORMALIZE NBL_NSC_PREINSTALL_DIRECTORY) -unset(DOCKER_CONTENT) -string(APPEND DOCKER_CONTENT -[=[ +string(CONFIGURE [=[ +# syntax=docker/dockerfile:1 # escape=` -ARG BASE_IMAGE=@BASE_IMAGE@ -FROM ${BASE_IMAGE} -SHELL ["cmd", "/S", "/C"] -@_EXTRA_DOCKERFILE_CONTENT_EVAL_@ -]=] -) +# ---------------- COMPRESS STEP ---------------- +FROM @BASE_IMAGE@ as compress -string(CONFIGURE "${DOCKER_CONTENT}" DOCKER_CONTENT @ONLY) -file(WRITE "${_OUTPUT_D_PATH_}" "${DOCKER_CONTENT}") +COPY --link Runtimes/ C:/pack/Windows/System32/ +COPY --link Nabla/ C:/pack/runtimes/Nabla/ -set(_CTX_TARGET_ "${_OUTPUT_DIRECTORY_}/.ctx") +ARG IMPL_COMPRESSION_OPTIONS=-T0 +ARG IMPL_COMPRESSION_LEVEL=3 -if("${_CTX_}" STREQUAL "") +WORKDIR C:\pack +RUN ` +tar -cf - Windows | zstd %IMPL_COMPRESSION_OPTIONS% -%IMPL_COMPRESSION_LEVEL% -o windows-artifacts.tar.zst && ` +tar -cf - runtimes | zstd %IMPL_COMPRESSION_OPTIONS% -%IMPL_COMPRESSION_LEVEL% -o nabla-artifacts.tar.zst -else() - if(NOT EXISTS "${_CTX_}") - message(FATAL_ERROR "Invalid source context directory doesn't exist! 
_CTX_: \"${_CTX_}\"") - endif() +# ---------------- FINAL IMAGE ---------------- +FROM @BASE_IMAGE@ - file(COPY "${_CTX_}" DESTINATION "${_CTX_TARGET_}") -endif() +COPY --link --from=compress ["C:/pack/windows-artifacts.tar.zst", "C:/pack/"] +COPY --link --from=compress ["C:/pack/nabla-artifacts.tar.zst", "C:/pack/"] +COPY hlsl.local.properties.cmake C:/Compiler-Explorer/etc/config/hlsl.local.properties -set(_OUTPUT_I_PATH_ "${_CTX_TARGET_}/.dockerignore") - -unset(COMPOSE_CONTENT) -string(APPEND COMPOSE_CONTENT -[=[ -services: - @_S_NAME_@: - build: - context: ./.ctx - dockerfile: "@_OUTPUT_D_PATH_@" - image: @_IMAGE_NAME_@ - container_name: @_CT_NAME_@ - networks: - docker_default: - -networks: - docker_default: - external: true -]=] -) +ENV NBL_INSTALL_DIRECTORY=@NBL_DOCKER_CT_NSC_VOLUME_TARGET@ ` +NBL_EXPLICIT_MODULE_LOAD_LOG=ON -string(CONFIGURE "${COMPOSE_CONTENT}" COMPOSE_CONTENT @ONLY) -file(WRITE "${_OUTPUT_C_PATH_}" "${COMPOSE_CONTENT}") -file(WRITE "${_OUTPUT_I_PATH_}" "${_DOCKER_IGNORE_CONTENT_EVAL_}") +WORKDIR C:/Compiler-Explorer +ENTRYPOINT [ ` + "C:\\unpack.bat", "&&", ` + "copy", "C:\\mount\\Windows\\System32\\icu.dll", "C:\\Windows\\System32\\icu.dll", "&&", ` + "node", "--no-warnings", "--no-deprecation", "--import=tsx", "./app.js", "--language", "hlsl" ` +] -if(_WITH_BUILD_) - execute_process(COMMAND "${DOCKER_EXE}" compose -f "${_OUTPUT_C_PATH_}" build) -endif() -endfunction() +LABEL org.opencontainers.image.title="[Nabla Shader Compiler (NSC)]: @ORG_LABEL_TITLE@" +LABEL org.opencontainers.image.source=https://github.com/Devsh-Graphics-Programming/Nabla +LABEL org.opencontainers.image.description="[Nabla Shader Compiler (NSC)]: @ORG_LABEL_DESCRIPTION@" -# Vulkan -set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/docker/vulkan") -set(CT_VULKAN_TARGET vulkan) -GEN_DOCKER_CONTENT("${VULKAN_SDK_INSTALL_DIRECTORY}" "${OUTPUT_DIRECTORY}" -[=[ -COPY ./ "@CT_VULKAN_TARGET@" - -ENV VULKAN_SDK="C:/@CT_VULKAN_TARGET@" -ENV VULKAN_SDK_VERSION="@VULKAN_SDK_VERSION@" -LABEL VULKAN_SDK="C:/@CT_VULKAN_TARGET@" -LABEL VULKAN_SDK_VERSION="@VULKAN_SDK_VERSION@" -]=] -[=[ -* -!@VULKAN_SDK_VERSION@/Bin/*.dll -!@VULKAN_SDK_VERSION@/Bin/*spirv*.exe -]=] -nabla-dev-env-vulkan -nabla.dev.env.vulkan -${DOCKER_VULKAN_TAG} -ON -) +]=] INSTRUCTIONS @ONLY) -# CRT -set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/docker/crt") -set(CT_TOOLSET_REDIST_TARGET toolset_redist) -make_directory("${OUTPUT_DIRECTORY}/.ctx") -file(COPY "${UCRTBASED_DLL_PATH}" DESTINATION "${OUTPUT_DIRECTORY}/.ctx") -GEN_DOCKER_CONTENT("${TOOLSET_REDIST_PATH}" "${OUTPUT_DIRECTORY}" -[=[ -COPY ./ "/@CT_TOOLSET_REDIST_TARGET@" - -ENV REDIST_CRT_TOOLSET_VERSION="@REDIST_CRT_TOOLSET_VERSION@" -ENV TOOLSET_REDIST_PATH="C:/@CT_TOOLSET_REDIST_TARGET@" -LABEL REDIST_CRT_TOOLSET_VERSION="@REDIST_CRT_TOOLSET_VERSION@" -LABEL TOOLSET_REDIST_PATH="C:/@CT_TOOLSET_REDIST_TARGET@" -]=] -[=[ -* -!ucrtbased.dll -!@REDIST_CRT_TOOLSET_VERSION@/vc_redist.x64.exe -!@REDIST_CRT_TOOLSET_VERSION@/@DEBUG_CRT_RELATIVE@/*.dll -]=] -nabla-dev-env-crt -nabla.dev.env.crt -${DOCKER_CRT_TAG} -ON -) +set(DOCKERFILE "${NBL_DOCKER_CTX_DIR}/Dockerfile") +file(WRITE "${DOCKERFILE}" "${INSTRUCTIONS}") -# Devel, combined -set(BASE_IMAGE dr.devsh.eu/compiler-explorer/windows) +if(DEFINED ENV{NSC_IMAGE_NAME}) + set(NSC_IMAGE_NAME "$ENV{NSC_IMAGE_NAME}") +else() + set(NSC_IMAGE_NAME nano/godbolt/nsc) +endif() -# NOTE to self: could be all done with single docker file & compose file but buildkit works bad with windows driver, yet need to wait for stuff to be 
implemented -set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/docker/devel") -set(CT_REDIST_DIR "${CT_TOOLSET_REDIST_TARGET}/${REDIST_CRT_TOOLSET_VERSION}") -set(CT_NONREDIST_CTR_DIR "${CT_REDIST_DIR}/${DEBUG_CRT_RELATIVE}") -cmake_path(NATIVE_PATH CT_REDIST_DIR NORMALIZE CT_REDIST_DIR) -cmake_path(NATIVE_PATH CT_NONREDIST_CTR_DIR NORMALIZE CT_NONREDIST_CTR_DIR) -set(DEVEL_DOCKERFILE "${OUTPUT_DIRECTORY}/Dockerfile") +set(NBL_DOCKER_NSC_COMPILER_CONFIG_OUTPUT "${NBL_DOCKER_CTX_DIR}/hlsl.local.properties.cmake") +string(GENEX_STRIP "${NBL_PACKAGE_RUNTIME_EXE_DIR_PATH}" NBL_RELATIVE_ENTRY) +set(OUTPUT_CONFIG_FILE $) + +set(ICU_DIR C:\\Windows\\Globalization\\ICU) +set(ICU_DLL C:\\Windows\\System32\\icu.dll) +if(NOT EXISTS ${ICU_DIR} OR NOT EXISTS ${ICU_DLL}) + # fallback for CI purposes, NOTE: we do NOT distribute those in final image as we have host runner requirements (**) + message(STATUS "\"${ICU_DIR}\" or \"${ICU_DLL}\ not found, fallback: copying them to the runner from \"${CORE_IMAGE}\"") + execute_process(COMMAND "${DOCKER_EXE}" rm -f nano-orphan RESULT_VARIABLE res) + execute_process(COMMAND "${DOCKER_EXE}" run -di --isolation process --name nano-orphan --entrypoint cmd ${CORE_IMAGE} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND "${DOCKER_EXE}" cp nano-orphan:${ICU_DIR} ${ICU_DIR} COMMAND_ERROR_IS_FATAL ANY) + execute_process(COMMAND "${DOCKER_EXE}" cp nano-orphan:${ICU_DLL} ${ICU_DLL} COMMAND_ERROR_IS_FATAL ANY) + message(STATUS "Fallback completed, runner patched!") +endif() -GEN_DOCKER_CONTENT("" "${OUTPUT_DIRECTORY}" -[=[ +set(ORPHAN nsc-orphan) -COPY --from=@DOCKER_VULKAN_TAG@ /@CT_VULKAN_TARGET@ /@CT_VULKAN_TARGET@ -COPY --from=@DOCKER_CRT_TAG@ /@CT_TOOLSET_REDIST_TARGET@ /@CT_TOOLSET_REDIST_TARGET@ +if(NOT DEFINED NBL_CE_PUBLISH_PORT) + set(NBL_CE_PUBLISH_PORT 80) +endif() -RUN .\@CT_REDIST_DIR@\vc_redist.x64.exe /quiet /install -RUN xcopy .\@CT_NONREDIST_CTR_DIR@\*.dll %SystemRoot%\System32 /Y -RUN xcopy .\@CT_TOOLSET_REDIST_TARGET@\ucrtbased.dll %SystemRoot%\System32 /Y +if(NBL_DOCKER_DIND_BUILD) + set(NBL_CE_URL http://${ORPHAN}:${NBL_CE_PUBLISH_PORT}) +else() + set(NBL_CE_URL http://localhost:${NBL_CE_PUBLISH_PORT}) +endif() -]=] -[=[ +set(NBL_CE_HEALTHY_CHECK_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/ce_healthy_check.py") +set(NBL_CE_ENDPOINT_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/endpoint.py") +set(NBL_NSC_BASIC_HLSL_JPAYLOAD "${CMAKE_CURRENT_SOURCE_DIR}/docker/godbolt/hlsl-basic-compile-payload.json") -]=] -nabla-dev-env-nsc -nabla.dev.env.nsc -${DOCKER_DEVEL_TAG} -OFF +# to avoid "too long input" errors we proxy build instructions to CMake script and write it to build directory +string(CONFIGURE [=[ +message(STATUS "Killing remaining NSC orphans") +execute_process(COMMAND "@DOCKER_EXE@" + rm -f "@ORPHAN@" + RESULT_VARIABLE res ) -# <---(***) - -set(NABLA_DEV_ENV_CT_NAME dev.nabla.env.${CMAKE_SYSTEM_NAME}.${CMAKE_CXX_COMPILER_ID}.base) -string(TOLOWER "${NABLA_DEV_ENV_CT_NAME}" NABLA_DEV_ENV_CT_NAME) - -set(COMPOSE_NSC_DEV_SERVICE compiler-explorer-nsc-dev) -string(TOLOWER "dr.devsh.eu/nabla/cmake-host-dev-env/${CMAKE_SYSTEM_NAME}/build/${CMAKE_CXX_COMPILER_ID}/compiler-explorer-nsc:latest" COMPOSE_NSC_DEV_IMAGE) -string(TOLOWER "dr.devsh.eu/compiler-explorer/production/windows/nsc/orphan-production-test:latest" COMPOSE_NSC_ORPHAN_PRODUCTION_TEST_IMAGE) -string(TOLOWER "dr.devsh.eu/compiler-explorer/production/windows/nsc/orphan-prodution-cache:latest" COMPOSE_NSC_PRODUCTION_CACHE_IMAGE) -string(TOLOWER 
"dr.devsh.eu/compiler-explorer/production/windows/nsc:latest" COMPOSE_NSC_PRODUCTION_IMAGE) - -string(APPEND COMPOSE_CONTENT -[=[ -services: - @COMPOSE_NSC_DEV_SERVICE@: - container_name: dev.ce.nsc.dev - extends: - file: @NBL_DOCKER_CE_COMPOSE_BASE@ - service: compiler-explorer - build: - context: ./.ctx - dockerfile: @DEVEL_DOCKERFILE@ - image: @COMPOSE_NSC_DEV_IMAGE@ - environment: - NBL_INSTALL_DIRECTORY: "@NBL_INSTALL_DIRECTORY@" - NBL_EXPLICIT_MODULE_LOAD_LOG: "ON" - entrypoint: - - "cmd" - - "/c" - - > - copy C:\\nsc\\install\\hlsl.local.properties.cmake %GIT_GODBOLT_REPOSITORY_PATH%\\etc\\config\\hlsl.local.properties - && npm --prefix %GIT_GODBOLT_REPOSITORY_PATH% run dev -- --language hlsl - volumes: - - type: bind - source: .\install - target: @NBL_DOCKER_CT_NSC_VOLUME_TARGET@ - read_only: true - -networks: - docker_default: - external: true -]=] +message(STATUS "Executing CTests") +execute_process(COMMAND "@CTEST_EXE@" + -C "$" --stop-on-failure + WORKING_DIRECTORY "@CMAKE_CURRENT_BINARY_DIR@" + COMMAND_ERROR_IS_FATAL ANY ) -string(CONFIGURE "${COMPOSE_CONTENT}" COMPOSE_CONTENT @ONLY) -file(WRITE "${NBL_DOCKER_CE_COMPOSE_TARGET}" "${COMPOSE_CONTENT}") -make_directory("${GODBOLT_BINARY_DIRECTORY}/.ctx") - -execute_process(COMMAND "${DOCKER_EXE}" compose -f "${NBL_DOCKER_CE_COMPOSE_BASE}" build) -execute_process(COMMAND "${DOCKER_EXE}" compose -f "${NBL_DOCKER_CE_COMPOSE_TARGET}" build) - -string(APPEND BAT_PRODUCTION_INSTALL -[=[ -@echo off -setlocal - -set BASE_PATH=C:\ - -xcopy "%BASE_PATH%target" "%BASE_PATH%nsc\install" /s /e /h /i /y /f -if %ERRORLEVEL% neq 0 ( - echo [ERROR] Failed to copy C:\target to C:\nsc\install - exit /b %ERRORLEVEL% +message(STATUS "Generating NSC build info") +execute_process(COMMAND "@CMAKE_COMMAND@" + "-DNBL_EXECUTABLE_PATH=@NBL_NSC_PREINSTALL_TARGET_EXE_FILEPATH@" + "-DNBL_BUILD_INFO=@NBL_NSC_PREINSTALL_TARGET_BUILD_INFO@" + "-DNBL_OUTPUT_FILE=@NBL_NSC_PREINSTALL_TARGET_BUILD_INFO@" + "-DNBL_OUTPUT_EXE_OVERRIDE=$" + -P "@NBL_ROOT_PATH@/cmake/scripts/nbl/nablaBuildInfo.cmake" + COMMAND_ERROR_IS_FATAL ANY ) -if "%GIT_GODBOLT_REPOSITORY_PATH%"=="" ( - echo [ERROR] Environment variable GIT_GODBOLT_REPOSITORY_PATH is not set! - exit /b 1 +message(STATUS "Generating NSC godbolt config") +execute_process(COMMAND "@CMAKE_COMMAND@" + "-DSPIRV_DIS_EXE=spirv-dis.exe" + "-DNSC_RELEASE_BUILD_INFO=$" + "-DNSC_RELWITHDEBINFO_BUILD_INFO=$" + "-DNSC_DEBUG_BUILD_INFO=$" + "-DOUTPUT_CONFIG_FILE=@OUTPUT_CONFIG_FILE@" + -P "@CMAKE_CURRENT_SOURCE_DIR@/ce-generate-config.cmake" + COMMAND_ERROR_IS_FATAL ANY ) -copy "%BASE_PATH%nsc\install\hlsl.local.properties.cmake" "%GIT_GODBOLT_REPOSITORY_PATH%\etc\config\hlsl.local.properties" -if %ERRORLEVEL% neq 0 ( - echo [ERROR] Failed to copy HLSL properties file - exit /b %ERRORLEVEL% +message(STATUS "Updating NSC package context") +execute_process(COMMAND "@CMAKE_COMMAND@" -E copy_directory_if_different + "$" + "@NBL_DOCKER_CTX_DIR@/Nabla" + COMMAND_ERROR_IS_FATAL ANY ) -echo [SUCCESS] All production files copied successfully. 
-exit /b 0 -]=] +message(STATUS "Building NSC Godbolt image") +string(TIMESTAMP BUILD_TIMESTAMP "%Y-%m-%dT%H:%M:%SZ" UTC) +execute_process(COMMAND "@DOCKER_EXE@" build --isolation process + --label=org.opencontainers.image.created="${BUILD_TIMESTAMP}" + -f "@DOCKERFILE@" -t @NSC_IMAGE_NAME@ "@NBL_DOCKER_CTX_DIR@" + COMMAND_ERROR_IS_FATAL ANY ) -string(CONFIGURE "${BAT_PRODUCTION_INSTALL}" BAT_PRODUCTION_INSTALL @ONLY) -file(WRITE "${NBL_DOCKER_CT_NSC_INSTALL_BAT}" "${BAT_PRODUCTION_INSTALL}") - -set(NBL_CE_URL http://localhost:80) -set(NBL_CE_HEALTHY_CHECK_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/ce_healthy_check.py") -set(NBL_CE_ENDPOINT_PY "${NBL_ROOT_PATH}/docker/compiler-explorer/endpoint.py") -set(NBL_NSC_BASIC_HLSL_JPAYLOAD "${CMAKE_CURRENT_SOURCE_DIR}/docker/godbolt/hlsl-basic-compile-payload.json") +message(STATUS "Running new NSC orphan container") +execute_process(COMMAND "@DOCKER_EXE@" run -di -p @NBL_CE_PUBLISH_PORT@:10240 --isolation process + --name "@ORPHAN@" --network docker_default + -v $ + -v $ + @NSC_IMAGE_NAME@ + COMMAND_ERROR_IS_FATAL ANY +) -add_custom_target(run-compiler-explorer - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Performing Pre-Test..." - COMMAND "${CTEST_EXE}" -C $ --stop-on-failure - COMMAND ${NBL_BUILD_INFO_POSTPROCESS_COMMAND} - COMMAND "${DOCKER_EXE}" compose -f "${NBL_DOCKER_CE_COMPOSE_TARGET}" stop ${COMPOSE_NSC_DEV_SERVICE} - COMMAND ${NBL_CE_GENERATE_CONFIG_COMMAND} - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "OK! Performing executables hot-swap..." - COMMAND "${CMAKE_COMMAND}" -E copy_directory "${NBL_NSC_PREINSTALL_DIRECTORY}" "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}" - COMMAND "${DOCKER_EXE}" compose -f "${NBL_DOCKER_CE_COMPOSE_TARGET}" up -d ${COMPOSE_NSC_DEV_SERVICE} - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Checking health of Compiler Explorer service..." - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 10 --ticks 25 - COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "Compiler Explorer is running, type \"localhost\" in your browser!" - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Post-Checking if NSC is able to compile basic shader file..." - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_$>_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" - COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "OK! NSC is healthy." 
- WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" - VERBATIM - USES_TERMINAL +message(STATUS "Healthy check") +execute_process(COMMAND "@_Python3_EXECUTABLE@" "@NBL_CE_HEALTHY_CHECK_PY@" + --url "@NBL_CE_URL@" --interval 5 --ticks 12 + COMMAND_ERROR_IS_FATAL ANY ) -add_custom_target(is-compiler-explorer-running - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --ticks 1 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compilers - VERBATIM - USES_TERMINAL +message(STATUS "Post Basic NSC shader compile check") +execute_process(COMMAND "@_Python3_EXECUTABLE@" "@NBL_CE_ENDPOINT_PY@" + --url "@NBL_CE_URL@" + --endpoint /api/compiler/nsc_$>_upstream/compile + --method POST --json "@NBL_NSC_BASIC_HLSL_JPAYLOAD@" + COMMAND_ERROR_IS_FATAL ANY ) -# Production NSC image -set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/docker/nsc-production") -set(BASE_IMAGE "${COMPOSE_NSC_ORPHAN_PRODUCTION_TEST_IMAGE}") -set(NBL_DOCKER_TMP_PRODUCTION_TARGET "C:\\target") -GEN_DOCKER_CONTENT("" "${OUTPUT_DIRECTORY}" -[=[ -LABEL maintainer="Arkadiusz Lachowicz " ` - org.opencontainers.image.authors="Arkadiusz Lachowicz " ` - org.opencontainers.image.title="Compiler Explorer with Nabla Shader Compilers in Docker" ` - org.opencontainers.image.description="Docker image to run Compiler Explorer instance with Nabla Shader Compilers" ` - org.opencontainers.image.url="https://github.com/Devsh-Graphics-Programming/Nabla" ` - org.opencontainers.image.source="https://github.com/Devsh-Graphics-Programming/Nabla" ` - org.opencontainers.image.documentation="https://github.com/Devsh-Graphics-Programming/Nabla/tree/master/tools/nsc/docker" - -ENTRYPOINT ["powershell.exe", "-ExecutionPolicy", "Bypass", "-Command", "npm", "--prefix", "$env:GIT_GODBOLT_REPOSITORY_PATH", "start", "--", "--language", "hlsl"] -]=] -[=[ - -]=] -nsc-ce-production-cache-webpack -nsc.ce.production.cache.webpack -${COMPOSE_NSC_PRODUCTION_CACHE_IMAGE} -OFF +message(STATUS "Printing NSC container logs") +execute_process(COMMAND "@DOCKER_EXE@" + logs "@ORPHAN@" + COMMAND_ERROR_IS_FATAL ANY ) -set(NBL_CE_URL http://localhost:6969) - -add_custom_target(create-production-compiler-explorer - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Removing any remaining pre-test orphan containers..." - COMMAND "${DOCKER_EXE}" rm -f production-ce-orphan-run-test || "${CMAKE_COMMAND}" -E true - COMMAND "${DOCKER_EXE}" rm -f production-ce-orphan-cache-webpack || "${CMAKE_COMMAND}" -E true - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Creating pre-test production image..." - COMMAND "${DOCKER_EXE}" run -dit -v "${NBL_DOCKER_CT_NSC_VOLUME_SOURCE}:${NBL_DOCKER_TMP_PRODUCTION_TARGET}" --name production-ce-orphan-run-test --entrypoint "cmd" "${COMPOSE_NSC_DEV_IMAGE}" - COMMAND "${DOCKER_EXE}" exec production-ce-orphan-run-test "${NBL_DOCKER_TMP_PRODUCTION_TARGET}\\${NBL_DOCKER_INSTALL_BAT_FILENAME}" - COMMAND "${DOCKER_EXE}" stop production-ce-orphan-run-test - COMMAND "${DOCKER_EXE}" commit -m "Copy NSC install redists" production-ce-orphan-run-test "${COMPOSE_NSC_ORPHAN_PRODUCTION_TEST_IMAGE}" - COMMAND "${DOCKER_EXE}" compose build - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Running pre-test production image, caching webpack & running final checks..." 
- COMMAND "${DOCKER_EXE}" run -dit -p 6969:10240 --name production-ce-orphan-cache-webpack "${COMPOSE_NSC_PRODUCTION_CACHE_IMAGE}" - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_HEALTHY_CHECK_PY}" --url "${NBL_CE_URL}" --interval 10 --ticks 35 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compilers --disable-cookies --timeout 69 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_release_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" --disable-cookies --timeout 69 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_relwithdebinfo_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" --disable-cookies --timeout 69 - COMMAND "${_Python3_EXECUTABLE}" "${NBL_CE_ENDPOINT_PY}" --url "${NBL_CE_URL}" --endpoint /api/compiler/nsc_debug_upstream/compile --method POST --json "${NBL_NSC_BASIC_HLSL_JPAYLOAD}" --disable-cookies --timeout 69 - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --blue "Passed all tests! Creating final production image..." - COMMAND "${DOCKER_EXE}" stop production-ce-orphan-cache-webpack - COMMAND "${DOCKER_EXE}" commit -m "Perform tests, cache webpack build" production-ce-orphan-cache-webpack "${COMPOSE_NSC_PRODUCTION_IMAGE}" - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "Created final `${COMPOSE_NSC_PRODUCTION_IMAGE}` production image!" - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "To run the production image, execute: 'docker run -p 80:10240 ${COMPOSE_NSC_PRODUCTION_IMAGE}'," - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "'docker run -p 80:10240 ${COMPOSE_NSC_PRODUCTION_IMAGE}'." - COMMAND "${CMAKE_COMMAND}" -E cmake_echo_color --green "The production image can be pushed safely to the public registry." - WORKING_DIRECTORY "${OUTPUT_DIRECTORY}" +message(STATUS "OK! NSC container is healthy.") +message(STATUS "Type \"@NBL_CE_URL@\" in your browser to use NSC with Godbolt!") +]=] INSTRUCTIONS @ONLY) + +set(SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/run-compiler-explorer-$.cmake") +file(GENERATE OUTPUT ${SCRIPT_FILE} CONTENT "${INSTRUCTIONS}") + +add_custom_target(run-compiler-explorer ALL + COMMAND "${CMAKE_COMMAND}" -P ${SCRIPT_FILE} VERBATIM - USES_TERMINAL + COMMAND_EXPAND_LISTS ) add_dependencies(run-compiler-explorer nsc) set_target_properties(run-compiler-explorer PROPERTIES FOLDER "Godbolt") -set_target_properties(is-compiler-explorer-running PROPERTIES FOLDER "Godbolt") -set_target_properties(create-production-compiler-explorer PROPERTIES FOLDER "Godbolt") endif() \ No newline at end of file diff --git a/tools/nsc/docker/README.md b/tools/nsc/docker/README.md index 21f8f4e06d..d44eea9f81 100644 --- a/tools/nsc/docker/README.md +++ b/tools/nsc/docker/README.md @@ -1,16 +1,105 @@ -# NSC Docker Godbolt +# NSC & Godbolt integration -## Run NSC tool straight from build directory in compiler explorer docker container! +## Run Compiler Explorer with NSC tool in docker container! -Currently only Windows platform with target *x86_64* architecture is supported. Tested with Hyper-V isolation mode. +https://github.com/user-attachments/assets/8d409477-92e4-4238-b5e5-637cfbdf7263 -### Requirements +

+Image Status · Build Status · License: Apache 2.0 · Join our Discord

-- [***Docker Desktop***](https://www.docker.com/products/docker-desktop/)
+## Requirements
-### How To
+- Configured [***Docker***](https://docs.docker.com/desktop/setup/install/windows-install/) for Windows Containers
+- [Windows, Windows Server Core or Windows Server]() with a **minimum** x86_64 build of 10.0.20348 (2022 distributions)
-Switch docker to windows containers, configure CMake with `NBL_ENABLE_DOCKER_INTEGRATION` option (recommended Visual Studio generator) & build `run-compiler-explorer` target. After the build completes type `localhost` in your browser.
+> [!TIP]
+> Type `cmd /ver` to see your build version
+
+> [!WARNING]
+> You cannot run it on Windows Home Edition as it doesn't have the `Containers` feature; visit the Microsoft [docs]() for more details
+
+> [!CAUTION]
+> Hyper-V isolation is **NOT** supported; you must run the NSC Godbolt container with process isolation
+
+## How to run image
+
+> [!IMPORTANT]
+> If you are using Docker Desktop, first make sure you have switched to `Containers for Windows`, see the image below. If you are a CLI user with a headless client & daemon, use the appropriate Windows build context. ![Containers for Windows](https://user-images.githubusercontent.com/65064509/152947300-affca592-35a7-4e4c-a7fc-2055ce1ba528.png)
+
+> [!CAUTION]
+> The examples below use `docker compose` to run the image, but if you want to `docker run` it instead, make sure to mount the required system directories and expose the port, otherwise it will fail at runtime; see the [compose]() file for more details
+
+### from container registry
+
+execute
+
+```powershell
+curl -L https://raw.githubusercontent.com/Devsh-Graphics-Programming/Nabla/master/compose.yml | docker compose -f - up
+```
+
+or in a Nabla checkout
+
+```powershell
+docker compose up
+```
+
+and type `localhost` in your browser.
+
+### from Nabla pipeline workflow artifacts
+
+> [!NOTE]
+> We publish container images to the GitHub Container Registry that include **only the Release variant** of NSC executables built with **MSVC**.
+> However, our CI pipelines **build and test all configurations**. Compressed images for each configuration are uploaded as **workflow artifacts**.
+> Look for artifacts named:
+> `-msvc--nsc-godbolt-image`
+
+> [!NOTE]
+> To decompress the image artifact you need [zstd]()
+
+Download the workflow image artifact, unzip it and
+
+```powershell
+zstd -d < -msvc--nsc-godbolt-image.tar.zst | docker load
+```
+
+<details>
+<summary>Docker load example (click to expand)</summary>
+
+```
+C:\Users\anastaziuk\Desktop\DevshGraphicsProgramming\Nabla\tools\nsc\docker>zstd -d < run-windows-17.13.6-msvc-Debug-nsc-godbolt-image.tar.zst | docker load
+b2ebf78c3627: Loading layer [==================================================>]  3.149MB/3.149MB
+4c201e14cc01: Loading layer [==================================================>]  77.4MB/77.4MB
+68a216251b8f: Loading layer [==================================================>]  61.95kB/61.95kB
+7a4e13ca4c4e: Loading layer [==================================================>]  52.74kB/52.74kB
+634001f55b21: Loading layer [==================================================>]  52.74kB/52.74kB
+6a609178bb9a: Loading layer [==================================================>]  52.74kB/52.74kB
+3d7afb042308: Loading layer [==================================================>]  52.74kB/52.74kB
+ca034d7bc58a: Loading layer [==================================================>]  52.74kB/52.74kB
+55b4134a1ae9: Loading layer [==================================================>]  52.74kB/52.74kB
+0648adff3faa: Loading layer [==================================================>]  52.74kB/52.74kB
+Loaded image: ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-build-msvc-debug-17.13.6
+```
+
+</details>
+
+copy `compose.yml` in the Nabla root directory to e.g. `override-compose.yml`, replace its `image` field value with the loaded image name (e.g. `ghcr.io/devsh-graphics-programming/nabla:nsc-godbolt-build-msvc-debug-17.13.6` like in the example), then execute
+
+```
+docker compose -f override-compose.yml up
+```
+
+and type `localhost` in your browser.
+
+## How to build image
+
+Configure CMake with `NBL_ENABLE_DOCKER_INTEGRATION` and build the `run-compiler-explorer` target.
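+
+A minimal sketch of that flow from a Nabla checkout, assuming a Visual Studio generator and a `build` output directory (the generator, directory and configuration below are illustrative, adjust them to your setup):
+
+```powershell
+# configure with the Docker/NSC Godbolt integration enabled
+cmake -S . -B build -G "Visual Studio 17 2022" -A x64 -DNBL_ENABLE_DOCKER_INTEGRATION=ON
+
+# build the target that packages NSC and brings up the Compiler Explorer container
+cmake --build build --target run-compiler-explorer --config Release
+```
+
+Once the target finishes, type `localhost` in your browser as described above.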