From b7e311847ff4e0b34a5a3dfc893613bdc54993eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Legat?= Date: Wed, 27 May 2026 23:57:03 +0200 Subject: [PATCH 1/2] Add GPU profiles --- perf/arraydiff.jl | 20 ++++++++++++++++++++ perf/hand_cuda.jl | 24 ++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/perf/arraydiff.jl b/perf/arraydiff.jl index b79b44e..2581011 100644 --- a/perf/arraydiff.jl +++ b/perf/arraydiff.jl @@ -106,4 +106,24 @@ function neural( ) end +function profile_gpu( + ;T = Float32, h = 4096, d = 13, n = 178 +) + state = _build(T, h, d, n, true) + x = CUDA.CuVector{T}(vec(state.W1)) + g = CUDA.zeros(T, h * d) + fill!(state.evaluator.backend.last_x, NaN) + CUDA.@sync CUDA.@allowscalar MOI.eval_objective_gradient( + state.evaluator, + g, + x, + ) + fill!(state.evaluator.backend.last_x, NaN) + return CUDA.@profile CUDA.@sync CUDA.@allowscalar MOI.eval_objective_gradient( + state.evaluator, + g, + x, + ) +end + end # module diff --git a/perf/hand_cuda.jl b/perf/hand_cuda.jl index b69de98..0149fde 100644 --- a/perf/hand_cuda.jl +++ b/perf/hand_cuda.jl @@ -111,4 +111,28 @@ function neural( end end +function profile_gpu(; + T = Float32, h = 4096, d = 13, n = 178, + prealloc::Bool = true, +) + Random.seed!(0) + W1 = randn(T, h, d) + W2 = randn(T, OUT_DIM, h) + X = randn(T, d, n) + y = randn(T, OUT_DIM, n) + W1g, W2g, Xg, yg = CuArray(W1), CuArray(W2), CuArray(X), CuArray(y) + CUDA.synchronize() + CUDA.@sync if prealloc + gradient!(Buffers{(typeof(W1g))}(h, d, n), W1g, W2g, Xg, yg) + else + gradient_alloc(W1g, W2g, Xg, yg) + end + return CUDA.@profile CUDA.@sync if prealloc + gradient!(Buffers{(typeof(W1g))}(h, d, n), W1g, W2g, Xg, yg) + else + gradient_alloc(W1g, W2g, Xg, yg) + end +end + + end # module From 8bc418dbc1842d75702e6224005d6e629b1c3da3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Legat?= Date: Thu, 28 May 2026 20:30:05 +0200 Subject: [PATCH 2/2] Fix format --- perf/arraydiff.jl | 4 +--- perf/hand_cuda.jl | 6 ++++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/perf/arraydiff.jl b/perf/arraydiff.jl index 2581011..9d5b3f5 100644 --- a/perf/arraydiff.jl +++ b/perf/arraydiff.jl @@ -106,9 +106,7 @@ function neural( ) end -function profile_gpu( - ;T = Float32, h = 4096, d = 13, n = 178 -) +function profile_gpu(; T = Float32, h = 4096, d = 13, n = 178) state = _build(T, h, d, n, true) x = CUDA.CuVector{T}(vec(state.W1)) g = CUDA.zeros(T, h * d) diff --git a/perf/hand_cuda.jl b/perf/hand_cuda.jl index 0149fde..5dd8d68 100644 --- a/perf/hand_cuda.jl +++ b/perf/hand_cuda.jl @@ -112,7 +112,10 @@ function neural( end function profile_gpu(; - T = Float32, h = 4096, d = 13, n = 178, + T = Float32, + h = 4096, + d = 13, + n = 178, prealloc::Bool = true, ) Random.seed!(0) @@ -134,5 +137,4 @@ function profile_gpu(; end end - end # module