diff --git a/perf/arraydiff.jl b/perf/arraydiff.jl
index b79b44e..9d5b3f5 100644
--- a/perf/arraydiff.jl
+++ b/perf/arraydiff.jl
@@ -106,4 +106,33 @@ function neural(
     )
 end
 
+"""
+    profile_gpu(; T = Float32, h = 4096, d = 13, n = 178)
+
+Evaluate the objective gradient of the state returned by
+`_build(T, h, d, n, true)` twice on CUDA arrays: once unprofiled, then once
+under `CUDA.@profile`. Returns whatever `CUDA.@profile` yields for the second run.
+"""
+function profile_gpu(; T = Float32, h = 4096, d = 13, n = 178)
+    state = _build(T, h, d, n, true)
+    x = CUDA.CuVector{T}(vec(state.W1))
+    g = CUDA.zeros(T, h * d)
+    # NOTE(review): filling `last_x` with NaN presumably invalidates a cached
+    # evaluation point so the call below recomputes everything -- confirm.
+    fill!(state.evaluator.backend.last_x, NaN)
+    # Unprofiled first call; presumably a warm-up that keeps one-time
+    # compilation out of the profile.
+    CUDA.@sync CUDA.@allowscalar MOI.eval_objective_gradient(
+        state.evaluator,
+        g,
+        x,
+    )
+    fill!(state.evaluator.backend.last_x, NaN)
+    return CUDA.@profile CUDA.@sync CUDA.@allowscalar MOI.eval_objective_gradient(
+        state.evaluator,
+        g,
+        x,
+    )
+end
+
 end # module
diff --git a/perf/hand_cuda.jl b/perf/hand_cuda.jl
index b69de98..5dd8d68 100644
--- a/perf/hand_cuda.jl
+++ b/perf/hand_cuda.jl
@@ -111,4 +111,43 @@ function neural(
     end
 end
 
+"""
+    profile_gpu(; T = Float32, h = 4096, d = 13, n = 178, prealloc = true)
+
+Profile one gradient computation on seeded random data with `CUDA.@profile`,
+after one unprofiled run of the same call. With `prealloc = true` the run uses
+`gradient!` and a `Buffers` workspace; otherwise it uses `gradient_alloc`.
+Returns whatever `CUDA.@profile` yields.
+"""
+function profile_gpu(;
+    T = Float32,
+    h = 4096,
+    d = 13,
+    n = 178,
+    prealloc::Bool = true,
+)
+    Random.seed!(0)
+    W1 = randn(T, h, d)
+    W2 = randn(T, OUT_DIM, h)
+    X = randn(T, d, n)
+    y = randn(T, OUT_DIM, n)
+    W1g, W2g, Xg, yg = CuArray(W1), CuArray(W2), CuArray(X), CuArray(y)
+    # Construct the workspace once, before the profiled region, so that the
+    # `prealloc = true` trace does not include building `Buffers`.
+    buffers = prealloc ? Buffers{typeof(W1g)}(h, d, n) : nothing
+    CUDA.synchronize()
+    # Unprofiled first run; presumably a warm-up that keeps one-time
+    # compilation out of the profile.
+    CUDA.@sync if prealloc
+        gradient!(buffers, W1g, W2g, Xg, yg)
+    else
+        gradient_alloc(W1g, W2g, Xg, yg)
+    end
+    return CUDA.@profile CUDA.@sync if prealloc
+        gradient!(buffers, W1g, W2g, Xg, yg)
+    else
+        gradient_alloc(W1g, W2g, Xg, yg)
+    end
+end
+
 end # module