From c4c8a05926db0332a3a7238e753c030038ac6191 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Legat?= Date: Thu, 28 May 2026 21:47:39 +0200 Subject: [PATCH 1/5] Remove the need for allowscalar --- src/reverse_mode.jl | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/src/reverse_mode.jl b/src/reverse_mode.jl index f58125b..ad98c9e 100644 --- a/src/reverse_mode.jl +++ b/src/reverse_mode.jl @@ -92,8 +92,9 @@ function _reverse_mode(d::NLPEvaluator, x) end # Phase I for k in d.subexpression_order - d.subexpression_forward_values[k] = - _forward_eval(d.subexpressions[k], d, x) + _forward_eval(d.subexpressions[k], d, x) + # FIXME this assumes scalar output + d.subexpression_forward_values[k] = d.subexpressions[k].forward_storage[1] end if d.objective !== nothing _forward_eval(something(d.objective).expr, d, x) @@ -158,7 +159,7 @@ function _forward_eval( f::_SubexpressionStorage, d::NLPEvaluator, x::AbstractVector{T}, -)::T where {T} +) where {T} @assert length(f.forward_storage) >= length(f.nodes) @assert length(f.partials_storage) >= length(f.nodes) operators = d.data.operators @@ -633,10 +634,7 @@ function _forward_eval( f.partials_storage[rhs] = zero(T) end end - # Caller is responsible for reading the right range of `f.forward_storage` - # for vector-valued roots (use `_storage_range(f.sizes, 1)`); the scalar - # return is only meaningful when the root is scalar. - return f.forward_storage[1] + return end """ @@ -823,9 +821,9 @@ Reverse-mode evaluation of an expression tree given in `f`. * This function assumes that `f.reverse_storage` has been initialized with 0.0. """ function _reverse_eval( - f::_SubexpressionStorage, - seed::Union{Nothing,AbstractVector{Float64}} = nothing, -) + f::_SubexpressionStorage{T}, + seed::Union{Nothing,AbstractVector{T}} = nothing, +) where {T} @assert length(f.reverse_storage) >= _length(f.sizes) @assert length(f.partials_storage) >= _length(f.sizes) # f.nodes is already in order such that parents always appear before @@ -834,14 +832,10 @@ function _reverse_eval( children_arr = SparseArrays.rowvals(f.adj) root_range = _storage_range(f.sizes, 1) if seed === nothing - for i in root_range - f.reverse_storage[i] = one(Float64) - end + f.reverse_storage[root_range] .= one(T) else @assert length(seed) == length(root_range) - for (j, i) in enumerate(root_range) - f.reverse_storage[i] = seed[j] - end + f.reverse_storage[root_range] .= seed end for k in 1:length(f.nodes) node = f.nodes[k] @@ -1108,17 +1102,20 @@ function _reverse_eval( elseif op == :^ # We start with just .^2 to simplify @assert f.sizes.ndims[rhs] == 0 "Broadcasted ^ requires scalar exponent" - exp = _getscalar(f.forward_storage, f.sizes, rhs) - # To simplify, so we don't need to compute its derivative + # If it is a constant, we can just read it from the `const_values` and avoid a GPU->CPU communication + # We also don't need to compute its derivative + @assert f.nodes[rhs].type == NODE_VALUE + exponent = f.const_values[f.nodes[rhs].index] + @assert f.nodes[rhs].type == NODE_VALUE rev_parent = _view_linear(f.reverse_storage, f.sizes, k) rev_child = _view_linear(f.reverse_storage, f.sizes, lhs) - if exp == 2 + if exponent == 2 child = _view_linear(f.forward_storage, f.sizes, lhs) rev_child .= 2 .* child .* rev_parent - elseif exp == 1 + elseif exponent == 1 rev_child .= rev_parent else partial = From adef74a0176e205548f3c4b0fd2dd2bec97595f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Legat?= Date: Thu, 28 May 2026 21:48:26 +0200 Subject: [PATCH 2/5] Fixes --- perf/arraydiff.jl | 16 +++++++--------- src/reverse_mode.jl | 3 ++- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/perf/arraydiff.jl b/perf/arraydiff.jl index 9d5b3f5..fe3f850 100644 --- a/perf/arraydiff.jl +++ b/perf/arraydiff.jl @@ -68,10 +68,10 @@ trial; the timed block is just `MOI.eval_objective_gradient` plus a `CUDA.synchronize` when `gpu=true`. """ function neural( - ::Type{T}, - h::Int, - d::Int, - n::Int; + ::Type{T} = Float32, + h::Int = 4096, + d::Int = 13, + n::Int = 178; gpu::Bool = false, ) where {T<:Real} state = _build(T, h, d, n, gpu) @@ -90,9 +90,7 @@ function neural( return @benchmark( begin if $gpu - # `@allowscalar` covers residual scalar leaves the BLOCK - # rewrite can't fold; the hot path is bulk. - CUDA.@allowscalar MOI.eval_objective_gradient( + MOI.eval_objective_gradient( $state.evaluator, $g, $x, @@ -111,13 +109,13 @@ function profile_gpu(; T = Float32, h = 4096, d = 13, n = 178) x = CUDA.CuVector{T}(vec(state.W1)) g = CUDA.zeros(T, h * d) fill!(state.evaluator.backend.last_x, NaN) - CUDA.@sync CUDA.@allowscalar MOI.eval_objective_gradient( + CUDA.@sync MOI.eval_objective_gradient( state.evaluator, g, x, ) fill!(state.evaluator.backend.last_x, NaN) - return CUDA.@profile CUDA.@sync CUDA.@allowscalar MOI.eval_objective_gradient( + return CUDA.@profile CUDA.@sync MOI.eval_objective_gradient( state.evaluator, g, x, diff --git a/src/reverse_mode.jl b/src/reverse_mode.jl index ad98c9e..7dd620e 100644 --- a/src/reverse_mode.jl +++ b/src/reverse_mode.jl @@ -94,7 +94,8 @@ function _reverse_mode(d::NLPEvaluator, x) for k in d.subexpression_order _forward_eval(d.subexpressions[k], d, x) # FIXME this assumes scalar output - d.subexpression_forward_values[k] = d.subexpressions[k].forward_storage[1] + d.subexpression_forward_values[k] = + d.subexpressions[k].forward_storage[1] end if d.objective !== nothing _forward_eval(something(d.objective).expr, d, x) From 4ded416d67bfa9f988d948b94cb7762363878c65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Legat?= Date: Thu, 28 May 2026 22:16:03 +0200 Subject: [PATCH 3/5] Fix format --- src/mathoptinterface_api.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mathoptinterface_api.jl b/src/mathoptinterface_api.jl index 72d35bc..ceb09e3 100644 --- a/src/mathoptinterface_api.jl +++ b/src/mathoptinterface_api.jl @@ -256,8 +256,10 @@ end # Forward-evaluate subexpressions and the residual at `x`. function _forward_pass_residual!(d::NLPEvaluator, x) for k in d.subexpression_order + _forward_eval(d.subexpressions[k], d, x) + # FIXME this assumes scalar output d.subexpression_forward_values[k] = - _forward_eval(d.subexpressions[k], d, x) + d.subexpressions[k].forward_storage[1] end _forward_eval(something(d.residual).expr, d, x) return From 1ac55a20107ebb9eb2ee89223f4a6da40d5ae14d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Legat?= Date: Thu, 28 May 2026 22:16:25 +0200 Subject: [PATCH 4/5] Fix format --- perf/arraydiff.jl | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/perf/arraydiff.jl b/perf/arraydiff.jl index fe3f850..a5ab78e 100644 --- a/perf/arraydiff.jl +++ b/perf/arraydiff.jl @@ -90,11 +90,7 @@ function neural( return @benchmark( begin if $gpu - MOI.eval_objective_gradient( - $state.evaluator, - $g, - $x, - ) + MOI.eval_objective_gradient($state.evaluator, $g, $x) CUDA.synchronize() else MOI.eval_objective_gradient($state.evaluator, $g, $x) @@ -109,11 +105,7 @@ function profile_gpu(; T = Float32, h = 4096, d = 13, n = 178) x = CUDA.CuVector{T}(vec(state.W1)) g = CUDA.zeros(T, h * d) fill!(state.evaluator.backend.last_x, NaN) - CUDA.@sync MOI.eval_objective_gradient( - state.evaluator, - g, - x, - ) + CUDA.@sync MOI.eval_objective_gradient(state.evaluator, g, x) fill!(state.evaluator.backend.last_x, NaN) return CUDA.@profile CUDA.@sync MOI.eval_objective_gradient( state.evaluator, From 27eb29761606885e4c5c2912e791295e3710eb10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Legat?= Date: Thu, 28 May 2026 22:23:30 +0200 Subject: [PATCH 5/5] Remove last allowscalar --- test/Optimisers_GPU.jl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/test/Optimisers_GPU.jl b/test/Optimisers_GPU.jl index 762b8f9..61c1c70 100644 --- a/test/Optimisers_GPU.jl +++ b/test/Optimisers_GPU.jl @@ -50,11 +50,7 @@ function test_neural_optimisers_gpu() @objective(model, Min, loss) set_attribute(model, "max_iter", 20_000) set_attribute(model, "tol", 1e-6) - # The variable-load and gradient-extract paths still do scalar reads/writes - # against the GPU-resident tape (forward_storage, reverse_storage). Those - # are what `@allowscalar` permits. They will be batched in a follow-up; - # for now this test is a correctness check, not a performance benchmark. - CUDA.@allowscalar optimize!(model) + optimize!(model) @test objective_value(model) < 1e-3 return end