Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 7 additions & 17 deletions perf/arraydiff.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,10 @@ trial; the timed block is just `MOI.eval_objective_gradient` plus a
`CUDA.synchronize` when `gpu=true`.
"""
function neural(
::Type{T},
h::Int,
d::Int,
n::Int;
::Type{T} = Float32,
h::Int = 4096,
d::Int = 13,
n::Int = 178;
gpu::Bool = false,
) where {T<:Real}
state = _build(T, h, d, n, gpu)
Expand All @@ -90,13 +90,7 @@ function neural(
return @benchmark(
begin
if $gpu
# `@allowscalar` covers residual scalar leaves the BLOCK
# rewrite can't fold; the hot path is bulk.
CUDA.@allowscalar MOI.eval_objective_gradient(
$state.evaluator,
$g,
$x,
)
MOI.eval_objective_gradient($state.evaluator, $g, $x)
CUDA.synchronize()
else
MOI.eval_objective_gradient($state.evaluator, $g, $x)
Expand All @@ -111,13 +105,9 @@ function profile_gpu(; T = Float32, h = 4096, d = 13, n = 178)
x = CUDA.CuVector{T}(vec(state.W1))
g = CUDA.zeros(T, h * d)
fill!(state.evaluator.backend.last_x, NaN)
CUDA.@sync CUDA.@allowscalar MOI.eval_objective_gradient(
state.evaluator,
g,
x,
)
CUDA.@sync MOI.eval_objective_gradient(state.evaluator, g, x)
fill!(state.evaluator.backend.last_x, NaN)
return CUDA.@profile CUDA.@sync CUDA.@allowscalar MOI.eval_objective_gradient(
return CUDA.@profile CUDA.@sync MOI.eval_objective_gradient(
state.evaluator,
g,
x,
Expand Down
4 changes: 3 additions & 1 deletion src/mathoptinterface_api.jl
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,10 @@ end
# Forward-evaluate subexpressions and the residual at `x`.
function _forward_pass_residual!(d::NLPEvaluator, x)
for k in d.subexpression_order
_forward_eval(d.subexpressions[k], d, x)
# FIXME this assumes scalar output
d.subexpression_forward_values[k] =
_forward_eval(d.subexpressions[k], d, x)
d.subexpressions[k].forward_storage[1]
end
_forward_eval(something(d.residual).expr, d, x)
return
Expand Down
36 changes: 17 additions & 19 deletions src/reverse_mode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,10 @@ function _reverse_mode(d::NLPEvaluator, x)
end
# Phase I
for k in d.subexpression_order
_forward_eval(d.subexpressions[k], d, x)
# FIXME this assumes scalar output
d.subexpression_forward_values[k] =
_forward_eval(d.subexpressions[k], d, x)
d.subexpressions[k].forward_storage[1]
end
if d.objective !== nothing
_forward_eval(something(d.objective).expr, d, x)
Expand Down Expand Up @@ -158,7 +160,7 @@ function _forward_eval(
f::_SubexpressionStorage,
d::NLPEvaluator,
x::AbstractVector{T},
)::T where {T}
) where {T}
@assert length(f.forward_storage) >= length(f.nodes)
@assert length(f.partials_storage) >= length(f.nodes)
operators = d.data.operators
Expand Down Expand Up @@ -633,10 +635,7 @@ function _forward_eval(
f.partials_storage[rhs] = zero(T)
end
end
# Caller is responsible for reading the right range of `f.forward_storage`
# for vector-valued roots (use `_storage_range(f.sizes, 1)`); the scalar
# return is only meaningful when the root is scalar.
return f.forward_storage[1]
return
end

"""
Expand Down Expand Up @@ -823,9 +822,9 @@ Reverse-mode evaluation of an expression tree given in `f`.
* This function assumes that `f.reverse_storage` has been initialized with 0.0.
"""
function _reverse_eval(
f::_SubexpressionStorage,
seed::Union{Nothing,AbstractVector{Float64}} = nothing,
)
f::_SubexpressionStorage{T},
seed::Union{Nothing,AbstractVector{T}} = nothing,
) where {T}
@assert length(f.reverse_storage) >= _length(f.sizes)
@assert length(f.partials_storage) >= _length(f.sizes)
# f.nodes is already in order such that parents always appear before
Expand All @@ -834,14 +833,10 @@ function _reverse_eval(
children_arr = SparseArrays.rowvals(f.adj)
root_range = _storage_range(f.sizes, 1)
if seed === nothing
for i in root_range
f.reverse_storage[i] = one(Float64)
end
f.reverse_storage[root_range] .= one(T)
else
@assert length(seed) == length(root_range)
for (j, i) in enumerate(root_range)
f.reverse_storage[i] = seed[j]
end
f.reverse_storage[root_range] .= seed
end
for k in 1:length(f.nodes)
node = f.nodes[k]
Expand Down Expand Up @@ -1108,17 +1103,20 @@ function _reverse_eval(
elseif op == :^
# We start with just .^2 to simplify
@assert f.sizes.ndims[rhs] == 0 "Broadcasted ^ requires scalar exponent"
exp = _getscalar(f.forward_storage, f.sizes, rhs)
# To simplify, so we don't need to compute its derivative
# If it is a constant, we can just read it from the `const_values` and avoid a GPU->CPU communication
# We also don't need to compute its derivative
@assert f.nodes[rhs].type == NODE_VALUE
exponent = f.const_values[f.nodes[rhs].index]

@assert f.nodes[rhs].type == NODE_VALUE
rev_parent = _view_linear(f.reverse_storage, f.sizes, k)
rev_child =
_view_linear(f.reverse_storage, f.sizes, lhs)
if exp == 2
if exponent == 2
child =
_view_linear(f.forward_storage, f.sizes, lhs)
rev_child .= 2 .* child .* rev_parent
elseif exp == 1
elseif exponent == 1
rev_child .= rev_parent
else
partial =
Expand Down
6 changes: 1 addition & 5 deletions test/Optimisers_GPU.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,7 @@ function test_neural_optimisers_gpu()
@objective(model, Min, loss)
set_attribute(model, "max_iter", 20_000)
set_attribute(model, "tol", 1e-6)
# The variable-load and gradient-extract paths still do scalar reads/writes
# against the GPU-resident tape (forward_storage, reverse_storage). Those
# are what `@allowscalar` permits. They will be batched in a follow-up;
# for now this test is a correctness check, not a performance benchmark.
CUDA.@allowscalar optimize!(model)
optimize!(model)
@test objective_value(model) < 1e-3
return
end
Expand Down
Loading