diff --git a/Project.toml b/Project.toml index fcd1eee47..f853d714c 100644 --- a/Project.toml +++ b/Project.toml @@ -15,15 +15,12 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SuiteSparse_jll = "bea87d4a-7f5b-5778-9afe-8cc45184846c" [weakdeps] -AppleAccelerate = "13e28ba4-7ad8-5781-acae-3021b1ed3924" Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2" [extensions] -AppleAccelerateExt = "AppleAccelerate" MKLPardisoExt = "Pardiso" [compat] -AppleAccelerate = "^0.4" DataStructures = "^0.19" DocStringExtensions = "~0.8, ~0.9" HDF5 = "0.17" diff --git a/docs/src/reference/internals.md b/docs/src/reference/internals.md index cf5b63078..e3cc12b85 100644 --- a/docs/src/reference/internals.md +++ b/docs/src/reference/internals.md @@ -14,3 +14,14 @@ symbols are exported from `PowerNetworkMatrices`. ```@autodocs Modules = [PowerNetworkMatrices.KLUWrapper] ``` + +## `AccelerateWrapper` + +`PowerNetworkMatrices.AccelerateWrapper` is a thin, allocation-aware wrapper over Apple's +`libSparse.dylib` (provided by the system Accelerate framework) used internally for sparse +linear solves on macOS. Non-Apple builds load stub fallbacks that throw on use. None of +these symbols are exported from `PowerNetworkMatrices`. + +```@autodocs +Modules = [PowerNetworkMatrices.AccelerateWrapper] +``` diff --git a/ext/AppleAccelerateExt.jl b/ext/AppleAccelerateExt.jl deleted file mode 100644 index 8b2e48345..000000000 --- a/ext/AppleAccelerateExt.jl +++ /dev/null @@ -1,112 +0,0 @@ -module AppleAccelerateExt - -import PowerNetworkMatrices as PNM -using AppleAccelerate -import SparseArrays -import LinearAlgebra - -# Extend the factorization creation function -function PNM._create_apple_accelerate_factorization(ABA) - K = AppleAccelerate.AAFactorization(ABA) - AppleAccelerate.factor!(K, AppleAccelerate.SparseFactorizationLDLT) - return K -end - -# Extend the solve function for AppleAccelerate factorizations -function PNM._solve_factorization( - K::AppleAccelerate.AAFactorization{Float64}, - b::Vector{Float64}, -) - return AppleAccelerate.solve(K, b) -end - -""" -Function for internal use only. - -Computes the PTDF matrix by means of AppleAccelerate for sparse matrices. - -# Arguments -- `A::SparseArrays.SparseMatrixCSC{Int8, Int}`: - Incidence Matrix -- `BA::SparseArrays.SparseMatrixCSC{Float64, Int}`: - BA matrix -- `ref_bus_positions::Set{Int}`: - vector containing the indexes of the reference slack buses. -- `dist_slack::Vector{Float64}`: - vector containing the weights for the distributed slacks. -""" -function PNM._calculate_PTDF_matrix_AppleAccelerate( - A::SparseArrays.SparseMatrixCSC{Int8, Int}, - BA::SparseArrays.SparseMatrixCSC{Float64, Int}, - ref_bus_positions::Set{Int}, - dist_slack::Vector{Float64}) - @warn "AppleAccelerate solver is experimental and may produce unexpected results. If you need high confidence use KLU" - linecount = size(BA, 2) - buscount = size(BA, 1) - - ABA = PNM.calculate_ABA_matrix(A, BA, ref_bus_positions) - K = AppleAccelerate.AAFactorization(ABA) - AppleAccelerate.factor!(K, AppleAccelerate.SparseFactorizationLDLT) - - # initialize matrices for evaluation - valid_ix = setdiff(1:buscount, ref_bus_positions) - PTDFm_t = zeros(buscount, linecount) - copyto!(PTDFm_t, BA) - if !isempty(dist_slack) && length(ref_bus_positions) != 1 - error( - "Distributed slack is not supported for systems with multiple reference buses.", - ) - elseif isempty(dist_slack) && length(ref_bus_positions) < buscount - PTDFm_t[valid_ix, :] = AppleAccelerate.solve(K, PTDFm_t[valid_ix, :]) - PTDFm_t[collect(ref_bus_positions), :] .= 0.0 - return PTDFm_t - elseif length(dist_slack) == buscount - @info "Distributed bus" - PTDFm_t[valid_ix, :] = AppleAccelerate.solve(K, PTDFm_t[valid_ix, :]) - PTDFm_t[collect(ref_bus_positions), :] .= 0.0 - slack_array = dist_slack / sum(dist_slack) - slack_array = reshape(slack_array, 1, buscount) - return PTDFm_t .- (slack_array * PTDFm_t) - else - error("Distributed bus specification doesn't match the number of buses.") - end - - return -end - -""" -Function for internal use only. - -Computes the LODF matrix by means of AppleAccelerate for sparse matrices. - -# Arguments -- `a::SparseArrays.SparseMatrixCSC{Int8, Int}`: - Incidence Matrix -- `ptdf::Matrix{Float64}`: - PTDF matrix -""" -function PNM._calculate_LODF_matrix_AppleAccelerate( - a::SparseArrays.SparseMatrixCSC{Int8, Int}, - ptdf::Matrix{Float64}, -) - linecount = size(ptdf, 2) - ptdf_denominator_t = a * ptdf - m_I = Int[] - m_V = Float64[] - for iline in 1:linecount - if (1.0 - ptdf_denominator_t[iline, iline]) < PNM.LODF_ENTRY_TOLERANCE - push!(m_I, iline) - push!(m_V, 1.0) - else - push!(m_I, iline) - push!(m_V, 1 - ptdf_denominator_t[iline, iline]) - end - end - Dem_LU = AppleAccelerate.AAFactorization(SparseArrays.sparse(m_I, m_I, m_V)) - lodf_t = AppleAccelerate.solve(Dem_LU, ptdf_denominator_t) - lodf_t[LinearAlgebra.diagind(lodf_t)] .= -1.0 - - return lodf_t -end - -end # module diff --git a/ext/MKLPardisoExt.jl b/ext/MKLPardisoExt.jl index ab9d02e04..1f124a47d 100644 --- a/ext/MKLPardisoExt.jl +++ b/ext/MKLPardisoExt.jl @@ -171,28 +171,15 @@ function PNM._calculate_LODF_matrix_MKLPardiso( a::SparseArrays.SparseMatrixCSC{Int8, Int}, ptdf::Matrix{Float64}, ) + # The demand matrix `diag(1 - PTDF·A[i,i])` is diagonal, so the + # "solve" is a row-wise element-wise division. The Pardiso path that + # used to factor it and back-solve via `_pardiso_sequential_LODF!` / + # `_pardiso_single_LODF!` is no longer needed for this stage. linecount = size(ptdf, 2) ptdf_denominator_t = a * ptdf - m_I = Int[] - m_V = Float64[] - for iline in 1:linecount - if (1.0 - ptdf_denominator_t[iline, iline]) < PNM.LODF_ENTRY_TOLERANCE - push!(m_I, iline) - push!(m_V, 1.0) - else - push!(m_I, iline) - push!(m_V, 1 - ptdf_denominator_t[iline, iline]) - end - end - lodf_t = zeros(linecount, linecount) - A = SparseArrays.sparse(m_I, m_I, m_V) - if linecount > PNM.DEFAULT_LODF_CHUNK_SIZE - PNM._pardiso_sequential_LODF!(lodf_t, A, ptdf_denominator_t) - else - PNM._pardiso_single_LODF!(lodf_t, A, ptdf_denominator_t) - end - lodf_t[LinearAlgebra.diagind(lodf_t)] .= -1.0 - return lodf_t + m_V = PNM._build_lodf_demand(ptdf_denominator_t, linecount) + PNM._apply_lodf_demand!(ptdf_denominator_t, m_V) + return ptdf_denominator_t end end # module diff --git a/src/AccelerateWrapper/AccelerateWrapper.jl b/src/AccelerateWrapper/AccelerateWrapper.jl new file mode 100644 index 000000000..96d4f225a --- /dev/null +++ b/src/AccelerateWrapper/AccelerateWrapper.jl @@ -0,0 +1,72 @@ +""" + AccelerateWrapper + +A small, allocation-aware wrapper over Apple's `libSparse.dylib` +(`/System/Library/Frameworks/Accelerate.framework/.../libSparse.dylib`) +designed for the access patterns of `PowerNetworkMatrices`: + +- Cache the symbolic and numeric factorizations of a symmetric sparse matrix + (LDLT by default) and reuse them across many solves. +- Refresh the numeric factor (`numeric_refactor!`) while keeping the symbolic + analysis, without re-allocating the structural arrays. +- Solve dense and **sparse** right-hand sides in place, with the sparse path + packing only non-empty RHS columns into a bounded scratch block. +- Compute `A·X` and `A·x` directly via libSparse's `SparseMultiply`. + +This module is intentionally lighter than the upstream `AppleAccelerate.jl` +package: it owns no high-level Julia wrappers over libSparse, exposes the +symbolic/numeric split directly, binds only the entry points used by PNM, +and is compile-gated to macOS so non-Apple builds never codegen the +`@ccall` sites. +""" +module AccelerateWrapper + +import SparseArrays +import SparseArrays: SparseMatrixCSC, getcolptr, rowvals, nonzeros, nzrange +import LinearAlgebra + +export AAFactorCache, + aa_factorize, + symbolic_factor!, + numeric_refactor!, + full_factor!, + full_refactor!, + solve!, + solve_sparse!, + solve_sparse, + is_factored, + aa_spmm!, + aa_spmv! + +@static if Sys.isapple() + include("libsparse_bindings.jl") + include("aa_cache.jl") + include("solve_dense.jl") + include("solve_sparse_rhs.jl") + include("spmm.jl") +else + # Stub layer. Non-Apple builds never bind libSparse symbols, never codegen + # the `@ccall` sites, and never instantiate `SparseOpaqueFactorization`. + # The whole submodule reduces to these short bodies on Linux/Windows. + struct AAFactorCache end + + _unavailable() = error( + "AccelerateWrapper is macOS-only (Sys.isapple() returned false). " * + "Use the KLU backend on non-Apple platforms.", + ) + + AAFactorCache(args...; kwargs...) = _unavailable() + aa_factorize(args...; kwargs...) = _unavailable() + symbolic_factor!(args...) = _unavailable() + numeric_refactor!(args...) = _unavailable() + full_factor!(args...) = _unavailable() + full_refactor!(args...) = _unavailable() + solve!(args...) = _unavailable() + solve_sparse!(args...; kwargs...) = _unavailable() + solve_sparse(args...; kwargs...) = _unavailable() + is_factored(::AAFactorCache) = false + aa_spmm!(args...) = _unavailable() + aa_spmv!(args...) = _unavailable() +end + +end # module diff --git a/src/AccelerateWrapper/aa_cache.jl b/src/AccelerateWrapper/aa_cache.jl new file mode 100644 index 000000000..3723c62d3 --- /dev/null +++ b/src/AccelerateWrapper/aa_cache.jl @@ -0,0 +1,409 @@ +""" +A cached libSparse linear solver for repeated solves against the same +symmetric sparse matrix structure. `numeric_refactor!` and `solve!` allocate +nothing once the cache is built. + +Float64 only. The factor type defaults to `SparseFactorizationLDLT` (pivoted +Bunch-Kaufman), which matches what the old `AppleAccelerateExt.jl` used. + +`reuse_symbolic` controls whether `symbolic_refactor!` keeps the analysis; +`check_pattern` adds a structural-equality check on refactor calls and is +only consulted when reusing. `check_symmetry` enforces structural symmetry of +the input pattern on the analysis path (constructor and `symbolic_factor!`); +disable only when the caller can guarantee symmetry by construction. +""" +mutable struct AAFactorCache + # Apple-side 0-based, narrower-integer copies of the input CSC pattern. + # Strip to lower triangle once during `symbolic_factor!`; subsequent + # `numeric_refactor!` calls reuse these arrays as-is. + columnStarts::Vector{Clong} + rowIndices::Vector{Cint} + nzval::Vector{Cdouble} + n::Int + nnz_tri::Int + factorization_type::SparseFactorization_t + symbolic::SparseOpaqueSymbolicFactorization + numeric::SparseOpaqueFactorization_t + reuse_symbolic::Bool + check_pattern::Bool + check_symmetry::Bool + # Bounded reusable scratch for `solve_sparse!`. Lazy-grown on first call. + scratch::Matrix{Cdouble} + col_map::Vector{Int} +end + +@inline _dim(cache::AAFactorCache) = cache.n + +Base.size(cache::AAFactorCache) = (cache.n, cache.n) +Base.size(cache::AAFactorCache, d::Integer) = d <= 2 ? cache.n : 1 +Base.eltype(::Type{AAFactorCache}) = Cdouble + +""" + is_factored(cache::AAFactorCache) -> Bool + +`true` when `cache` holds a valid numeric factorization ready for `solve!` / +`solve_sparse!`. `false` after construction (before `full_factor!`) or after +the libSparse handles have been finalized. +""" +function is_factored(cache::AAFactorCache) + return cache.numeric.status == SparseStatusOk && cache.symbolic.status == SparseStatusOk +end + +""" + AAFactorCache(A; reuse_symbolic=true, check_pattern=true, + check_symmetry=true, + factorization_type=SparseFactorizationLDLT) + +Build a cache for the symmetric sparse matrix `A`. Allocates the Apple-side +structural arrays (`columnStarts`, `rowIndices`, `nzval`) sized to the lower +triangle of `A` but does **not** factorize. Call `full_factor!` (or +`symbolic_factor!` followed by `numeric_refactor!`) before `solve!`. + +A finalizer frees libSparse handles on GC; call `Base.finalize(cache)` to +release them eagerly. + +`check_symmetry=true` (default) verifies that `A`'s nonzero pattern is +structurally symmetric on construction and on each `symbolic_factor!`. Pass +`check_symmetry=false` only when the caller can guarantee symmetry by +construction (e.g. `ABA = Aᵀ B A`). +""" +function AAFactorCache( + A::SparseMatrixCSC{Float64, Int}; + reuse_symbolic::Bool = true, + check_pattern::Bool = true, + check_symmetry::Bool = true, + factorization_type::SparseFactorization_t = SparseFactorizationLDLT, +) + n = size(A, 1) + n == size(A, 2) || throw(DimensionMismatch("matrix must be square; got $(size(A))")) + if check_symmetry + _assert_structural_symmetry(A) + end + + nnz_tri = _count_lower_triangle(A) + cache = AAFactorCache( + Vector{Clong}(undef, n + 1), + Vector{Cint}(undef, nnz_tri), + Vector{Cdouble}(undef, 0), + n, + nnz_tri, + factorization_type, + _null_symbolic(), + _null_factorization(), + reuse_symbolic, + check_pattern, + check_symmetry, + Matrix{Cdouble}(undef, 0, 0), + Int[], + ) + _populate_lower_triangle_pattern!(cache, A) + finalizer(_free_handles!, cache) + return cache +end + +# Structural-symmetry check: for every nonzero at (i, j) with i != j, require +# a mirror nonzero at (j, i). Throws on first violation. O(nnz · avg-col-len). +function _assert_structural_symmetry(A::SparseMatrixCSC{Float64, Int}) + n = size(A, 1) + rowval = rowvals(A) + @inbounds for j in 1:n + for p in nzrange(A, j) + i = rowval[p] + i == j && continue + mirror = false + for q in nzrange(A, i) + if rowval[q] == j + mirror = true + break + end + end + if !mirror + throw( + ArgumentError( + "AAFactorCache: input matrix is not structurally symmetric " * + "(nonzero at ($i, $j) has no mirror at ($j, $i)). " * + "Pass `check_symmetry=false` to bypass if the caller " * + "guarantees symmetry by construction.", + ), + ) + end + end + end + return nothing +end + +# Count nonzeros in the lower triangle (i >= j) of A. +function _count_lower_triangle(A::SparseMatrixCSC{Float64, Int}) + cnt = 0 + rowval = rowvals(A) + @inbounds for j in 1:size(A, 2) + for p in nzrange(A, j) + rowval[p] >= j && (cnt += 1) + end + end + return cnt +end + +# Strip A to its lower triangle and fill `cache.columnStarts` / `cache.rowIndices` +# with 0-based indices. Caller must have sized these to (n+1) and nnz_tri. +function _populate_lower_triangle_pattern!( + cache::AAFactorCache, + A::SparseMatrixCSC{Float64, Int}, +) + rowval = rowvals(A) + n = cache.n + pos = 0 + cache.columnStarts[1] = 0 + @inbounds for j in 1:n + for p in nzrange(A, j) + if rowval[p] >= j + pos += 1 + cache.rowIndices[pos] = Cint(rowval[p] - 1) + end + end + cache.columnStarts[j + 1] = Clong(pos) + end + return cache +end + +# Snapshot the lower-triangle values into `cache.nzval`, growing if needed. +function _populate_lower_triangle_values!( + cache::AAFactorCache, + A::SparseMatrixCSC{Float64, Int}, +) + if length(cache.nzval) != cache.nnz_tri + resize!(cache.nzval, cache.nnz_tri) + end + rowval = rowvals(A) + nzv = nonzeros(A) + pos = 0 + @inbounds for j in 1:size(A, 2) + for p in nzrange(A, j) + if rowval[p] >= j + pos += 1 + cache.nzval[pos] = nzv[p] + end + end + end + return cache +end + +# Pattern-match guard for `numeric_refactor!`: assert the incoming CSC's +# lower-triangle pattern is identical to what was analyzed. Same shape as +# KLU's `_check_pattern_match`, just without the off-by-one increment dance +# (Apple's arrays are 0-based but we always store them 0-based — no flipping). +function _check_pattern_match( + cache::AAFactorCache, + A::SparseMatrixCSC{Float64, Int}, + op::AbstractString, +) + n = cache.n + if size(A, 1) != n || size(A, 2) != n + throw(DimensionMismatch("Cannot $op: cache is $(n)×$(n) but A is $(size(A)).")) + end + rowval = rowvals(A) + pos = 0 + @inbounds for j in 1:n + cache.columnStarts[j] == Clong(pos) || return _pattern_mismatch(op) + for p in nzrange(A, j) + if rowval[p] >= j + pos += 1 + pos > cache.nnz_tri && return _pattern_mismatch(op) + cache.rowIndices[pos] == Cint(rowval[p] - 1) || return _pattern_mismatch(op) + end + end + cache.columnStarts[j + 1] == Clong(pos) || return _pattern_mismatch(op) + end + pos == cache.nnz_tri || return _pattern_mismatch(op) + return nothing +end + +_pattern_mismatch(op::AbstractString) = + throw(ArgumentError("Cannot $op: matrix has different sparsity structure.")) + +""" +Release the libSparse numeric and symbolic handles held by `cache`, leaving +Julia-side state intact. Idempotent. +""" +function _free_handles!(cache::AAFactorCache) + if cache.numeric.status == SparseStatusOk + _sparse_cleanup_factor!(cache.numeric) + cache.numeric = _null_factorization() + end + if cache.symbolic.status == SparseStatusOk + _sparse_cleanup_symbolic!(cache.symbolic) + cache.symbolic = _null_symbolic() + end + return nothing +end + +Base.finalize(cache::AAFactorCache) = _free_handles!(cache) + +# Build the libSparse `SparseMatrixStructure` view that points into the +# cache's owned arrays. Marked symmetric + lower triangle so the +# `SparseFactorizationLDLT` path treats `cache.nzval` as the lower-triangle +# entries of a symmetric matrix. +function _structure_view(cache::AAFactorCache) + return SparseMatrixStructure( + Cint(cache.n), + Cint(cache.n), + pointer(cache.columnStarts), + pointer(cache.rowIndices), + ATT_SYMMETRIC | ATT_LOWER_TRIANGLE, + UInt8(1), + ) +end + +function _matrix_view(cache::AAFactorCache) + return SparseMatrix_t(_structure_view(cache), pointer(cache.nzval)) +end + +""" + symbolic_factor!(cache, A) + +Free any cached symbolic/numeric factor, replace the structural arrays with +`A`'s lower-triangle pattern, and analyze. Subsequent `numeric_refactor!` +calls reuse the analysis. +""" +function symbolic_factor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int}) + n = cache.n + if size(A, 1) != n || size(A, 2) != n + throw(DimensionMismatch("Cannot factor: cache is $(n)×$(n) but A is $(size(A)).")) + end + if cache.check_symmetry + _assert_structural_symmetry(A) + end + _free_handles!(cache) + new_nnz_tri = _count_lower_triangle(A) + if new_nnz_tri != cache.nnz_tri + resize!(cache.rowIndices, new_nnz_tri) + cache.nnz_tri = new_nnz_tri + end + if length(cache.columnStarts) != n + 1 + resize!(cache.columnStarts, n + 1) + end + _populate_lower_triangle_pattern!(cache, A) + sym = _sparse_symbolic_factor( + cache.factorization_type, + _structure_view(cache), + SparseSymbolicFactorOptions(), + ) + if sym.status != SparseStatusOk + # libSparse may have allocated C-side state before deciding to fail — + # release it before throwing so we don't leak per failed factor. + _sparse_cleanup_symbolic!(sym) + _libsparse_throw(sym.status, "symbolic factor") + end + cache.symbolic = sym + return cache +end + +""" + numeric_refactor!(cache, A) + +Refresh the numeric factor on top of the existing symbolic analysis. Errors +if `symbolic_factor!` has not been called yet. +""" +function numeric_refactor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int}) + cache.symbolic.status == SparseStatusOk || + error("AAFactorCache: call symbolic_factor! before numeric_refactor!.") + cache.check_pattern && _check_pattern_match(cache, A, "numeric_refactor") + _populate_lower_triangle_values!(cache, A) + if cache.numeric.status == SparseStatusOk + _sparse_cleanup_factor!(cache.numeric) + cache.numeric = _null_factorization() + end + num = _sparse_numeric_factor( + cache.symbolic, + _matrix_view(cache), + SparseNumericFactorOptions(), + ) + if num.status != SparseStatusOk + # Same rationale as in symbolic_factor!: release before throwing. + _sparse_cleanup_factor!(num) + _libsparse_throw(num.status, "numeric factor") + end + cache.numeric = num + return cache +end + +""" + symbolic_refactor!(cache, A) + +If `cache.reuse_symbolic`, optionally verify the structure matches and reuse +the existing analysis. Otherwise, rerun `symbolic_factor!`. +""" +function symbolic_refactor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int}) + if !cache.reuse_symbolic + return symbolic_factor!(cache, A) + end + cache.check_pattern && _check_pattern_match(cache, A, "symbolic_refactor") + return cache +end + +""" + full_factor!(cache, A) -> cache + +Run a fresh symbolic analysis followed by a numeric factorization on `A`. +""" +function full_factor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int}) + symbolic_factor!(cache, A) + numeric_refactor!(cache, A) + return cache +end + +""" + full_refactor!(cache, A) -> cache + +Refresh both factorizations on `A`. Defers to `symbolic_refactor!` (which +reuses the existing analysis when `cache.reuse_symbolic` is set) followed by +`numeric_refactor!`. +""" +function full_refactor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int}) + symbolic_refactor!(cache, A) + numeric_refactor!(cache, A) + return cache +end + +""" + aa_factorize(A; reuse_symbolic=true, check_pattern=true, + check_symmetry=true, + factorization_type=SparseFactorizationLDLT) -> AAFactorCache + +Build a cache for `A` and immediately compute the full factorization. See +`AAFactorCache` for the kwarg semantics, including `check_symmetry`. +""" +function aa_factorize( + A::SparseMatrixCSC{Float64, Int}; + reuse_symbolic::Bool = true, + check_pattern::Bool = true, + check_symmetry::Bool = true, + factorization_type::SparseFactorization_t = SparseFactorizationLDLT, +) + cache = AAFactorCache( + A; + reuse_symbolic = reuse_symbolic, + check_pattern = check_pattern, + check_symmetry = check_symmetry, + factorization_type = factorization_type, + ) + return full_factor!(cache, A) +end + +""" + _ensure_scratch!(cache, block) -> Nothing + +Ensure `cache.scratch` is at least `n × block` and `cache.col_map` length +`block`. Used by `solve_sparse!`. +""" +@inline function _ensure_scratch!(cache::AAFactorCache, block::Int) + n = cache.n + s = cache.scratch + if size(s, 1) != n || size(s, 2) < block + cache.scratch = Matrix{Cdouble}(undef, n, block) + end + if length(cache.col_map) < block + resize!(cache.col_map, block) + end + return nothing +end diff --git a/src/AccelerateWrapper/libsparse_bindings.jl b/src/AccelerateWrapper/libsparse_bindings.jl new file mode 100644 index 000000000..1c85491e4 --- /dev/null +++ b/src/AccelerateWrapper/libsparse_bindings.jl @@ -0,0 +1,320 @@ +# Direct bindings into Apple's libSparse.dylib (part of the Accelerate +# framework). Only the entry points actually consumed by PowerNetworkMatrices +# are wrapped — Float64-only, no Float32 mangled aliases, no QR / Cholesky-AtA +# variants. Mangled names match what AppleAccelerate.jl uses; see +# `/Library/Developer/CommandLineTools/SDKs/MacOSX*.sdk/System/Library/ +# Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/ +# Versions/A/Headers/Sparse/Solve.h` for the C declarations. + +const LIBSPARSE = + "/System/Library/Frameworks/Accelerate.framework/Versions/A/" * + "Frameworks/vecLib.framework/libSparse.dylib" + +@enum SparseFactorization_t::UInt8 begin + SparseFactorizationCholesky = 0 + SparseFactorizationLDLT = 1 + SparseFactorizationLDLTUnpivoted = 2 + SparseFactorizationLDLTSBK = 3 + SparseFactorizationLDLTTPP = 4 +end + +@enum SparseOrder_t::UInt8 begin + SparseOrderDefault = 0 + SparseOrderUser = 1 + SparseOrderAMD = 2 + SparseOrderMetis = 3 + SparseOrderCOLAMD = 4 +end + +@enum SparseScaling_t::UInt8 begin + SparseScalingDefault = 0 + SparseScalingUser = 1 + SparseScalingEquilibriationInf = 2 +end + +@enum SparseStatus_t::Int32 begin + SparseStatusOk = 0 + SparseStatusFailed = -1 + SparseMatrixIsSingular = -2 + SparseInternalError = -3 + SparseParameterError = -4 + SparseStatusReleased = -2147483647 +end + +@enum SparseControl_t::UInt32 begin + SparseDefaultControl = 0 +end + +# `SparseAttributes_t` is a packed bitfield in C; Julia can't express that +# cleanly, so we model it as `Cuint` and assemble the bits ourselves. +const att_type = Cuint +const ATT_TRANSPOSE = att_type(1) +const ATT_UPPER_TRIANGLE = att_type(0) +const ATT_LOWER_TRIANGLE = att_type(2) +const ATT_ORDINARY = att_type(0) +const ATT_TRIANGULAR = att_type(4) +const ATT_UNIT_TRIANGULAR = att_type(8) +const ATT_SYMMETRIC = att_type(12) + +struct SparseMatrixStructure + rowCount::Cint + columnCount::Cint + columnStarts::Ptr{Clong} + rowIndices::Ptr{Cint} + attributes::att_type + blockSize::UInt8 +end + +struct SparseNumericFactorOptions + control::SparseControl_t + scalingMethod::SparseScaling_t + scaling::Ptr{Cvoid} + pivotTolerance::Float64 + zeroTolerance::Float64 +end + +# Defaults match Apple's SolveImplementation.h. +SparseNumericFactorOptions() = SparseNumericFactorOptions( + SparseDefaultControl, + SparseScalingDefault, + C_NULL, + 0.01, + eps(Cdouble) * 1e-4, +) + +struct SparseSymbolicFactorOptions + control::SparseControl_t + orderMethod::SparseOrder_t + order::Ptr{Cvoid} + ignoreRowsAndColumns::Ptr{Cvoid} + malloc::Ptr{Cvoid} + free::Ptr{Cvoid} + reportError::Ptr{Cvoid} +end + +# `reportError` is fired by libSparse before it returns a failure status. The +# inline `error(unsafe_string(text))` matches AppleAccelerate.jl's pattern: it +# propagates the libSparse message as a Julia exception that unwinds back +# through the originating ccall. Avoids `@error`'s allocator/logger overhead, +# which is fragile when libSparse invokes the callback from its own threads. +# Passing libc malloc/free explicitly (the C_NULL "use Apple defaults" path is +# documented but unreliable from a non-Obj-C caller). +function SparseSymbolicFactorOptions() + return SparseSymbolicFactorOptions( + SparseDefaultControl, + SparseOrderDefault, + C_NULL, + C_NULL, + @cfunction(Libc.malloc, Ptr{Cvoid}, (Csize_t,)), + @cfunction(Libc.free, Cvoid, (Ptr{Cvoid},)), + @cfunction(text -> error(unsafe_string(text)), Cvoid, (Cstring,)), + ) +end + +struct DenseVector_t + count::Cint + data::Ptr{Cdouble} +end + +struct DenseMatrix_t + rowCount::Cint + columnCount::Cint + columnStride::Cint + attributes::att_type + data::Ptr{Cdouble} +end + +struct SparseMatrix_t + structure::SparseMatrixStructure + data::Ptr{Cdouble} +end + +struct SparseOpaqueSymbolicFactorization + status::SparseStatus_t + rowCount::Cint + columnCount::Cint + attributes::att_type + blockSize::UInt8 + type::SparseFactorization_t + factorization::Ptr{Cvoid} + workspaceSize_Float::Csize_t + workspaceSize_Double::Csize_t + factorSize_Float::Csize_t + factorSize_Double::Csize_t +end + +# Placeholder used to initialize `AAFactorCache.symbolic` before the first +# factor. Marked released so any accidental cleanup is a no-op. +function _null_symbolic() + return SparseOpaqueSymbolicFactorization( + SparseStatusReleased, + 0, + 0, + ATT_ORDINARY, + 0, + SparseFactorizationLDLT, + C_NULL, + 0, + 0, + 0, + 0, + ) +end + +struct SparseOpaqueFactorization_t + status::SparseStatus_t + attributes::att_type + symbolicFactorization::SparseOpaqueSymbolicFactorization + userFactorStorage::Bool + numericFactorization::Ptr{Cvoid} + solveWorkspaceRequiredStatic::Csize_t + solveWorkspaceRequiredPerRHS::Csize_t +end + +function _null_factorization() + return SparseOpaqueFactorization_t( + SparseStatusReleased, + ATT_ORDINARY, + _null_symbolic(), + false, + C_NULL, + 0, + 0, + ) +end + +# Build the Apple-side dense views at the ccall boundary. `StridedMatrix`'s +# first-dimension stride must be 1 (the libSparse contract); we assert at the +# call site, not here. +function _dense_matrix(B::StridedMatrix{Cdouble}) + return DenseMatrix_t( + Cint(size(B, 1)), + Cint(size(B, 2)), + Cint(stride(B, 2)), + ATT_ORDINARY, + pointer(B), + ) +end + +function _dense_vector(b::StridedVector{Cdouble}) + return DenseVector_t(Cint(length(b)), pointer(b)) +end + +# --- ccalls ----------------------------------------------------------------- +# +# Mangled symbol names come from the C++ ABI of libSparse. They are stable on +# the system framework and match what AppleAccelerate.jl binds. If Apple +# breaks them in a future macOS release, the failure will be loud (dlopen of +# a missing symbol at first call), which is the behavior we want. + +# Symbolic-only factor: analyzes the pattern, returns an opaque symbolic +# factor. Can back many numeric factors on the same pattern. +function _sparse_symbolic_factor( + ftype::SparseFactorization_t, + structure::SparseMatrixStructure, + sym_opts::SparseSymbolicFactorOptions, +)::SparseOpaqueSymbolicFactorization + return @ccall LIBSPARSE._Z12SparseFactorh21SparseMatrixStructure27SparseSymbolicFactorOptions( + ftype::Cuint, + structure::SparseMatrixStructure, + sym_opts::SparseSymbolicFactorOptions, + )::SparseOpaqueSymbolicFactorization +end + +# Numeric factor on top of an existing symbolic factor. Reusable: the +# symbolic handle is not consumed. +function _sparse_numeric_factor( + symbolic::SparseOpaqueSymbolicFactorization, + matrix::SparseMatrix_t, + num_opts::SparseNumericFactorOptions, +)::SparseOpaqueFactorization_t + return @ccall LIBSPARSE._Z12SparseFactor33SparseOpaqueSymbolicFactorization19SparseMatrix_Double26SparseNumericFactorOptions( + symbolic::SparseOpaqueSymbolicFactorization, + matrix::SparseMatrix_t, + num_opts::SparseNumericFactorOptions, + )::SparseOpaqueFactorization_t +end + +# In-place solve `A · X = B`, where B is a column-major dense matrix and X +# overwrites B's storage. +function _sparse_solve_matrix!( + factor::SparseOpaqueFactorization_t, + B::DenseMatrix_t, +) + @ccall LIBSPARSE._Z11SparseSolve32SparseOpaqueFactorization_Double18DenseMatrix_Double( + factor::SparseOpaqueFactorization_t, + B::DenseMatrix_t, + )::Cvoid + return nothing +end + +function _sparse_solve_vector!( + factor::SparseOpaqueFactorization_t, + b::DenseVector_t, +) + @ccall LIBSPARSE._Z11SparseSolve32SparseOpaqueFactorization_Double18DenseVector_Double( + factor::SparseOpaqueFactorization_t, + b::DenseVector_t, + )::Cvoid + return nothing +end + +# `Y = A · X`, dense multi-column. `Y` must be allocated to (rowCount, ncols) +# by the caller. libSparse overwrites — does not accumulate. +function _sparse_multiply_matrix!( + A::SparseMatrix_t, + X::DenseMatrix_t, + Y::DenseMatrix_t, +) + @ccall LIBSPARSE._Z14SparseMultiply19SparseMatrix_Double18DenseMatrix_DoubleS0_( + A::SparseMatrix_t, + X::DenseMatrix_t, + Y::DenseMatrix_t, + )::Cvoid + return nothing +end + +function _sparse_multiply_vector!( + A::SparseMatrix_t, + x::DenseVector_t, + y::DenseVector_t, +) + @ccall LIBSPARSE._Z14SparseMultiply19SparseMatrix_Double18DenseVector_DoubleS0_( + A::SparseMatrix_t, + x::DenseVector_t, + y::DenseVector_t, + )::Cvoid + return nothing +end + +# Frees the libSparse-side numeric / symbolic storage attached to an opaque +# factor. Idempotent: a second call with a `SparseStatusReleased` handle is +# a no-op on libSparse's side. +function _sparse_cleanup_factor!(factor::SparseOpaqueFactorization_t) + @ccall LIBSPARSE._Z13SparseCleanup32SparseOpaqueFactorization_Double( + factor::SparseOpaqueFactorization_t, + )::Cvoid + return nothing +end + +function _sparse_cleanup_symbolic!(symbolic::SparseOpaqueSymbolicFactorization) + @ccall LIBSPARSE._Z13SparseCleanup33SparseOpaqueSymbolicFactorization( + symbolic::SparseOpaqueSymbolicFactorization, + )::Cvoid + return nothing +end + +# Translate libSparse status codes into Julia exceptions. Singular and +# parameter-error are the most common; the rest fall through to a generic +# `error`. +function _libsparse_throw(status::SparseStatus_t, op::AbstractString) + status == SparseMatrixIsSingular && + throw(LinearAlgebra.SingularException(0)) + status == SparseParameterError && + throw(ArgumentError("libSparse $(op) failed: parameter error")) + status == SparseInternalError && + error("libSparse $(op) failed: internal error") + status == SparseStatusFailed && + error("libSparse $(op) failed") + return error("libSparse $(op) failed: status=$(Int(status))") +end diff --git a/src/AccelerateWrapper/solve_dense.jl b/src/AccelerateWrapper/solve_dense.jl new file mode 100644 index 000000000..dbf09d3a4 --- /dev/null +++ b/src/AccelerateWrapper/solve_dense.jl @@ -0,0 +1,40 @@ +""" + solve!(cache, B) -> B + +Solve `A · X = B` in place, dispatching on the shape of `B`: + + - `B::StridedMatrix{Cdouble}`: multiple right-hand sides handled in a single + libSparse call; `size(B, 1)` must equal `cache.n`. + - `B::StridedVector{Cdouble}`: single right-hand side; `length(B)` must + equal `cache.n`. + +Both overloads require `B` to have unit stride in the first dimension and +the cache to be factored (`is_factored(cache) == true`). +""" +function solve!(cache::AAFactorCache, B::StridedMatrix{Cdouble}) + is_factored(cache) || error("AAFactorCache: not factored yet.") + n = cache.n + size(B, 1) == n || + throw(DimensionMismatch("size(B, 1) = $(size(B, 1)), cache n = $(n)")) + stride(B, 1) == 1 || + throw(ArgumentError("B must have unit stride in the first dimension.")) + size(B, 2) == 0 && return B + _sparse_solve_matrix!(cache.numeric, _dense_matrix(B)) + return B +end + +function solve!(cache::AAFactorCache, b::StridedVector{Cdouble}) + is_factored(cache) || error("AAFactorCache: not factored yet.") + n = cache.n + length(b) == n || throw(DimensionMismatch("length(b) = $(length(b)), cache n = $(n)")) + stride(b, 1) == 1 || throw(ArgumentError("b must have unit stride.")) + _sparse_solve_vector!(cache.numeric, _dense_vector(b)) + return b +end + +""" + \\(cache::AAFactorCache, B) -> X + +Allocating solve, mirroring `LinearAlgebra.Factorization`'s API. +""" +Base.:\(cache::AAFactorCache, B::StridedVecOrMat{Cdouble}) = solve!(cache, copy(B)) diff --git a/src/AccelerateWrapper/solve_sparse_rhs.jl b/src/AccelerateWrapper/solve_sparse_rhs.jl new file mode 100644 index 000000000..61edf9772 --- /dev/null +++ b/src/AccelerateWrapper/solve_sparse_rhs.jl @@ -0,0 +1,97 @@ +const SPARSE_RHS_DEFAULT_BLOCK = 64 + +""" + solve_sparse!(cache, B, out; block=$(SPARSE_RHS_DEFAULT_BLOCK)) -> out + +Solve `A · X = B` for a `SparseMatrixCSC` right-hand side, writing the +result into `out`. Empty columns of `B` are not solved — `out`'s +corresponding columns are zeroed. Non-empty columns within each chunk of +`block` consecutive RHS columns are packed into a dense scratch and solved +in a single libSparse call. + +The `block` chunk size bounds the working set so that processing an +`n × nrhs` sparse RHS requires only `O(n · block)` extra memory regardless +of `nrhs`. The cache reuses its packing buffer across calls; warm calls +allocate nothing in the solver. + +Not thread-safe (mutates per-cache scratch). +""" +function solve_sparse!( + cache::AAFactorCache, + B::SparseMatrixCSC{<:Number, Int}, + out::AbstractMatrix{Cdouble}; + block::Int = SPARSE_RHS_DEFAULT_BLOCK, +) + is_factored(cache) || error("AAFactorCache: not factored yet.") + block >= 1 || throw(ArgumentError("block must be >= 1; got $(block)")) + n = cache.n + size(B, 1) == n || throw(DimensionMismatch( + "size(B, 1) = $(size(B, 1)), cache n = $(n)", + )) + size(out, 1) == n && size(out, 2) == size(B, 2) || throw(DimensionMismatch( + "out has size $(size(out)); expected $((n, size(B, 2))).", + )) + + nb = size(B, 2) + nb == 0 && return out + fill!(out, zero(Cdouble)) + + Browval = rowvals(B) + Bnzval = nonzeros(B) + + _ensure_scratch!(cache, block) + scratch = cache.scratch + col_map = cache.col_map + + j_start = 1 + @inbounds while j_start <= nb + j_end = min(j_start + block - 1, nb) + + npack = 0 + for j in j_start:j_end + rng = nzrange(B, j) + isempty(rng) && continue + npack += 1 + col_map[npack] = j + fill!(view(scratch, :, npack), zero(Cdouble)) + for p in rng + scratch[Browval[p], npack] = Bnzval[p] + end + end + + if npack > 0 + # Build a DenseMatrix view over scratch[:, 1:npack]. Column stride + # is `size(scratch, 1) = n` because scratch is column-major and + # full-height. + dm = DenseMatrix_t( + Cint(n), + Cint(npack), + Cint(size(scratch, 1)), + ATT_ORDINARY, + pointer(scratch), + ) + _sparse_solve_matrix!(cache.numeric, dm) + + for k in 1:npack + copyto!(view(out, :, col_map[k]), view(scratch, :, k)) + end + end + + j_start = j_end + 1 + end + return out +end + +"""Allocating wrapper around `solve_sparse!`.""" +function solve_sparse( + cache::AAFactorCache, + B::SparseMatrixCSC{<:Number, Int}; + block::Int = SPARSE_RHS_DEFAULT_BLOCK, +) + return solve_sparse!( + cache, + B, + Matrix{Cdouble}(undef, cache.n, size(B, 2)); + block = block, + ) +end diff --git a/src/AccelerateWrapper/spmm.jl b/src/AccelerateWrapper/spmm.jl new file mode 100644 index 000000000..e1c5375ca --- /dev/null +++ b/src/AccelerateWrapper/spmm.jl @@ -0,0 +1,105 @@ +# SparseMultiply bindings. These do not require a factorization cache; they +# take a `SparseMatrixCSC{Float64, Int}` directly and build the Apple-side +# `SparseMatrix_t` view at the ccall boundary. Per call this allocates two +# transient arrays (0-based `Clong[]` colptr, narrowed `Cint[]` rowval); if +# a future hot loop needs allocation-free SpMM, lift these into a dedicated +# `AASparseView` cache type. + +""" + aa_spmm!(Y, A, X) -> Y + +Compute `Y ← A · X` in place, where `A` is a `SparseMatrixCSC{Float64, Int}`, +`X` and `Y` are `StridedMatrix{Float64}` of compatible shape. libSparse +overwrites `Y` (does not accumulate). +""" +function aa_spmm!( + Y::StridedMatrix{Cdouble}, + A::SparseMatrixCSC{Float64, Int}, + X::StridedMatrix{Cdouble}, +) + size(A, 2) == size(X, 1) || throw( + DimensionMismatch( + "A is $(size(A)), X is $(size(X)); inner dimensions must match.", + ), + ) + size(Y, 1) == size(A, 1) && size(Y, 2) == size(X, 2) || throw( + DimensionMismatch( + "Y has size $(size(Y)); expected $((size(A, 1), size(X, 2))).", + ), + ) + stride(X, 1) == 1 || throw(ArgumentError("X must have unit stride in dim 1.")) + stride(Y, 1) == 1 || throw(ArgumentError("Y must have unit stride in dim 1.")) + size(X, 2) == 0 && return Y + cs, ri = _csc_to_apple(A) + sp = SparseMatrix_t( + SparseMatrixStructure( + Cint(size(A, 1)), + Cint(size(A, 2)), + pointer(cs), + pointer(ri), + ATT_ORDINARY, + UInt8(1), + ), + pointer(nonzeros(A)), + ) + GC.@preserve cs ri A X Y _sparse_multiply_matrix!( + sp, + _dense_matrix(X), + _dense_matrix(Y), + ) + return Y +end + +""" + aa_spmv!(y, A, x) -> y + +Compute `y ← A · x` in place. +""" +function aa_spmv!( + y::StridedVector{Cdouble}, + A::SparseMatrixCSC{Float64, Int}, + x::StridedVector{Cdouble}, +) + size(A, 2) == length(x) || throw(DimensionMismatch( + "A is $(size(A)), length(x) = $(length(x)).", + )) + length(y) == size(A, 1) || throw(DimensionMismatch( + "length(y) = $(length(y)); expected $(size(A, 1)).", + )) + stride(x, 1) == 1 || throw(ArgumentError("x must have unit stride.")) + stride(y, 1) == 1 || throw(ArgumentError("y must have unit stride.")) + cs, ri = _csc_to_apple(A) + sp = SparseMatrix_t( + SparseMatrixStructure( + Cint(size(A, 1)), + Cint(size(A, 2)), + pointer(cs), + pointer(ri), + ATT_ORDINARY, + UInt8(1), + ), + pointer(nonzeros(A)), + ) + GC.@preserve cs ri A x y _sparse_multiply_vector!( + sp, + _dense_vector(x), + _dense_vector(y), + ) + return y +end + +# Convert CSC's 1-based `colptr::Vector{Int}` and `rowval::Vector{Int}` into +# Apple's 0-based `Clong[]` colstarts and narrowed `Cint[]` rowindices. +function _csc_to_apple(A::SparseMatrixCSC{Float64, Int}) + cp = getcolptr(A) + rv = rowvals(A) + cs = Vector{Clong}(undef, length(cp)) + ri = Vector{Cint}(undef, length(rv)) + @inbounds for k in eachindex(cp) + cs[k] = Clong(cp[k] - 1) + end + @inbounds for k in eachindex(rv) + ri[k] = Cint(rv[k] - 1) + end + return cs, ri +end diff --git a/src/BA_ABA_matrices.jl b/src/BA_ABA_matrices.jl index 0cec17048..7902317a3 100644 --- a/src/BA_ABA_matrices.jl +++ b/src/BA_ABA_matrices.jl @@ -179,7 +179,7 @@ power flow analysis, sensitivity calculations, and linear power system studies. struct ABA_Matrix{ Ax <: NTuple{2, Vector}, L <: NTuple{2, Dict}, - F <: Union{Nothing, KLULinSolveCache{Float64}}, + F <: Union{Nothing, KLULinSolveCache{Float64, Int64}}, } <: PowerNetworkMatrix{Float64} data::SparseArrays.SparseMatrixCSC{Float64, Int} axes::Ax @@ -358,7 +358,7 @@ Check if an ABA_Matrix has been factorized (i.e., contains LU factorization matr """ is_factorized(ABA::ABA_Matrix{Ax, L, Nothing}) where {Ax, L <: NTuple{2, Dict}} = false is_factorized( - ABA::ABA_Matrix{Ax, L, <:KLULinSolveCache{Float64}}, + ABA::ABA_Matrix{Ax, L, <:KLULinSolveCache{Float64, Int64}}, ) where {Ax, L <: NTuple{2, Dict}} = true # get_index functions: BA_Matrix stores the transposed matrix, thus get index diff --git a/src/KLUWrapper/KLUWrapper.jl b/src/KLUWrapper/KLUWrapper.jl index 5e996dfdf..7d50b5914 100644 --- a/src/KLUWrapper/KLUWrapper.jl +++ b/src/KLUWrapper/KLUWrapper.jl @@ -74,8 +74,12 @@ export KLULinSolveCache, tsolve!, solve_sparse!, solve_sparse, + sort_factors!, + condest!, + rcond!, n_valid, - is_factored + is_factored, + get_reuse_symbolic include("klu_jll_bindings.jl") include("klu_cache.jl") diff --git a/src/KLUWrapper/klu_cache.jl b/src/KLUWrapper/klu_cache.jl index 89d519980..6318e3050 100644 --- a/src/KLUWrapper/klu_cache.jl +++ b/src/KLUWrapper/klu_cache.jl @@ -5,37 +5,55 @@ A cached KLU linear solver designed for repeated solves against the same sparse matrix structure. `numeric_refactor!` and `solve!` allocate nothing once the cache is built. -The type parameter `Tv ∈ {Float64, ComplexF64}` selects the real/complex KLU -path. Indices are always `Int64` (SuiteSparse_long). +Type parameters: +- `Tv ∈ {Float64, ComplexF64}` selects the real/complex KLU path + (`klu_*_factor`/`klu_z*_factor`). +- `Ti ∈ {Int32, Int64}` selects the index-type entry-point family + (`klu_*` for `int`/Int32, `klu_l_*` for `SuiteSparse_long`/Int64). + The cache's `colptr`/`rowval`/`col_map` are stored in this type. `reuse_symbolic` controls whether `symbolic_refactor!` keeps the analysis; `check_pattern` adds a structural-equality check on refactor calls and is only consulted when reusing. """ -mutable struct KLULinSolveCache{Tv <: Union{Float64, ComplexF64}} - colptr::Vector{Int64} - rowval::Vector{Int64} +mutable struct KLULinSolveCache{ + Tv <: Union{Float64, ComplexF64}, + Ti <: Union{Int32, Int64}, +} + colptr::Vector{Ti} + rowval::Vector{Ti} # Copy of the matrix values used in the most recent numeric factorization. # Lets `_recover_factorization!` rebuild a corrupted numeric handle without # the caller having to re-supply `A`. Empty before the first factor call. nzval::Vector{Tv} - common::Base.RefValue{KluLCommon} - symbolic::SymbolicPtr - numeric::NumericPtr + # `Base.RefValue{KluLCommon}` for Int64 or `Base.RefValue{KluCommon}` for + # Int32 — we keep the field untyped here because Julia's untyped + # `Base.RefValue` lookup is fast enough (the dispatch helpers recover the + # concrete type at each callsite via `_common_type(Ti)`) and avoiding a + # third type parameter keeps the surface tidy. + common::Base.RefValue + # Opaque Symbolic/Numeric pointers. `Ptr{Cvoid}` is safe: each callsite + # threads through the typed `SymbolicPtr` / `SymbolicPtr32` (resp. + # Numeric) at the ccall boundary; on the Julia side the values are + # treated as black boxes by every consumer. + symbolic::Ptr{Cvoid} + numeric::Ptr{Cvoid} reuse_symbolic::Bool check_pattern::Bool # Bounded reusable scratch for `solve_sparse!`. Lazy-grown on first call so # the wrapper's working set stays O(n*block) instead of O(n*nrhs); see # `solve_sparse_rhs.jl`. scratch::Matrix{Tv} - col_map::Vector{Int64} + col_map::Vector{Ti} end -@inline _dim(cache::KLULinSolveCache) = Int64(length(cache.colptr) - 1) +@inline _dim(cache::KLULinSolveCache{Tv, Ti}) where {Tv, Ti} = + Ti(length(cache.colptr) - 1) -Base.size(cache::KLULinSolveCache) = (n = _dim(cache); (n, n)) -Base.size(cache::KLULinSolveCache, d::Integer) = d <= 2 ? _dim(cache) : 1 -Base.eltype(::Type{KLULinSolveCache{Tv}}) where {Tv} = Tv +Base.size(cache::KLULinSolveCache) = (n = Int(_dim(cache)); (n, n)) +Base.size(cache::KLULinSolveCache, d::Integer) = + d <= 2 ? Int(_dim(cache)) : 1 +Base.eltype(::Type{KLULinSolveCache{Tv, Ti}}) where {Tv, Ti} = Tv get_reuse_symbolic(cache::KLULinSolveCache) = cache.reuse_symbolic """ @@ -53,37 +71,363 @@ is_factored(cache::KLULinSolveCache) = # truthy form. Kept around for tests that want a uniform numeric reading. n_valid(cache::KLULinSolveCache) = is_factored(cache) ? 1 : 0 -@inline _factor_call(::Type{Float64}, ap, ai, ax, sym, common) = - klu_l_factor(ap, ai, ax, sym, common) -@inline _factor_call(::Type{ComplexF64}, ap, ai, ax, sym, common) = - klu_zl_factor(ap, ai, ax, sym, common) +# --------------------------------------------------------------------------- +# Type-paired dispatch helpers — (Tv, Ti) → libklu entry point +# --------------------------------------------------------------------------- + +# Map Ti to its concrete `klu_common` struct type. +@inline _common_type(::Type{Int32}) = KluCommon +@inline _common_type(::Type{Int64}) = KluLCommon + +# `klu_defaults` initializer. +@inline _defaults!(::Type{Int32}, common::Ref) = klu_defaults!(common) +@inline _defaults!(::Type{Int64}, common::Ref) = klu_l_defaults!(common) + +# `klu_analyze` returns the symbolic handle. n is widened to `Ti` so the +# C argument width matches. +@inline _analyze_call(::Type{Int32}, n, ap, ai, common) = + klu_analyze(Cint(n), ap, ai, common) +@inline _analyze_call(::Type{Int64}, n, ap, ai, common) = + klu_l_analyze(Int64(n), ap, ai, common) + +# `klu_free_symbolic` — takes the opaque pointer ref and the common ref. +# `sym_ref` is a `Ref{Ptr{Cvoid}}` on the Julia side; we reinterpret it +# to the typed `SymbolicPtr` / `SymbolicPtr32` at the ccall site so libklu +# sees the right pointer width. +@inline function _free_symbolic!(::Type{Int32}, sym_ref::Ref{Ptr{Cvoid}}, common::Ref) + typed = Ref(reinterpret(SymbolicPtr32, sym_ref[])) + klu_free_symbolic!(typed, common) + sym_ref[] = reinterpret(Ptr{Cvoid}, typed[]) + return nothing +end +@inline function _free_symbolic!(::Type{Int64}, sym_ref::Ref{Ptr{Cvoid}}, common::Ref) + typed = Ref(reinterpret(SymbolicPtr, sym_ref[])) + klu_l_free_symbolic!(typed, common) + sym_ref[] = reinterpret(Ptr{Cvoid}, typed[]) + return nothing +end -@inline _refactor_call(::Type{Float64}, ap, ai, ax, sym, num, common) = - klu_l_refactor(ap, ai, ax, sym, num, common) -@inline _refactor_call(::Type{ComplexF64}, ap, ai, ax, sym, num, common) = - klu_zl_refactor(ap, ai, ax, sym, num, common) +# `klu_factor` — returns the numeric handle as a typed pointer; we +# reinterpret to `Ptr{Cvoid}` for storage. Dispatch on both Tv and Ti. +@inline function _factor_call(::Type{Float64}, ::Type{Int32}, ap, ai, ax, sym, common) + return reinterpret( + Ptr{Cvoid}, + klu_factor(ap, ai, ax, reinterpret(SymbolicPtr32, sym), common), + ) +end +@inline function _factor_call(::Type{Float64}, ::Type{Int64}, ap, ai, ax, sym, common) + return reinterpret( + Ptr{Cvoid}, + klu_l_factor(ap, ai, ax, reinterpret(SymbolicPtr, sym), common), + ) +end +@inline function _factor_call( + ::Type{ComplexF64}, + ::Type{Int32}, + ap, + ai, + ax, + sym, + common, +) + return reinterpret( + Ptr{Cvoid}, + klu_z_factor(ap, ai, ax, reinterpret(SymbolicPtr32, sym), common), + ) +end +@inline function _factor_call( + ::Type{ComplexF64}, + ::Type{Int64}, + ap, + ai, + ax, + sym, + common, +) + return reinterpret( + Ptr{Cvoid}, + klu_zl_factor(ap, ai, ax, reinterpret(SymbolicPtr, sym), common), + ) +end -@inline _solve_call(::Type{Float64}, sym, num, n, nrhs, b, common) = - klu_l_solve(sym, num, n, nrhs, b, common) -@inline _solve_call(::Type{ComplexF64}, sym, num, n, nrhs, b, common) = - klu_zl_solve(sym, num, n, nrhs, b, common) +@inline function _refactor_call( + ::Type{Float64}, + ::Type{Int32}, + ap, + ai, + ax, + sym, + num, + common, +) + return klu_refactor( + ap, ai, ax, + reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), + common, + ) +end +@inline function _refactor_call( + ::Type{Float64}, + ::Type{Int64}, + ap, + ai, + ax, + sym, + num, + common, +) + return klu_l_refactor( + ap, ai, ax, + reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), + common, + ) +end +@inline function _refactor_call( + ::Type{ComplexF64}, + ::Type{Int32}, + ap, + ai, + ax, + sym, + num, + common, +) + return klu_z_refactor( + ap, ai, ax, + reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), + common, + ) +end +@inline function _refactor_call( + ::Type{ComplexF64}, + ::Type{Int64}, + ap, + ai, + ax, + sym, + num, + common, +) + return klu_zl_refactor( + ap, ai, ax, + reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), + common, + ) +end -@inline _tsolve_call(::Type{Float64}, sym, num, n, nrhs, b, common; conjugate = false) = - klu_l_tsolve(sym, num, n, nrhs, b, common) -@inline _tsolve_call(::Type{ComplexF64}, sym, num, n, nrhs, b, common; conjugate = false) = - klu_zl_tsolve(sym, num, n, nrhs, b, Cint(conjugate), common) +@inline function _solve_call( + ::Type{Float64}, + ::Type{Int32}, + sym, + num, + n, + nrhs, + b, + common, +) + return klu_solve( + reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), + Cint(n), Cint(nrhs), b, common, + ) +end +@inline function _solve_call( + ::Type{Float64}, + ::Type{Int64}, + sym, + num, + n, + nrhs, + b, + common, +) + return klu_l_solve( + reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), + Int64(n), Int64(nrhs), b, common, + ) +end +@inline function _solve_call( + ::Type{ComplexF64}, + ::Type{Int32}, + sym, + num, + n, + nrhs, + b, + common, +) + return klu_z_solve( + reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), + Cint(n), Cint(nrhs), b, common, + ) +end +@inline function _solve_call( + ::Type{ComplexF64}, + ::Type{Int64}, + sym, + num, + n, + nrhs, + b, + common, +) + return klu_zl_solve( + reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), + Int64(n), Int64(nrhs), b, common, + ) +end -@inline _free_numeric!(::Type{Float64}, num_ref, common) = - klu_l_free_numeric!(num_ref, common) -@inline _free_numeric!(::Type{ComplexF64}, num_ref, common) = - klu_zl_free_numeric!(num_ref, common) +@inline function _tsolve_call( + ::Type{Float64}, + ::Type{Int32}, + sym, + num, + n, + nrhs, + b, + common; + conjugate::Bool = false, +) + return klu_tsolve( + reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), + Cint(n), Cint(nrhs), b, common, + ) +end +@inline function _tsolve_call( + ::Type{Float64}, + ::Type{Int64}, + sym, + num, + n, + nrhs, + b, + common; + conjugate::Bool = false, +) + return klu_l_tsolve( + reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), + Int64(n), Int64(nrhs), b, common, + ) +end +@inline function _tsolve_call( + ::Type{ComplexF64}, + ::Type{Int32}, + sym, + num, + n, + nrhs, + b, + common; + conjugate::Bool = false, +) + return klu_z_tsolve( + reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), + Cint(n), Cint(nrhs), b, Cint(conjugate), common, + ) +end +@inline function _tsolve_call( + ::Type{ComplexF64}, + ::Type{Int64}, + sym, + num, + n, + nrhs, + b, + common; + conjugate::Bool = false, +) + return klu_zl_tsolve( + reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), + Int64(n), Int64(nrhs), b, Cint(conjugate), common, + ) +end + +@inline function _free_numeric!( + ::Type{Float64}, + ::Type{Int32}, + num_ref::Ref{Ptr{Cvoid}}, + common::Ref, +) + typed = Ref(reinterpret(NumericPtr32, num_ref[])) + klu_free_numeric!(typed, common) + num_ref[] = reinterpret(Ptr{Cvoid}, typed[]) + return nothing +end +@inline function _free_numeric!( + ::Type{Float64}, + ::Type{Int64}, + num_ref::Ref{Ptr{Cvoid}}, + common::Ref, +) + typed = Ref(reinterpret(NumericPtr, num_ref[])) + klu_l_free_numeric!(typed, common) + num_ref[] = reinterpret(Ptr{Cvoid}, typed[]) + return nothing +end +@inline function _free_numeric!( + ::Type{ComplexF64}, + ::Type{Int32}, + num_ref::Ref{Ptr{Cvoid}}, + common::Ref, +) + typed = Ref(reinterpret(NumericPtr32, num_ref[])) + klu_z_free_numeric!(typed, common) + num_ref[] = reinterpret(Ptr{Cvoid}, typed[]) + return nothing +end +@inline function _free_numeric!( + ::Type{ComplexF64}, + ::Type{Int64}, + num_ref::Ref{Ptr{Cvoid}}, + common::Ref, +) + typed = Ref(reinterpret(NumericPtr, num_ref[])) + klu_zl_free_numeric!(typed, common) + num_ref[] = reinterpret(Ptr{Cvoid}, typed[]) + return nothing +end + +# Performance-knob dispatchers — Float64 only. libklu exposes these in the +# ComplexF64 build too; we'll add bindings if a consumer asks. +@inline _sort_call(::Type{Int32}, sym, num, common) = + klu_sort(reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), common) +@inline _sort_call(::Type{Int64}, sym, num, common) = + klu_l_sort(reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), common) + +@inline _condest_call(::Type{Int32}, ap, ax, sym, num, common) = + klu_condest( + ap, + ax, + reinterpret(SymbolicPtr32, sym), + reinterpret(NumericPtr32, num), + common, + ) +@inline _condest_call(::Type{Int64}, ap, ax, sym, num, common) = + klu_l_condest( + ap, + ax, + reinterpret(SymbolicPtr, sym), + reinterpret(NumericPtr, num), + common, + ) + +@inline _rcond_call(::Type{Int32}, sym, num, common) = + klu_rcond(reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), common) +@inline _rcond_call(::Type{Int64}, sym, num, common) = + klu_l_rcond(reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), common) + +# --------------------------------------------------------------------------- +# Constructor +# --------------------------------------------------------------------------- """ KLULinSolveCache(A; reuse_symbolic=true, check_pattern=true) -Build a cache for the sparse matrix `A`. Allocates structural arrays and -runs `klu_l_defaults`, but does **not** factorize. Call `full_factor!` -(or `symbolic_factor!` followed by `numeric_refactor!`) before `solve!`. +Build a cache for the sparse matrix `A`. The cache's index type is taken +from `A`: `SparseMatrixCSC{Tv, Int32}` ⇒ `KLULinSolveCache{Tv, Int32}`, +`SparseMatrixCSC{Tv, Int64}` ⇒ `KLULinSolveCache{Tv, Int64}`. Allocates +structural arrays and runs the corresponding `klu_defaults`/`klu_l_defaults` +initializer, but does **not** factorize. Call `full_factor!` (or +`symbolic_factor!` followed by `numeric_refactor!`) before `solve!`. A finalizer frees libklu handles on GC; call `Base.finalize(cache)` to release them eagerly. Releasing the handles leaves Julia-side state intact, @@ -91,34 +435,30 @@ so the cache can be re-factorized via `symbolic_factor!`/`numeric_refactor!` or `full_factor!`. """ function KLULinSolveCache( - A::SparseMatrixCSC{Tv, Int}; + A::SparseMatrixCSC{Tv, Ti}; reuse_symbolic::Bool = true, check_pattern::Bool = true, -) where {Tv <: Union{Float64, ComplexF64}} - Int === Int64 || error( - "KLULinSolveCache requires a 64-bit Julia build (SuiteSparse_long " * - "bindings need Int64 indices). Got Int = $(Int).", - ) +) where {Tv <: Union{Float64, ComplexF64}, Ti <: Union{Int32, Int64}} n = size(A, 1) - n == size(A, 2) || throw(DimensionMismatch("matrix must be square; got $(size(A))")) + n == size(A, 2) || + throw(DimensionMismatch("matrix must be square; got $(size(A))")) - common = Ref(KluLCommon()) - klu_l_defaults!(common) + common = Ref(_common_type(Ti)()) + _defaults!(Ti, common) - colptr = Vector{Int64}(undef, length(getcolptr(A))) + colptr = Vector{Ti}(undef, length(getcolptr(A))) copyto!(colptr, getcolptr(A)) - colptr .-= 1 - rowval = Vector{Int64}(undef, length(rowvals(A))) + colptr .-= one(Ti) + rowval = Vector{Ti}(undef, length(rowvals(A))) copyto!(rowval, rowvals(A)) - rowval .-= 1 + rowval .-= one(Ti) - cache = KLULinSolveCache{Tv}( + cache = KLULinSolveCache{Tv, Ti}( colptr, rowval, Tv[], common, - convert(SymbolicPtr, C_NULL), - convert(NumericPtr, C_NULL), + Ptr{Cvoid}(C_NULL), Ptr{Cvoid}(C_NULL), reuse_symbolic, check_pattern, Matrix{Tv}(undef, 0, 0), - Int64[], + Ti[], ) finalizer(_free_klu_handles!, cache) return cache @@ -131,10 +471,10 @@ Ensure `cache.scratch` is at least `n × block` and `cache.col_map` length `block`. Grows in place; reuses across `solve_sparse!` calls. """ @inline function _ensure_scratch!( - cache::KLULinSolveCache{Tv}, + cache::KLULinSolveCache{Tv, Ti}, block::Int, -) where {Tv <: Union{Float64, ComplexF64}} - n = _dim(cache) +) where {Tv, Ti} + n = Int(_dim(cache)) s = cache.scratch if size(s, 1) != n || size(s, 2) < block cache.scratch = Matrix{Tv}(undef, n, block) @@ -153,16 +493,16 @@ second call hits the `C_NULL` guards. Used both by `symbolic_factor!` mid-life (drop old handles before re-analyzing) and by the GC finalizer. """ function _free_klu_handles!( - cache::KLULinSolveCache{Tv}, -) where {Tv <: Union{Float64, ComplexF64}} + cache::KLULinSolveCache{Tv, Ti}, +) where {Tv, Ti} if cache.numeric != C_NULL num_ref = Ref(cache.numeric) - _free_numeric!(Tv, num_ref, cache.common) + _free_numeric!(Tv, Ti, num_ref, cache.common) cache.numeric = num_ref[] end if cache.symbolic != C_NULL sym_ref = Ref(cache.symbolic) - klu_l_free_symbolic!(sym_ref, cache.common) + _free_symbolic!(Ti, sym_ref, cache.common) cache.symbolic = sym_ref[] end return nothing @@ -180,8 +520,8 @@ scope. Requires that a numeric factor has been built before (so `cache.nzval` is populated) and the symbolic factor is still valid. """ function _recover_factorization!( - cache::KLULinSolveCache{Tv}, -) where {Tv <: Union{Float64, ComplexF64}} + cache::KLULinSolveCache{Tv, Ti}, +) where {Tv, Ti} cache.symbolic == C_NULL && error( "KLULinSolveCache: cannot recover without a symbolic factor.", ) @@ -190,11 +530,12 @@ function _recover_factorization!( ) if cache.numeric != C_NULL num_ref = Ref(cache.numeric) - _free_numeric!(Tv, num_ref, cache.common) + _free_numeric!(Tv, Ti, num_ref, cache.common) cache.numeric = num_ref[] end num = _factor_call( - Tv, pointer(cache.colptr), pointer(cache.rowval), + Tv, Ti, + pointer(cache.colptr), pointer(cache.rowval), pointer(cache.nzval), cache.symbolic, cache.common, ) num == C_NULL && klu_throw(cache.common[], "klu_factor (recovery)") @@ -202,8 +543,11 @@ function _recover_factorization!( return cache end -@inline function _check_pattern_match(cache::KLULinSolveCache, - A::SparseMatrixCSC, op::AbstractString) +@inline function _check_pattern_match( + cache::KLULinSolveCache{Tv, Ti}, + A::SparseMatrixCSC, + op::AbstractString, +) where {Tv, Ti} Acolptr = getcolptr(A) Arowval = rowvals(A) if length(Acolptr) != length(cache.colptr) || @@ -217,13 +561,13 @@ end # Increment-compare-decrement: avoids allocating a 1-indexed copy. The # `try/finally` restores `colptr`/`rowval` even on InterruptException so # the cache is never left with off-by-one structural arrays. - cache.colptr .+= 1 - cache.rowval .+= 1 + cache.colptr .+= one(Ti) + cache.rowval .+= one(Ti) bad = try (cache.colptr != Acolptr) || (cache.rowval != Arowval) finally - cache.colptr .-= 1 - cache.rowval .-= 1 + cache.colptr .-= one(Ti) + cache.rowval .-= one(Ti) end if bad throw(ArgumentError( @@ -237,15 +581,19 @@ end symbolic_factor!(cache, A) Free any cached symbolic/numeric factor, replace the structural arrays with -`A`'s pattern, and run `klu_l_analyze`. +`A`'s pattern, and run `klu_analyze` / `klu_l_analyze`. """ -function symbolic_factor!(cache::KLULinSolveCache{Tv}, - A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}} +function symbolic_factor!( + cache::KLULinSolveCache{Tv, Ti}, + A::SparseMatrixCSC{Tv, Ti}, +) where {Tv, Ti} n = _dim(cache) - if size(A, 1) != n || size(A, 2) != n - throw(DimensionMismatch( - "Cannot factor: cache is $(n)×$(n) but A is $(size(A)).", - )) + if size(A, 1) != Int(n) || size(A, 2) != Int(n) + throw( + DimensionMismatch( + "Cannot factor: cache is $(Int(n))×$(Int(n)) but A is $(size(A)).", + ), + ) end _free_klu_handles!(cache) @@ -253,16 +601,14 @@ function symbolic_factor!(cache::KLULinSolveCache{Tv}, Arowval = rowvals(A) resize!(cache.colptr, length(Acolptr)) copyto!(cache.colptr, Acolptr) - cache.colptr .-= 1 + cache.colptr .-= one(Ti) resize!(cache.rowval, length(Arowval)) copyto!(cache.rowval, Arowval) - cache.rowval .-= 1 + cache.rowval .-= one(Ti) - sym = klu_l_analyze( - Int64(n), pointer(cache.colptr), pointer(cache.rowval), cache.common, - ) - sym == C_NULL && klu_throw(cache.common[], "klu_l_analyze") - cache.symbolic = sym + sym = _analyze_call(Ti, n, pointer(cache.colptr), pointer(cache.rowval), cache.common) + sym == C_NULL && klu_throw(cache.common[], "klu_analyze") + cache.symbolic = reinterpret(Ptr{Cvoid}, sym) return cache end @@ -272,17 +618,19 @@ end If `cache.reuse_symbolic`, optionally verify the structure matches and reuse the existing analysis. Otherwise, rerun `symbolic_factor!`. """ -function symbolic_refactor!(cache::KLULinSolveCache{Tv}, - A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}} +function symbolic_refactor!( + cache::KLULinSolveCache{Tv, Ti}, + A::SparseMatrixCSC{Tv, Ti}, +) where {Tv, Ti} if !cache.reuse_symbolic return symbolic_factor!(cache, A) end if cache.check_pattern n = _dim(cache) - if size(A, 1) != n || size(A, 2) != n + if size(A, 1) != Int(n) || size(A, 2) != Int(n) throw( DimensionMismatch( - "Cannot refactor: cache is $(n)×$(n) but A is $(size(A)).", + "Cannot refactor: cache is $(Int(n))×$(Int(n)) but A is $(size(A)).", ), ) end @@ -298,15 +646,18 @@ Compute (or refresh) the numeric factorization. The first call after `symbolic_factor!` invokes `klu_*_factor`; subsequent calls invoke `klu_*_refactor` and reuse the existing numeric struct. """ -function numeric_refactor!(cache::KLULinSolveCache{Tv}, - A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}} +function numeric_refactor!( + cache::KLULinSolveCache{Tv, Ti}, + A::SparseMatrixCSC{Tv, Ti}, +) where {Tv, Ti} cache.symbolic == C_NULL && error( "KLULinSolveCache: call symbolic_factor! before numeric_refactor!.", ) Anz = nonzeros(A) if cache.numeric == C_NULL num = _factor_call( - Tv, pointer(cache.colptr), pointer(cache.rowval), + Tv, Ti, + pointer(cache.colptr), pointer(cache.rowval), pointer(Anz), cache.symbolic, cache.common, ) num == C_NULL && klu_throw(cache.common[], "klu_factor") @@ -314,7 +665,8 @@ function numeric_refactor!(cache::KLULinSolveCache{Tv}, else cache.check_pattern && _check_pattern_match(cache, A, "numeric_refactor") ok = _refactor_call( - Tv, pointer(cache.colptr), pointer(cache.rowval), + Tv, Ti, + pointer(cache.colptr), pointer(cache.rowval), pointer(Anz), cache.symbolic, cache.numeric, cache.common, ) ok != 1 && klu_throw(cache.common[], "klu_refactor") @@ -334,8 +686,10 @@ Equivalent to `symbolic_factor!(cache, A); numeric_refactor!(cache, A)`. Use this on a freshly constructed cache, or after `_free_klu_handles!` has cleared the handles, to bring the cache to a factored state. """ -function full_factor!(cache::KLULinSolveCache{Tv}, - A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}} +function full_factor!( + cache::KLULinSolveCache{Tv, Ti}, + A::SparseMatrixCSC{Tv, Ti}, +) where {Tv, Ti} symbolic_factor!(cache, A) numeric_refactor!(cache, A) return cache @@ -351,8 +705,10 @@ the matrix values have changed; if the structure has also changed and the cache was built with `reuse_symbolic = false`, the symbolic analysis is rerun as well. """ -function full_refactor!(cache::KLULinSolveCache{Tv}, - A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}} +function full_refactor!( + cache::KLULinSolveCache{Tv, Ti}, + A::SparseMatrixCSC{Tv, Ti}, +) where {Tv, Ti} symbolic_refactor!(cache, A) numeric_refactor!(cache, A) return cache @@ -363,11 +719,94 @@ end Build a cache for `A` and immediately compute the full factorization. """ -function klu_factorize(A::SparseMatrixCSC{Tv, Int}; +function klu_factorize( + A::SparseMatrixCSC{Tv, Ti}; reuse_symbolic::Bool = true, check_pattern::Bool = true, -) where {Tv <: Union{Float64, ComplexF64}} - cache = KLULinSolveCache(A; - reuse_symbolic = reuse_symbolic, check_pattern = check_pattern) +) where {Tv <: Union{Float64, ComplexF64}, Ti <: Union{Int32, Int64}} + cache = KLULinSolveCache( + A; + reuse_symbolic = reuse_symbolic, + check_pattern = check_pattern, + ) return full_factor!(cache, A) end + +# --------------------------------------------------------------------------- +# Performance / diagnostic surface +# --------------------------------------------------------------------------- + +""" + sort_factors!(cache) -> cache + +Sort the columns of the cached L and U factors in place via libklu's +`klu_sort` / `klu_l_sort`. KLU's numeric phase stores factor columns in +arbitrary order; sorting once after the first factor improves cache locality +on every subsequent `solve!` / `tsolve!`. The cost is `O(nnz_factor)` and +is amortized over many repeated solves — a win whenever the cache is used +for ≥ a few solves on the same factorization. + +Idempotent and `refactor`-stable: sorting after the initial factor persists +through `numeric_refactor!` because refactor preserves the column layout. +Only call this if the cache will be reused for multiple solves; for a +one-shot solve it is pure overhead. + +Float64 only. +""" +function sort_factors!(cache::KLULinSolveCache{Float64, Ti}) where {Ti} + is_factored(cache) || + error("sort_factors!: cache must be factored before sorting.") + ok = _sort_call(Ti, cache.symbolic, cache.numeric, cache.common) + ok != 1 && klu_throw(cache.common[], "klu_sort") + return cache +end + +""" + condest!(cache) -> Float64 + +Compute the 1-norm condition-number estimate of the cached factorization +via libklu's `klu_condest`. The result lands in `cache.common[].condest` +and is also returned. Cost is roughly two extra solves; use sparingly. + +Useful when deciding whether iterative refinement is worth running, or for +flagging near-singular Jacobians in Newton-Raphson loops. + +Float64 only. +""" +function condest!( + cache::KLULinSolveCache{Float64, Ti}, +) where {Ti} + is_factored(cache) || + error("condest!: cache must be factored before condest.") + isempty(cache.nzval) && error( + "condest!: requires a previous numeric_refactor! to have populated nzval.", + ) + ok = _condest_call( + Ti, + pointer(cache.colptr), + pointer(cache.nzval), + cache.symbolic, + cache.numeric, + cache.common, + ) + ok != 1 && klu_throw(cache.common[], "klu_condest") + return Float64(cache.common[].condest) +end + +""" + rcond!(cache) -> Float64 + +Compute the cheap reciprocal-condition estimate +`min(|diag(U)|)/max(|diag(U)|)` via libklu's `klu_rcond`. The result lands +in `cache.common[].rcond` and is also returned. Faster than `condest!` but +less reliable as a conditioning indicator. + +Float64 only. +""" +function rcond!(cache::KLULinSolveCache{Float64, Ti}) where {Ti} + is_factored(cache) || + error("rcond!: cache must be factored before rcond.") + ok = _rcond_call(Ti, cache.symbolic, cache.numeric, cache.common) + ok != 1 && klu_throw(cache.common[], "klu_rcond") + return Float64(cache.common[].rcond) +end diff --git a/src/KLUWrapper/klu_jll_bindings.jl b/src/KLUWrapper/klu_jll_bindings.jl index ff0ed0a92..b525a6572 100644 --- a/src/KLUWrapper/klu_jll_bindings.jl +++ b/src/KLUWrapper/klu_jll_bindings.jl @@ -1,9 +1,32 @@ -# Bindings into libklu (SuiteSparse_jll), restricted to the SuiteSparse_long -# (`klu_l_*` and `klu_zl_*`) entry points used by KLULinSolveCache. +# Bindings into libklu (SuiteSparse_jll), covering both index-type +# entry-point families: +# - `klu_l_*` / `klu_zl_*` for `SuiteSparse_long` (Int64) indices +# - `klu_*` / `klu_z_*` for `int` (Int32) indices +# +# Each ccall is wrapped in `@klu_lock` so that all libklu activity in the +# process serializes through `_LIBKLU_LOCK`. This includes finalizer paths +# (`klu_*_free_*`), which can fire on any thread at any safepoint and would +# otherwise race against in-flight `solve!` calls on a different cache. See +# `_LIBKLU_LOCK` in `KLUWrapper.jl` for the empirical evidence (intermittent +# `KLU_INVALID` return; SEGV at `klu_solve.c:118`). import LinearAlgebra import SuiteSparse_jll: libklu +# --------------------------------------------------------------------------- +# Status codes and shared error helper +# --------------------------------------------------------------------------- + +const KLU_OK = 0 +const KLU_SINGULAR = 1 +const KLU_OUT_OF_MEMORY = -2 +const KLU_INVALID = -3 +const KLU_TOO_LARGE = -4 + +# --------------------------------------------------------------------------- +# Int64 (SuiteSparse_long) family +# --------------------------------------------------------------------------- + # Layout matches `klu_l_common` in upstream `klu.h`. Must stay in sync. mutable struct KluLCommon tol::Cdouble @@ -40,19 +63,15 @@ mutable struct KluLNumeric end const SymbolicPtr = Ptr{KluLSymbolic} const NumericPtr = Ptr{KluLNumeric} -# Each ccall is wrapped in `@klu_lock` so that all libklu activity in the -# process serializes through `_LIBKLU_LOCK`. This includes finalizer paths -# (`klu_l_free_*`, `klu_zl_free_*`), which can fire on any thread at any -# safepoint and would otherwise race against in-flight `solve!` calls on a -# different cache. See `_LIBKLU_LOCK` in `KLUWrapper.jl` for the -# empirical evidence (intermittent `KLU_INVALID` return; SEGV at -# `klu_solve.c:118`). - klu_l_defaults!(common::Ref{KluLCommon}) = @klu_lock ccall((:klu_l_defaults, libklu), Cint, (Ptr{KluLCommon},), common) -function klu_l_analyze(n::Int64, ap::Ptr{Int64}, ai::Ptr{Int64}, - common::Ref{KluLCommon}) +function klu_l_analyze( + n::Int64, + ap::Ptr{Int64}, + ai::Ptr{Int64}, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_l_analyze, libklu), SymbolicPtr, @@ -61,8 +80,10 @@ function klu_l_analyze(n::Int64, ap::Ptr{Int64}, ai::Ptr{Int64}, ) end -function klu_l_free_symbolic!(symbolic_ref::Ref{SymbolicPtr}, - common::Ref{KluLCommon}) +function klu_l_free_symbolic!( + symbolic_ref::Ref{SymbolicPtr}, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_l_free_symbolic, libklu), Cint, @@ -71,8 +92,13 @@ function klu_l_free_symbolic!(symbolic_ref::Ref{SymbolicPtr}, ) end -function klu_l_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{Cdouble}, - symbolic::SymbolicPtr, common::Ref{KluLCommon}) +function klu_l_factor( + ap::Ptr{Int64}, + ai::Ptr{Int64}, + ax::Ptr{Cdouble}, + symbolic::SymbolicPtr, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_l_factor, libklu), NumericPtr, @@ -81,19 +107,37 @@ function klu_l_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{Cdouble}, ) end -function klu_l_refactor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{Cdouble}, - symbolic::SymbolicPtr, numeric::NumericPtr, common::Ref{KluLCommon}) +function klu_l_refactor( + ap::Ptr{Int64}, + ai::Ptr{Int64}, + ax::Ptr{Cdouble}, + symbolic::SymbolicPtr, + numeric::NumericPtr, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_l_refactor, libklu), Cint, - (Ptr{Int64}, Ptr{Int64}, Ptr{Cdouble}, SymbolicPtr, NumericPtr, - Ptr{KluLCommon}), + ( + Ptr{Int64}, + Ptr{Int64}, + Ptr{Cdouble}, + SymbolicPtr, + NumericPtr, + Ptr{KluLCommon}, + ), ap, ai, ax, symbolic, numeric, common, ) end -function klu_l_solve(symbolic::SymbolicPtr, numeric::NumericPtr, - ldim::Int64, nrhs::Int64, b::Ptr{Cdouble}, common::Ref{KluLCommon}) +function klu_l_solve( + symbolic::SymbolicPtr, + numeric::NumericPtr, + ldim::Int64, + nrhs::Int64, + b::Ptr{Cdouble}, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_l_solve, libklu), Cint, @@ -102,8 +146,14 @@ function klu_l_solve(symbolic::SymbolicPtr, numeric::NumericPtr, ) end -function klu_l_tsolve(symbolic::SymbolicPtr, numeric::NumericPtr, - ldim::Int64, nrhs::Int64, b::Ptr{Cdouble}, common::Ref{KluLCommon}) +function klu_l_tsolve( + symbolic::SymbolicPtr, + numeric::NumericPtr, + ldim::Int64, + nrhs::Int64, + b::Ptr{Cdouble}, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_l_tsolve, libklu), Cint, @@ -112,8 +162,10 @@ function klu_l_tsolve(symbolic::SymbolicPtr, numeric::NumericPtr, ) end -function klu_l_free_numeric!(numeric_ref::Ref{NumericPtr}, - common::Ref{KluLCommon}) +function klu_l_free_numeric!( + numeric_ref::Ref{NumericPtr}, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_l_free_numeric, libklu), Cint, @@ -122,8 +174,69 @@ function klu_l_free_numeric!(numeric_ref::Ref{NumericPtr}, ) end -function klu_zl_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{ComplexF64}, - symbolic::SymbolicPtr, common::Ref{KluLCommon}) +# Performance knobs / diagnostics — Int64 family. + +# klu_l_sort sorts the columns of the L and U factors in place. KLU's +# numeric phase stores L/U columns in arbitrary order; sorting once after +# the factor improves cache locality on every subsequent solve. Cheap +# (O(nnz_factor)) and idempotent. Call once after the first numeric +# factor; refactor preserves the sort. +function klu_l_sort( + symbolic::SymbolicPtr, + numeric::NumericPtr, + common::Ref{KluLCommon}, +) + return @klu_lock ccall( + (:klu_l_sort, libklu), + Cint, + (SymbolicPtr, NumericPtr, Ptr{KluLCommon}), + symbolic, numeric, common, + ) +end + +# klu_l_condest computes a 1-norm condition number estimate, populating +# `common.condest`. Costs roughly two extra solves. Useful for iterative +# refinement (informs tolerance choice) and as a diagnostic for near-singular +# matrices. Caller reads result from `cache.common[].condest`. +function klu_l_condest( + ap::Ptr{Int64}, + ax::Ptr{Cdouble}, + symbolic::SymbolicPtr, + numeric::NumericPtr, + common::Ref{KluLCommon}, +) + return @klu_lock ccall( + (:klu_l_condest, libklu), + Cint, + (Ptr{Int64}, Ptr{Cdouble}, SymbolicPtr, NumericPtr, Ptr{KluLCommon}), + ap, ax, symbolic, numeric, common, + ) +end + +# klu_l_rcond fills `common.rcond` with the cheap diagonal-ratio +# reciprocal condition estimate (min(|diag(U)|)/max(|diag(U)|)). Faster +# than condest but less reliable. +function klu_l_rcond( + symbolic::SymbolicPtr, + numeric::NumericPtr, + common::Ref{KluLCommon}, +) + return @klu_lock ccall( + (:klu_l_rcond, libklu), + Cint, + (SymbolicPtr, NumericPtr, Ptr{KluLCommon}), + symbolic, numeric, common, + ) +end + +# Complex / Int64 +function klu_zl_factor( + ap::Ptr{Int64}, + ai::Ptr{Int64}, + ax::Ptr{ComplexF64}, + symbolic::SymbolicPtr, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_zl_factor, libklu), NumericPtr, @@ -132,42 +245,74 @@ function klu_zl_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{ComplexF64}, ) end -function klu_zl_refactor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{ComplexF64}, - symbolic::SymbolicPtr, numeric::NumericPtr, common::Ref{KluLCommon}) +function klu_zl_refactor( + ap::Ptr{Int64}, + ai::Ptr{Int64}, + ax::Ptr{ComplexF64}, + symbolic::SymbolicPtr, + numeric::NumericPtr, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_zl_refactor, libklu), Cint, - (Ptr{Int64}, Ptr{Int64}, Ptr{ComplexF64}, SymbolicPtr, NumericPtr, - Ptr{KluLCommon}), + ( + Ptr{Int64}, + Ptr{Int64}, + Ptr{ComplexF64}, + SymbolicPtr, + NumericPtr, + Ptr{KluLCommon}, + ), ap, ai, ax, symbolic, numeric, common, ) end -function klu_zl_solve(symbolic::SymbolicPtr, numeric::NumericPtr, - ldim::Int64, nrhs::Int64, b::Ptr{ComplexF64}, common::Ref{KluLCommon}) +function klu_zl_solve( + symbolic::SymbolicPtr, + numeric::NumericPtr, + ldim::Int64, + nrhs::Int64, + b::Ptr{ComplexF64}, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_zl_solve, libklu), Cint, - (SymbolicPtr, NumericPtr, Int64, Int64, Ptr{ComplexF64}, - Ptr{KluLCommon}), + (SymbolicPtr, NumericPtr, Int64, Int64, Ptr{ComplexF64}, Ptr{KluLCommon}), symbolic, numeric, ldim, nrhs, b, common, ) end -function klu_zl_tsolve(symbolic::SymbolicPtr, numeric::NumericPtr, - ldim::Int64, nrhs::Int64, b::Ptr{ComplexF64}, conj_solve::Cint, - common::Ref{KluLCommon}) +function klu_zl_tsolve( + symbolic::SymbolicPtr, + numeric::NumericPtr, + ldim::Int64, + nrhs::Int64, + b::Ptr{ComplexF64}, + conj_solve::Cint, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_zl_tsolve, libklu), Cint, - (SymbolicPtr, NumericPtr, Int64, Int64, Ptr{ComplexF64}, Cint, - Ptr{KluLCommon}), + ( + SymbolicPtr, + NumericPtr, + Int64, + Int64, + Ptr{ComplexF64}, + Cint, + Ptr{KluLCommon}, + ), symbolic, numeric, ldim, nrhs, b, conj_solve, common, ) end -function klu_zl_free_numeric!(numeric_ref::Ref{NumericPtr}, - common::Ref{KluLCommon}) +function klu_zl_free_numeric!( + numeric_ref::Ref{NumericPtr}, + common::Ref{KluLCommon}, +) return @klu_lock ccall( (:klu_zl_free_numeric, libklu), Cint, @@ -176,14 +321,305 @@ function klu_zl_free_numeric!(numeric_ref::Ref{NumericPtr}, ) end -# Status codes from klu.h. -const KLU_OK = 0 -const KLU_SINGULAR = 1 -const KLU_OUT_OF_MEMORY = -2 -const KLU_INVALID = -3 -const KLU_TOO_LARGE = -4 +# --------------------------------------------------------------------------- +# Int32 (int) family — mirror of the Int64 set above +# --------------------------------------------------------------------------- + +# Layout matches `klu_common` (Int32 path) in upstream `klu.h`. Differs from +# `KluLCommon` only in the four rank/diag fields, which are `int` instead of +# `SuiteSparse_long`. +mutable struct KluCommon + tol::Cdouble + memgrow::Cdouble + initmem_amd::Cdouble + initmem::Cdouble + maxwork::Cdouble + btf::Cint + ordering::Cint + scale::Cint + user_order::Ptr{Cvoid} + user_data::Ptr{Cvoid} + halt_if_singular::Cint + status::Cint + nrealloc::Cint + structural_rank::Cint + numerical_rank::Cint + singular_col::Cint + noffdiag::Cint + flops::Cdouble + rcond::Cdouble + condest::Cdouble + rgrowth::Cdouble + work::Cdouble + memusage::Csize_t + mempeak::Csize_t + KluCommon() = new() +end + +mutable struct KluSymbolic end +mutable struct KluNumeric end + +const SymbolicPtr32 = Ptr{KluSymbolic} +const NumericPtr32 = Ptr{KluNumeric} + +klu_defaults!(common::Ref{KluCommon}) = + @klu_lock ccall((:klu_defaults, libklu), Cint, (Ptr{KluCommon},), common) + +function klu_analyze( + n::Cint, + ap::Ptr{Cint}, + ai::Ptr{Cint}, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_analyze, libklu), + SymbolicPtr32, + (Cint, Ptr{Cint}, Ptr{Cint}, Ptr{KluCommon}), + n, ap, ai, common, + ) +end + +function klu_free_symbolic!( + symbolic_ref::Ref{SymbolicPtr32}, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_free_symbolic, libklu), + Cint, + (Ptr{SymbolicPtr32}, Ptr{KluCommon}), + symbolic_ref, common, + ) +end + +function klu_factor( + ap::Ptr{Cint}, + ai::Ptr{Cint}, + ax::Ptr{Cdouble}, + symbolic::SymbolicPtr32, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_factor, libklu), + NumericPtr32, + (Ptr{Cint}, Ptr{Cint}, Ptr{Cdouble}, SymbolicPtr32, Ptr{KluCommon}), + ap, ai, ax, symbolic, common, + ) +end + +function klu_refactor( + ap::Ptr{Cint}, + ai::Ptr{Cint}, + ax::Ptr{Cdouble}, + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_refactor, libklu), + Cint, + ( + Ptr{Cint}, + Ptr{Cint}, + Ptr{Cdouble}, + SymbolicPtr32, + NumericPtr32, + Ptr{KluCommon}, + ), + ap, ai, ax, symbolic, numeric, common, + ) +end + +function klu_solve( + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + ldim::Cint, + nrhs::Cint, + b::Ptr{Cdouble}, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_solve, libklu), + Cint, + (SymbolicPtr32, NumericPtr32, Cint, Cint, Ptr{Cdouble}, Ptr{KluCommon}), + symbolic, numeric, ldim, nrhs, b, common, + ) +end + +function klu_tsolve( + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + ldim::Cint, + nrhs::Cint, + b::Ptr{Cdouble}, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_tsolve, libklu), + Cint, + (SymbolicPtr32, NumericPtr32, Cint, Cint, Ptr{Cdouble}, Ptr{KluCommon}), + symbolic, numeric, ldim, nrhs, b, common, + ) +end + +function klu_free_numeric!( + numeric_ref::Ref{NumericPtr32}, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_free_numeric, libklu), + Cint, + (Ptr{NumericPtr32}, Ptr{KluCommon}), + numeric_ref, common, + ) +end + +# Performance knobs / diagnostics — Int32 family. + +function klu_sort( + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_sort, libklu), + Cint, + (SymbolicPtr32, NumericPtr32, Ptr{KluCommon}), + symbolic, numeric, common, + ) +end + +function klu_condest( + ap::Ptr{Cint}, + ax::Ptr{Cdouble}, + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_condest, libklu), + Cint, + (Ptr{Cint}, Ptr{Cdouble}, SymbolicPtr32, NumericPtr32, Ptr{KluCommon}), + ap, ax, symbolic, numeric, common, + ) +end + +function klu_rcond( + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_rcond, libklu), + Cint, + (SymbolicPtr32, NumericPtr32, Ptr{KluCommon}), + symbolic, numeric, common, + ) +end + +# Complex / Int32 +function klu_z_factor( + ap::Ptr{Cint}, + ai::Ptr{Cint}, + ax::Ptr{ComplexF64}, + symbolic::SymbolicPtr32, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_z_factor, libklu), + NumericPtr32, + (Ptr{Cint}, Ptr{Cint}, Ptr{ComplexF64}, SymbolicPtr32, Ptr{KluCommon}), + ap, ai, ax, symbolic, common, + ) +end + +function klu_z_refactor( + ap::Ptr{Cint}, + ai::Ptr{Cint}, + ax::Ptr{ComplexF64}, + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_z_refactor, libklu), + Cint, + ( + Ptr{Cint}, + Ptr{Cint}, + Ptr{ComplexF64}, + SymbolicPtr32, + NumericPtr32, + Ptr{KluCommon}, + ), + ap, ai, ax, symbolic, numeric, common, + ) +end + +function klu_z_solve( + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + ldim::Cint, + nrhs::Cint, + b::Ptr{ComplexF64}, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_z_solve, libklu), + Cint, + ( + SymbolicPtr32, + NumericPtr32, + Cint, + Cint, + Ptr{ComplexF64}, + Ptr{KluCommon}, + ), + symbolic, numeric, ldim, nrhs, b, common, + ) +end + +function klu_z_tsolve( + symbolic::SymbolicPtr32, + numeric::NumericPtr32, + ldim::Cint, + nrhs::Cint, + b::Ptr{ComplexF64}, + conj_solve::Cint, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_z_tsolve, libklu), + Cint, + ( + SymbolicPtr32, + NumericPtr32, + Cint, + Cint, + Ptr{ComplexF64}, + Cint, + Ptr{KluCommon}, + ), + symbolic, numeric, ldim, nrhs, b, conj_solve, common, + ) +end + +function klu_z_free_numeric!( + numeric_ref::Ref{NumericPtr32}, + common::Ref{KluCommon}, +) + return @klu_lock ccall( + (:klu_z_free_numeric, libklu), + Cint, + (Ptr{NumericPtr32}, Ptr{KluCommon}), + numeric_ref, common, + ) +end + +# --------------------------------------------------------------------------- +# Shared error throw — dispatches on the common-struct width +# --------------------------------------------------------------------------- -function klu_throw(common::KluLCommon, op::AbstractString) +function klu_throw(common::Union{KluLCommon, KluCommon}, op::AbstractString) s = common.status s == KLU_SINGULAR && throw(LinearAlgebra.SingularException(Int(common.singular_col + 1))) diff --git a/src/KLUWrapper/solve_dense.jl b/src/KLUWrapper/solve_dense.jl index 7ce29b78c..85d758170 100644 --- a/src/KLUWrapper/solve_dense.jl +++ b/src/KLUWrapper/solve_dense.jl @@ -5,17 +5,19 @@ Solve `A · X = B` in place. `B::StridedVecOrMat{Tv}` must have first-dimension size equal to `cache.n` and unit stride in the first dimension. Multiple columns of `B` are handled in a single libklu call. """ -function solve!(cache::KLULinSolveCache{Tv}, - B::StridedVecOrMat{Tv}) where {Tv <: Union{Float64, ComplexF64}} +function solve!( + cache::KLULinSolveCache{Tv, Ti}, + B::StridedVecOrMat{Tv}, +) where {Tv, Ti} is_factored(cache) || error("KLULinSolveCache: not factored yet.") n = _dim(cache) - size(B, 1) == n || throw(DimensionMismatch( - "size(B, 1) = $(size(B, 1)), cache n = $(n)", + size(B, 1) == Int(n) || throw(DimensionMismatch( + "size(B, 1) = $(size(B, 1)), cache n = $(Int(n))", )) stride(B, 1) == 1 || throw(ArgumentError( "B must have unit stride in the first dimension.", )) - nrhs = Int64(size(B, 2)) + nrhs = size(B, 2) nrhs == 0 && return B # Snapshot KLU's preconditions plus identity info — gated by # `KLU_POOL_DEBUG`. `klu_l_solve` returns FALSE with `KLU_INVALID` when @@ -31,7 +33,7 @@ function solve!(cache::KLULinSolveCache{Tv}, pre_b_ptr = pointer(B) end ok = _solve_call( - Tv, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common, + Tv, Ti, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common, ) if ok == 0 @static if KLU_POOL_DEBUG @@ -57,21 +59,23 @@ In-place solve `Aᵀ · X = B` (or `Aᴴ · X = B` when `conjugate=true` on the complex path). Same shape requirements as `solve!`. The `conjugate` keyword is ignored on the real path. """ -function tsolve!(cache::KLULinSolveCache{Tv}, - B::StridedVecOrMat{Tv}; conjugate::Bool = false, -) where {Tv <: Union{Float64, ComplexF64}} +function tsolve!( + cache::KLULinSolveCache{Tv, Ti}, + B::StridedVecOrMat{Tv}; + conjugate::Bool = false, +) where {Tv, Ti} is_factored(cache) || error("KLULinSolveCache: not factored yet.") n = _dim(cache) - size(B, 1) == n || throw(DimensionMismatch( - "size(B, 1) = $(size(B, 1)), cache n = $(n)", + size(B, 1) == Int(n) || throw(DimensionMismatch( + "size(B, 1) = $(size(B, 1)), cache n = $(Int(n))", )) stride(B, 1) == 1 || throw(ArgumentError( "B must have unit stride in the first dimension.", )) - nrhs = Int64(size(B, 2)) + nrhs = size(B, 2) nrhs == 0 && return B ok = _tsolve_call( - Tv, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common; + Tv, Ti, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common; conjugate = conjugate, ) ok == 0 && klu_throw(cache.common[], "klu_tsolve") @@ -83,7 +87,9 @@ end Allocating solve, mirroring `LinearAlgebra.Factorization`'s API. """ -function Base.:\(cache::KLULinSolveCache{Tv}, - B::StridedVecOrMat{Tv}) where {Tv <: Union{Float64, ComplexF64}} +function Base.:\( + cache::KLULinSolveCache{Tv, Ti}, + B::StridedVecOrMat{Tv}, +) where {Tv, Ti} return solve!(cache, copy(B)) end diff --git a/src/KLUWrapper/solve_sparse_rhs.jl b/src/KLUWrapper/solve_sparse_rhs.jl index a20679f24..089fa06a0 100644 --- a/src/KLUWrapper/solve_sparse_rhs.jl +++ b/src/KLUWrapper/solve_sparse_rhs.jl @@ -20,20 +20,23 @@ Not thread-safe (mutates per-cache scratch). Callers should serialize access through a per-cache lock if invoked from multiple threads. """ function solve_sparse!( - cache::KLULinSolveCache{Tv}, - B::SparseMatrixCSC{Tb, Int}, + cache::KLULinSolveCache{Tv, Ti}, + B::SparseMatrixCSC{Tb, <:Integer}, out::AbstractMatrix{Tv}; block::Int = SPARSE_RHS_DEFAULT_BLOCK, -) where {Tv <: Union{Float64, ComplexF64}, Tb <: Number} +) where {Tv, Ti, Tb <: Number} is_factored(cache) || error("KLULinSolveCache: not factored yet.") block >= 1 || throw(ArgumentError("block must be >= 1; got $(block)")) n = _dim(cache) - size(B, 1) == n || throw(DimensionMismatch( - "size(B, 1) = $(size(B, 1)), cache n = $(n)", - )) - size(out, 1) == n && size(out, 2) == size(B, 2) || throw(DimensionMismatch( - "out has size $(size(out)); expected $((n, size(B, 2))).", + size(B, 1) == Int(n) || throw(DimensionMismatch( + "size(B, 1) = $(size(B, 1)), cache n = $(Int(n))", )) + size(out, 1) == Int(n) && size(out, 2) == size(B, 2) || + throw( + DimensionMismatch( + "out has size $(size(out)); expected $((Int(n), size(B, 2))).", + ), + ) nb = size(B, 2) nb == 0 && return out @@ -71,7 +74,7 @@ function solve_sparse!( # helper) to keep the inner-loop closure from capturing the per-chunk # `npack`, which would force Julia to box it. ok = _solve_call( - Tv, cache.symbolic, cache.numeric, n, Int64(npack), + Tv, Ti, cache.symbolic, cache.numeric, n, npack, pointer(scratch), cache.common, ) if ok == 0 && cache.common[].status == KLU_INVALID @@ -79,7 +82,7 @@ function solve_sparse!( objectid(cache) _recover_factorization!(cache) ok = _solve_call( - Tv, cache.symbolic, cache.numeric, n, Int64(npack), + Tv, Ti, cache.symbolic, cache.numeric, n, npack, pointer(scratch), cache.common, ) end @@ -96,12 +99,13 @@ function solve_sparse!( end """Allocating wrapper around `solve_sparse!`.""" -function solve_sparse(cache::KLULinSolveCache{Tv}, - B::SparseMatrixCSC{<:Number, Int}; +function solve_sparse( + cache::KLULinSolveCache{Tv, Ti}, + B::SparseMatrixCSC{<:Number, <:Integer}; block::Int = SPARSE_RHS_DEFAULT_BLOCK, -) where {Tv <: Union{Float64, ComplexF64}} +) where {Tv, Ti} return solve_sparse!( - cache, B, Matrix{Tv}(undef, _dim(cache), size(B, 2)); + cache, B, Matrix{Tv}(undef, Int(_dim(cache)), size(B, 2)); block = block, ) end diff --git a/src/PowerNetworkMatrices.jl b/src/PowerNetworkMatrices.jl index 5239f5859..23ce6e4c5 100644 --- a/src/PowerNetworkMatrices.jl +++ b/src/PowerNetworkMatrices.jl @@ -89,8 +89,12 @@ import .KLUWrapper: n_valid, is_factored +include("AccelerateWrapper/AccelerateWrapper.jl") +import .AccelerateWrapper: AAFactorCache, aa_factorize, aa_spmm!, aa_spmv! + include("linalg_settings.jl") include("solver_dispatch.jl") +include("iterative_refinement.jl") function __init__() something(get_linalg_backend_check(), false) && check_linalg_backend() @@ -137,14 +141,12 @@ include("virtual_modf_calculations.jl") include("system_utils.jl") include("serialization.jl") -# Declare functions that will be defined by extensions -# These need to be declared so extensions can extend them +# Forward declarations for symbols still defined inside package extensions. +# AppleAccelerate-related functions live in `src/` now (no extension needed) +# and are not redeclared here. function _calculate_PTDF_matrix_MKLPardiso end -function _calculate_PTDF_matrix_AppleAccelerate end function _calculate_LODF_matrix_MKLPardiso end -function _calculate_LODF_matrix_AppleAccelerate end function _pardiso_sequential_LODF! end function _pardiso_single_LODF! end -function _create_apple_accelerate_factorization end end diff --git a/src/PowerflowMatrixTypes.jl b/src/PowerflowMatrixTypes.jl index 98d103289..cd1aa2ab9 100644 --- a/src/PowerflowMatrixTypes.jl +++ b/src/PowerflowMatrixTypes.jl @@ -1,7 +1,7 @@ const DC_ABA_Matrix_Factorized = ABA_Matrix{ Tuple{Vector{Int64}, Vector{Int64}}, Tuple{Dict{Int64, Int64}, Dict{Int64, Int64}}, - KLULinSolveCache{Float64}, + KLULinSolveCache{Float64, Int64}, } const DC_ABA_Matrix_Unfactorized = ABA_Matrix{ Tuple{Vector{Int64}, Vector{Int64}}, diff --git a/src/common.jl b/src/common.jl index 70a1b50ff..c506170ad 100644 --- a/src/common.jl +++ b/src/common.jl @@ -6,6 +6,23 @@ function _add_to_collection!( return end +""" + _build_bus_to_valid_idx(n_buses, valid_ix) -> Vector{Int} + +Build the inverse of `valid_ix`: a length-`n_buses` vector where entry `b` +is the position of bus `b` inside `valid_ix`, or 0 if `b` is a reference +bus. Used by the Virtual\\* row-computation hot path so it can iterate the +nonzeros of a sparse `BA` column directly (O(nnz_col)) instead of scanning +the full bus axis (O(n_buses)) and bisecting the CSC for each entry. +""" +function _build_bus_to_valid_idx(n_buses::Int, valid_ix::Vector{Int}) + bus_to_valid_idx = zeros(Int, n_buses) + @inbounds for (i, b) in enumerate(valid_ix) + bus_to_valid_idx[b] = i + end + return bus_to_valid_idx +end + function _add_to_collection!( collection_tr3w::Vector{PSY.ThreeWindingTransformer}, transformer_tr3w::PSY.ThreeWindingTransformer, diff --git a/src/iterative_refinement.jl b/src/iterative_refinement.jl new file mode 100644 index 000000000..50e01f86e --- /dev/null +++ b/src/iterative_refinement.jl @@ -0,0 +1,153 @@ +import LinearAlgebra: norm, mul! + +""" + DEFAULT_REFINEMENT_MAX_ITER :: Int + +Default iteration cap for `solve_w_refinement!`. +""" +const DEFAULT_REFINEMENT_MAX_ITER = 25 + +# --- backend dispatch bridge --------------------------------------------------- +# +# The refinement body uses three primitives — in-place `solve!`, `is_factored`, +# and `_dim` — each of which is defined inside `KLUWrapper` and +# `AccelerateWrapper` as a *separate* function. The bridge below lets the +# shared body reach the right backend by multiple dispatch instead of an `isa` +# branch, and keeps the body itself backend-agnostic. + +_refine_solve!(K::KLULinSolveCache, r::StridedVecOrMat) = solve!(K, r) +_refine_solve!(K::AAFactorCache, r::StridedVecOrMat) = AccelerateWrapper.solve!(K, r) + +_refine_is_factored(K::KLULinSolveCache) = is_factored(K) +_refine_is_factored(K::AAFactorCache) = AccelerateWrapper.is_factored(K) + +_refine_dim(K::KLULinSolveCache) = Int(KLUWrapper._dim(K)) +_refine_dim(K::AAFactorCache) = AccelerateWrapper._dim(K) + +""" + solve_w_refinement!(cache, A, X, B; tol=…, max_iters=…) -> X + +Solve `A · X = B` in place using `cache` and apply iterative refinement +until `norm(B − A·X, 1) < norm(B, 1) * tol`, or until refinement stops +improving. `X` must be pre-allocated by the caller with the same shape as +`B`; it is overwritten with the refined solution. + +This is the non-allocating variant. For a one-shot allocating variant that +returns a fresh `X`, see `solve_w_refinement`. + +Supports `cache::KLULinSolveCache` (KLU backend, any `{Tv, Ti}`) and +`cache::AAFactorCache` (Apple Accelerate backend, `Cdouble` only). The +cache must already be factored (`is_factored(cache) == true`). The function +does not mutate `cache`'s factor; it only triggers in-place `solve!` calls +on a residual buffer. + +`max_iters` caps the refinement loop (default +`DEFAULT_REFINEMENT_MAX_ITER`). The default `tol` is `sqrt(eps(real(Tv)))`, +which is conservative for power-flow Newton-Raphson Jacobians. + +Useful when the cached factor is of an ill-conditioned matrix — e.g., +a Newton-Raphson Jacobian near a saddle point or a network reduction +interface — where a single back-solve carries enough error to slow NR +convergence. Cost per refinement iteration: one sparse matrix-vector +product plus one `solve!` against the cached factor. +""" +function solve_w_refinement!( + cache::KLULinSolveCache{Tv, Ti}, + A::SparseArrays.SparseMatrixCSC{Tv, Ti}, + X::StridedVecOrMat{Tv}, + B::StridedVecOrMat{Tv}; + tol::Real = sqrt(eps(real(Tv))), + max_iters::Int = DEFAULT_REFINEMENT_MAX_ITER, +) where {Tv, Ti} + return _solve_w_refinement_body!(cache, A, X, B, tol, max_iters) +end + +function solve_w_refinement!( + cache::AAFactorCache, + A::SparseArrays.SparseMatrixCSC{Cdouble, <:Integer}, + X::StridedVecOrMat{Cdouble}, + B::StridedVecOrMat{Cdouble}; + tol::Real = sqrt(eps(Cdouble)), + max_iters::Int = DEFAULT_REFINEMENT_MAX_ITER, +) + return _solve_w_refinement_body!(cache, A, X, B, tol, max_iters) +end + +function _solve_w_refinement_body!( + cache, + A::SparseArrays.SparseMatrixCSC, + X::StridedVecOrMat, + B::StridedVecOrMat, + tol::Real, + max_iters::Int, +) + _refine_is_factored(cache) || error( + "solve_w_refinement!: cache must be factored. " * + "Call `full_factor!(cache, A)` first.", + ) + size(X) == size(B) || + throw(DimensionMismatch("X is $(size(X)); B is $(size(B)).")) + size(B, 1) == _refine_dim(cache) || throw( + DimensionMismatch( + "size(B, 1) = $(size(B, 1)); cache n = $(_refine_dim(cache)).", + ), + ) + + Tv = eltype(X) + fill!(X, zero(Tv)) + # `X = 0` ⇒ initial residual `r = B - A·0 = B`. Copy once; subsequent + # iterations reuse `r` in place via `mul!(r, A, X); @. r = B - r`, + # which avoids the temporary `A * X` allocation `B - A * X` would create + # each refinement step. + r = copy(B) + bNorm = norm(B, 1) + iters = 0 + while iters < max_iters && norm(r, 1) >= bNorm * tol + prev_err = norm(r, 1) + _refine_solve!(cache, r) + X .+= r + mul!(r, A, X) + @. r = B - r + iters += 1 + if norm(r, 1) > prev_err + @debug "Iterative refinement diverging; returning best-so-far." iters + return X + end + end + @debug "Iterative refinement converged." iters + return X +end + +""" + solve_w_refinement(cache, A, B; tol=…, max_iters=…) -> X + +Allocating wrapper around `solve_w_refinement!`. Allocates `X` matching +`B`'s shape, then refines. +""" +function solve_w_refinement( + cache::KLULinSolveCache{Tv, Ti}, + A::SparseArrays.SparseMatrixCSC{Tv, Ti}, + B::StridedVecOrMat{Tv}; + tol::Real = sqrt(eps(real(Tv))), + max_iters::Int = DEFAULT_REFINEMENT_MAX_ITER, +) where {Tv, Ti} + X = similar(B) + return solve_w_refinement!( + cache, A, X, B; + tol = tol, max_iters = max_iters, + ) +end + +function solve_w_refinement( + cache::AAFactorCache, + A::SparseArrays.SparseMatrixCSC{Cdouble, <:Integer}, + B::StridedVecOrMat{Cdouble}; + tol::Real = sqrt(eps(Cdouble)), + max_iters::Int = DEFAULT_REFINEMENT_MAX_ITER, +) + X = similar(B) + return solve_w_refinement!( + cache, A, X, B; + tol = tol, max_iters = max_iters, + ) +end diff --git a/src/linalg_settings.jl b/src/linalg_settings.jl index c79962189..58428f8a5 100644 --- a/src/linalg_settings.jl +++ b/src/linalg_settings.jl @@ -1,7 +1,9 @@ -# Extensions are loaded when trigger packages (Pardiso, AppleAccelerate) are loaded +# The MKL/Pardiso path still uses the package-extension mechanism (Pardiso.jl +# is the only consumer-facing way to access the MKL Pardiso solver). The +# Apple Accelerate path no longer does — `AccelerateWrapper` is built in via +# a `@static if Sys.isapple()` gate. -# Check if MKL/Pardiso extension is available at runtime function _has_mkl_pardiso_ext() ext = Base.get_extension(@__MODULE__, :MKLPardisoExt) return !isnothing(ext) @@ -12,20 +14,21 @@ _mkl_pardiso_install_error() = Install the Pardiso package: julia> using Pkg; Pkg.add(\"Pardiso\")""" -# Check if AppleAccelerate extension is available at runtime -function _has_apple_accelerate_ext() - ext = Base.get_extension(@__MODULE__, :AppleAccelerateExt) - return !isnothing(ext) -end +_has_apple_accelerate_backend() = Sys.isapple() -_apple_accelerate_install_error() = - """The AppleAccelerate extension is not available. - This solver is only available on macOS. - Install AppleAccelerate: - julia> using Pkg; Pkg.add(\"AppleAccelerate\")""" +_apple_accelerate_unavailable_error() = + """The Apple Accelerate sparse backend is macOS-only (Sys.isapple() returned false). + Use the KLU solver (the default) on non-Apple platforms.""" -# _create_apple_accelerate_factorization is defined in ext/AppleAccelerateExt.jl -# when AppleAccelerate package is loaded +""" + _default_linear_solver() -> String + +Default sparse linear solver name. Returns "AppleAccelerate" on macOS (Apple's +built-in libSparse via `AccelerateWrapper`) and "KLU" elsewhere. Used as the +default for the `linear_solver` keyword on PTDF / LODF / VirtualPTDF / +VirtualLODF / VirtualMODF constructors. +""" +_default_linear_solver() = Sys.isapple() ? "AppleAccelerate" : "KLU" "Set a preference of the backend library for sparse linear algebra operations." function set_linalg_backend_preference(linalglib::Union{String, Nothing}) @@ -125,14 +128,12 @@ function check_linalg_backend() end end - if _has_apple_accelerate_ext() + if _has_apple_accelerate_backend() @info go_msg("AppleAccelerate") else if user_linalg_backend == "AppleAccelerate" - @warn yo_msg("AppleAccelerate") + @warn "AppleAccelerate is only available on Apple platforms." end - @info no_msg("AppleAccelerate") - @info "See https://github.com/JuliaLinearAlgebra/AppleAccelerate.jl" end end end diff --git a/src/lodf_calculations.jl b/src/lodf_calculations.jl index 78dae430c..95754181f 100644 --- a/src/lodf_calculations.jl +++ b/src/lodf_calculations.jl @@ -64,6 +64,40 @@ get_network_reduction_data(M::LODF) = M.network_reduction_data get_arc_lookup(M::LODF) = M.lookup[1] stores_transpose(::LODF) = true +# --- Demand-matrix short-circuit --------------------------------------------- +# +# The LODF computation builds a *diagonal* "demand" matrix `D = diag(m_V)` +# where `m_V[i] = 1 - PTDF·A[i, i]` (clamped to 1.0 at `LODF_ENTRY_TOLERANCE` +# to avoid divide-by-zero when an outage islands the line). The original +# code factored `D` and ran a triangular solve `D · X = ptdf_denominator`; +# that's a `factor + back-solve` over a diagonal, which collapses to +# element-wise row scaling. KLU's BTF short-circuits this internally so the +# overhead was modest; AA's libSparse and LAPACK's `getrf!`/`getrs!` do +# not, so the previous code was 3–5× slower on AA and order-of-magnitude +# slower on DENSE than necessary. Replace both with a direct row scaling. + +function _build_lodf_demand(ptdf_denominator::AbstractMatrix{Float64}, linecount::Int) + m_V = Vector{Float64}(undef, linecount) + @inbounds for i in 1:linecount + d = 1.0 - ptdf_denominator[i, i] + m_V[i] = d < LODF_ENTRY_TOLERANCE ? 1.0 : d + end + return m_V +end + +function _apply_lodf_demand!(M::AbstractMatrix{Float64}, m_V::Vector{Float64}) + IS.@assert_op size(M, 1) == length(m_V) + IS.@assert_op size(M, 1) == size(M, 2) + # `inv_dem .* M` mirrors what the triangular solve did internally — + # one reciprocal per row, then a row-wise multiply. The broadcast + # `M .*= inv_dem` scales each row `i` by `inv_dem[i]` because the + # length-n vector broadcasts down the first dimension. + inv_dem = 1.0 ./ m_V + M .*= inv_dem + M[SparseArrays.diagind(M)] .= -1.0 + return M +end + function _buildlodf( a::SparseArrays.SparseMatrixCSC{Int8, Int}, ptdf::Matrix{Float64}, @@ -94,7 +128,7 @@ function _buildlodf( ptdf::Matrix{Float64}, ::AppleAccelerateSolver, ) - _has_apple_accelerate_ext() || error(_apple_accelerate_install_error()) + _has_apple_accelerate_backend() || error(_apple_accelerate_unavailable_error()) return _calculate_LODF_matrix_AppleAccelerate(a, ptdf) end @@ -131,20 +165,8 @@ function _calculate_LODF_matrix_KLU( solve_sparse!(k, a_t_valid, view(first_, valid_ix, :)) ptdf_denominator = first_' * ba - m_I = Int[] - m_V = Float64[] - for iline in 1:linecount - if (1.0 - ptdf_denominator[iline, iline]) < LODF_ENTRY_TOLERANCE - push!(m_I, iline) - push!(m_V, 1.0) - else - push!(m_I, iline) - push!(m_V, 1 - ptdf_denominator[iline, iline]) - end - end - Dem_cache = klu_factorize(SparseArrays.sparse(m_I, m_I, m_V)) - solve!(Dem_cache, ptdf_denominator) - ptdf_denominator[SparseArrays.diagind(ptdf_denominator)] .= -1.0 + m_V = _build_lodf_demand(ptdf_denominator, linecount) + _apply_lodf_demand!(ptdf_denominator, m_V) return ptdf_denominator end @@ -154,22 +176,9 @@ function _calculate_LODF_matrix_KLU( ) linecount = size(ptdf, 2) ptdf_denominator_t = a * ptdf - m_I = Int[] - m_V = Float64[] - for iline in 1:linecount - if (1.0 - ptdf_denominator_t[iline, iline]) < LODF_ENTRY_TOLERANCE - push!(m_I, iline) - push!(m_V, 1.0) - else - push!(m_I, iline) - push!(m_V, 1 - ptdf_denominator_t[iline, iline]) - end - end - Dem_cache = klu_factorize(SparseArrays.sparse(m_I, m_I, m_V)) + m_V = _build_lodf_demand(ptdf_denominator_t, linecount) lodf_t = copy(ptdf_denominator_t) - solve!(Dem_cache, lodf_t) - lodf_t[SparseArrays.diagind(lodf_t)] .= -1.0 - + _apply_lodf_demand!(lodf_t, m_V) return lodf_t end @@ -179,29 +188,41 @@ function _calculate_LODF_matrix_DENSE( ) linecount = size(ptdf, 2) ptdf_denominator_t = a * ptdf - m_V = Float64[] - for iline in 1:linecount - if (1.0 - ptdf_denominator_t[iline, iline]) < LODF_ENTRY_TOLERANCE - push!(m_V, 1.0) - else - push!(m_V, 1.0 - ptdf_denominator_t[iline, iline]) - end - end - (mV, bipiv, binfo) = getrf!(Matrix(LinearAlgebra.diagm(m_V))) - _binfo_check(binfo) - getrs!('N', mV, bipiv, ptdf_denominator_t) - ptdf_denominator_t[LinearAlgebra.diagind(ptdf_denominator_t)] .= -1.0 + m_V = _build_lodf_demand(ptdf_denominator_t, linecount) + _apply_lodf_demand!(ptdf_denominator_t, m_V) return ptdf_denominator_t end # _pardiso_sequential_LODF!, _pardiso_single_LODF!, _calculate_LODF_matrix_MKLPardiso # are defined in ext/MKLPardisoExt.jl when the Pardiso package is loaded -# _calculate_LODF_matrix_AppleAccelerate is defined in ext/AppleAccelerateExt.jl -# when the AppleAccelerate package is loaded +@static if Sys.isapple() + """ + Function for internal use only. + + Computes the LODF matrix using the internal Apple Accelerate backend + (`AccelerateWrapper`). Available only on macOS. Shape mirrors + `_calculate_LODF_matrix_KLU(a, ptdf)` exactly: factor the diagonal "demand" + matrix `diag(1 - PTDF·A)` and solve in place against `a · ptdf`. + + # Arguments + - `a::SparseArrays.SparseMatrixCSC{Int8, Int}`: Incidence Matrix + - `ptdf::Matrix{Float64}`: PTDF matrix + """ + function _calculate_LODF_matrix_AppleAccelerate( + a::SparseArrays.SparseMatrixCSC{Int8, Int}, + ptdf::Matrix{Float64}, + ) + linecount = size(ptdf, 2) + ptdf_denominator_t = a * ptdf + m_V = _build_lodf_demand(ptdf_denominator_t, linecount) + _apply_lodf_demand!(ptdf_denominator_t, m_V) + return ptdf_denominator_t + end +end """ - LODF(sys::PSY.System; linear_solver::String = "KLU", tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...) + LODF(sys::PSY.System; linear_solver::String = _default_linear_solver(), tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...) Construct a Line Outage Distribution Factor (LODF) matrix from a PowerSystems.System by computing the sensitivity of line flows to single line outages. This is the primary constructor for LODF @@ -211,7 +232,7 @@ analysis starting from system data. - `sys::PSY.System`: The power system from which to construct the LODF matrix # Keyword Arguments -- `linear_solver::String = "KLU"`: +- `linear_solver::String = _default_linear_solver()`: Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso" - `tol::Float64 = eps()`: Sparsification tolerance for dropping small matrix elements to reduce memory usage @@ -258,16 +279,12 @@ where A is the incidence matrix and PTDF is the power transfer distribution fact """ function LODF( sys::PSY.System; - linear_solver::String = "KLU", + linear_solver::String = _default_linear_solver(), tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs..., ) - Ymatrix = Ybus( - sys; - network_reductions = network_reductions, - kwargs..., - ) + Ymatrix = Ybus(sys; network_reductions = network_reductions, kwargs...) A = IncidenceMatrix(Ymatrix) BA = BA_Matrix(Ymatrix) ptdf = PTDF(A, BA) @@ -275,7 +292,7 @@ function LODF( end """ - LODF(A::IncidenceMatrix, PTDFm::PTDF; linear_solver::String = "KLU", tol::Float64 = eps()) + LODF(A::IncidenceMatrix, PTDFm::PTDF; linear_solver::String = _default_linear_solver(), tol::Float64 = eps()) Construct a Line Outage Distribution Factor (LODF) matrix from existing incidence and PTDF matrices. This constructor is more efficient when the prerequisite matrices are already available. @@ -285,7 +302,7 @@ This constructor is more efficient when the prerequisite matrices are already av - `PTDFm::PTDF`: The power transfer distribution factor matrix (should be non-sparsified for accuracy) # Keyword Arguments -- `linear_solver::String = "KLU"`: +- `linear_solver::String = _default_linear_solver()`: Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso" - `tol::Float64 = eps()`: Sparsification tolerance for the LODF matrix (not applied to input PTDF) @@ -327,7 +344,7 @@ where: function LODF( A::IncidenceMatrix, PTDFm::PTDF; - linear_solver::String = "KLU", + linear_solver::String = _default_linear_solver(), tol::Float64 = eps(), ) solver = resolve_linear_solver(linear_solver) @@ -384,7 +401,9 @@ efficient when the prerequisite matrices with factorization are already availabl # Keyword Arguments - `linear_solver::String = "KLU"`: - Linear solver algorithm for matrix computations. Currently only "KLU" is supported + This constructor is intentionally KLU-only because `ABA.K` is always a + KLU factorization. The keyword is kept for API consistency; passing any + other value will error. - `tol::Float64 = eps()`: Sparsification tolerance for dropping small matrix elements @@ -431,6 +450,10 @@ function LODF( linear_solver::String = "KLU", tol::Float64 = eps(), ) + # NOTE: ABA.K is always a KLU factorization, so this constructor is + # KLU-only regardless of the `linear_solver` argument. The kwarg is kept + # for API consistency; passing anything other than "KLU" will error in + # `_buildlodf`. if !( isequal(A.network_reduction_data, BA.network_reduction_data) && isequal(BA.network_reduction_data, ABA.network_reduction_data) @@ -443,8 +466,7 @@ function LODF( subnetwork_axes = make_arc_arc_subnetwork_axes(A) ax_ref = make_ax_ref(get_arc_axis(A)) if tol > eps() - lodf_t = - _buildlodf(A.data, ABA.K, BA.data, Set(get_ref_bus_position(A)), solver) + lodf_t = _buildlodf(A.data, ABA.K, BA.data, Set(get_ref_bus_position(A)), solver) return LODF( sparsify(lodf_t, tol), (get_arc_axis(A), get_arc_axis(A)), diff --git a/src/ptdf_calculations.jl b/src/ptdf_calculations.jl index 132fe16b4..d6ccec700 100644 --- a/src/ptdf_calculations.jl +++ b/src/ptdf_calculations.jl @@ -111,7 +111,7 @@ function _buildptdf_from_matrices( ref_bus_positions::Set{Int}, dist_slack::Vector{Float64}, ::AppleAccelerateSolver) - _has_apple_accelerate_ext() || error(_apple_accelerate_install_error()) + _has_apple_accelerate_backend() || error(_apple_accelerate_unavailable_error()) return _calculate_PTDF_matrix_AppleAccelerate(A, BA, ref_bus_positions, dist_slack) end @@ -230,11 +230,63 @@ end # _calculate_PTDF_matrix_MKLPardiso is defined in ext/MKLPardisoExt.jl # when Pardiso package is loaded -# _calculate_PTDF_matrix_AppleAccelerate is defined in ext/AppleAccelerateExt.jl -# when AppleAccelerate package is loaded +@static if Sys.isapple() + """ + Function for internal use only. + + Computes the PTDF matrix using the internal Apple Accelerate backend + (`AccelerateWrapper`). Available only on macOS — non-Apple callers are + rejected by `_create_factorization` before reaching this entry. Shape + mirrors `_calculate_PTDF_matrix_KLU`: factor ABA via LDLT, then solve + `ABA · X = BA[valid_ix, :]` via the block-packed `solve_sparse!`. + + # Arguments + - `A::SparseArrays.SparseMatrixCSC{Int8, Int}`: Incidence Matrix + - `BA::SparseArrays.SparseMatrixCSC{Float64, Int}`: BA matrix + - `ref_bus_positions::Set{Int}`: indexes of reference slack buses + - `dist_slack::Vector{Float64}`: distributed-slack weights + """ + function _calculate_PTDF_matrix_AppleAccelerate( + A::SparseArrays.SparseMatrixCSC{Int8, Int}, + BA::SparseArrays.SparseMatrixCSC{Float64, Int}, + ref_bus_positions::Set{Int}, + dist_slack::Vector{Float64}, + ) + linecount = size(BA, 2) + buscount = size(BA, 1) + if !isempty(dist_slack) && length(ref_bus_positions) != 1 + error( + "Distributed slack is not supported for systems with multiple reference buses.", + ) + end + if !isempty(dist_slack) && length(dist_slack) != buscount + error("Distributed bus specification doesn't match the number of buses.") + end + length(ref_bus_positions) < buscount || error( + "All buses are reference buses; PTDF is not defined.", + ) + + ABA = calculate_ABA_matrix(A, BA, ref_bus_positions) + # ABA = Aᵀ B A is symmetric by construction; skip the structural check. + cache = AccelerateWrapper.aa_factorize(ABA; check_symmetry = false) + valid_ix = setdiff(1:buscount, ref_bus_positions) + PTDFm_t = zeros(buscount, linecount) + AccelerateWrapper.solve_sparse!( + cache, + BA[valid_ix, :], + view(PTDFm_t, valid_ix, :), + ) + + isempty(dist_slack) && return PTDFm_t + + @info "Distributed bus" + slack_array = reshape(dist_slack ./ sum(dist_slack), 1, buscount) + return PTDFm_t .- (slack_array * PTDFm_t) + end +end """ - PTDF(sys::PSY.System; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = "KLU", tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...) + PTDF(sys::PSY.System; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = _default_linear_solver(), tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...) Construct a Power Transfer Distribution Factor (PTDF) matrix from a PowerSystems.System by computing the sensitivity of transmission line flows to bus power injections. This is the primary constructor @@ -247,7 +299,7 @@ for PTDF analysis starting from system data. - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`: Dictionary mapping bus numbers to distributed slack weights for realistic slack modeling. Empty dictionary uses single slack bus (default behavior) -- `linear_solver::String = "KLU"`: +- `linear_solver::String = _default_linear_solver()`: Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso" - `tol::Float64 = eps()`: Sparsification tolerance for dropping small matrix elements to reduce memory usage @@ -300,7 +352,7 @@ where A is the incidence matrix and B is the susceptance matrix. """ function PTDF(sys::PSY.System; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), - linear_solver = "KLU", + linear_solver = _default_linear_solver(), tol::Float64 = eps(), kwargs..., ) @@ -312,7 +364,7 @@ function PTDF(sys::PSY.System; end """ - PTDF(ybus::Ybus; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = "KLU", tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...) + PTDF(ybus::Ybus; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = _default_linear_solver(), tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...) Construct a Power Transfer Distribution Factor (PTDF) matrix from existing Ybus matrix. This constructor is more efficient when the prerequisite matrices are already available and provides @@ -325,7 +377,7 @@ direct control over the underlying matrix computations. - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`: Dictionary mapping bus numbers to distributed slack weights for realistic slack modeling. Empty dictionary uses single slack bus (default behavior) -- `linear_solver::String = "KLU"`: +- `linear_solver::String = _default_linear_solver()`: Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso" - `tol::Float64 = eps()`: Sparsification tolerance for dropping small matrix elements to reduce memory usage @@ -370,7 +422,7 @@ where A is the incidence matrix and B is the susceptance matrix. """ function PTDF(ybus::Ybus; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), - linear_solver = "KLU", + linear_solver = _default_linear_solver(), tol::Float64 = eps(), ) A = IncidenceMatrix(ybus) @@ -385,7 +437,7 @@ function PTDF(ybus::Ybus; end """ - PTDF(A::IncidenceMatrix, BA::BA_Matrix; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = "KLU", tol::Float64 = eps()) + PTDF(A::IncidenceMatrix, BA::BA_Matrix; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = _default_linear_solver(), tol::Float64 = eps()) Construct a Power Transfer Distribution Factor (PTDF) matrix from existing incidence and BA matrices. This constructor is more efficient when the prerequisite matrices are already available and provides @@ -399,7 +451,7 @@ direct control over the underlying matrix computations. - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`: Dictionary mapping bus numbers to distributed slack participation factors. Empty dictionary uses single slack bus (reference bus from matrices) -- `linear_solver::String = "KLU"`: +- `linear_solver::String = _default_linear_solver()`: Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso" - `tol::Float64 = eps()`: Sparsification tolerance for dropping small matrix elements to reduce memory usage @@ -451,7 +503,7 @@ function PTDF( A::IncidenceMatrix, BA::BA_Matrix; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), - linear_solver = "KLU", + linear_solver = _default_linear_solver(), tol::Float64 = eps(), ) dist_slack_vector = if !(isempty(dist_slack)) diff --git a/src/solver_dispatch.jl b/src/solver_dispatch.jl index 7a31ba86c..4998d6c97 100644 --- a/src/solver_dispatch.jl +++ b/src/solver_dispatch.jl @@ -10,17 +10,20 @@ # the callback. One factor + one cache per Virtual matrix is what # remains — same throughput as the pool variant once the global lock # is in place. +# +# Apple's libSparse has no documented cross-handle corruption issue +# analogous to `_LIBKLU_LOCK`; the per-cache `solver_lock` we acquire +# here is sufficient for the `AAFactorCache` backend. """ with_solver(f, K, work_ba_col, temp_data, solver_lock) -> result Acquire `solver_lock`, then invoke `f(K, work_ba_col[1], temp_data[1])`. -Two overloads: one specialized on `KLULinSolveCache{Float64}` (the KLU -backend), one generic for any other factorization (`AppleAccelerate.AAFactorization`, -etc.). Both serialize through `solver_lock`; the per-cache scratch -slot at index 1 is the only slot — `work_ba_col` and `temp_data` are -single-element vectors, kept as `Vector{Vector{Float64}}` because -`_solve_factorization` and the AppleAccelerate extension are typed on +Three overloads: `KLULinSolveCache{Float64}` (KLU backend), `AAFactorCache` +(Apple Accelerate backend), and a generic fallback. All serialize through +`solver_lock`; the per-cache scratch slot at index 1 is the only slot — +`work_ba_col` and `temp_data` are single-element vectors, kept as +`Vector{Vector{Float64}}` because `_solve_factorization` is typed on `Vector{Float64}` and the two buffers have different lengths (`n_buses` vs. `n_buses - n_ref_buses`). """ @@ -34,6 +37,16 @@ function with_solver( return @lock solver_lock f(K, work_ba_col[1], temp_data[1]) end +function with_solver( + f::F, + K::AAFactorCache, + work_ba_col::Vector{Vector{Float64}}, + temp_data::Vector{Vector{Float64}}, + solver_lock::ReentrantLock, +) where {F} + return @lock solver_lock f(K, work_ba_col[1], temp_data[1]) +end + function with_solver( f::F, K::KT, diff --git a/src/virtual_lodf_calculations.jl b/src/virtual_lodf_calculations.jl index d6f4be258..922272ed4 100644 --- a/src/virtual_lodf_calculations.jl +++ b/src/virtual_lodf_calculations.jl @@ -52,6 +52,10 @@ callers can issue requests concurrently; the libklu work runs one at a time. - `valid_ix::Vector{Int}`: Vector containing the row/columns indices of matrices related the buses which are not slack ones. +- `bus_to_valid_idx::Vector{Int}`: + Inverse of `valid_ix`: `bus_to_valid_idx[b]` is the position of bus + `b` inside `valid_ix`, or 0 if `b` is a reference bus. Lets the hot + path iterate the nonzeros of a `BA` column directly. - `temp_data::Vector{Vector{Float64}}`: Single-element scratch vector kept as a `Vector{Vector{Float64}}` for uniform `with_solver` callback signatures. @@ -81,6 +85,7 @@ struct VirtualLODF{Ax <: NTuple{2, Vector}, L <: NTuple{2, Dict}, K} <: axes::Ax lookup::L valid_ix::Vector{Int} + bus_to_valid_idx::Vector{Int} temp_data::Vector{Vector{Float64}} cache::RowCache cache_lock::ReentrantLock @@ -110,7 +115,7 @@ function Base.show(io::IO, ::MIME{Symbol("text/plain")}, array::VirtualLODF) end function _get_PTDF_A_diag( - K::KLULinSolveCache{Float64}, + K, BA::SparseArrays.SparseMatrixCSC{Float64, Int}, A::SparseArrays.SparseMatrixCSC{Int8, Int}, ref_bus_positions::Set{Int}, @@ -119,33 +124,39 @@ function _get_PTDF_A_diag( n_buses = size(BA, 1) diag_ = zeros(n_branches) - # Pre-compute valid indices (non-reference buses) + # Pre-compute valid indices (non-reference buses) and their inverse map. valid_ix = setdiff(1:n_buses, ref_bus_positions) n_valid = length(valid_ix) + bus_to_valid_idx = _build_bus_to_valid_idx(n_buses, valid_ix) # Pre-allocate work arrays for efficiency ba_col = zeros(n_valid) ptdf_row = zeros(n_buses) + ba_rv = SparseArrays.rowvals(BA) + ba_nz = SparseArrays.nonzeros(BA) # For each branch, compute PTDF row and dot with incidence column for i in 1:n_branches - # Extract BA column for valid indices + # Extract BA[:, i] non-zeros into ba_col at non-ref positions. + # Sparse-only iteration — typically 2 nonzeros per arc — avoids + # the O(n_valid) scan of the full bus axis. fill!(ba_col, 0.0) - for idx in 1:n_valid - bus_idx = valid_ix[idx] - ba_col[idx] = BA[bus_idx, i] + @inbounds for k in SparseArrays.nzrange(BA, i) + valid_i = bus_to_valid_idx[ba_rv[k]] + valid_i > 0 || continue + ba_col[valid_i] = ba_nz[k] end - solve!(K, ba_col) + _solve_factorization(K, ba_col) # Map back to full bus indices fill!(ptdf_row, 0.0) - for idx in 1:n_valid + @inbounds for idx in 1:n_valid ptdf_row[valid_ix[idx]] = ba_col[idx] end # Compute diagonal element: sum of PTDF[i,j] * A[i,j] for all buses j - for j in 1:n_buses + @inbounds for j in 1:n_buses diag_[i] += ptdf_row[j] * A[i, j] end end @@ -222,6 +233,9 @@ struct with an empty cache. PSY system for which the matrix is constructed # Keyword Arguments +- `linear_solver::String = _default_linear_solver()`: Linear solver for the + ABA factorization. Options: "KLU", "AppleAccelerate". Defaults to + "AppleAccelerate" on macOS and "KLU" elsewhere. - `network_reduction::NetworkReduction`: Structure containing the details of the network reduction applied when computing the matrix - `kwargs...`: @@ -230,6 +244,7 @@ struct with an empty cache. function VirtualLODF( sys::PSY.System; dist_slack::Vector{Float64} = Float64[], + linear_solver::String = _default_linear_solver(), tol::Float64 = eps(), max_cache_size::Int = MAX_CACHE_SIZE_MiB, persistent_arcs::Vector{Tuple{Int, Int}} = Vector{Tuple{Int, Int}}(), @@ -239,6 +254,7 @@ function VirtualLODF( if length(dist_slack) != 0 @info "Distributed bus" end + solver = resolve_linear_solver(linear_solver) Ymatrix = Ybus( sys; network_reductions = network_reductions, @@ -253,7 +269,7 @@ function VirtualLODF( subnetwork_axes = make_arc_arc_subnetwork_axes(A) BA = BA_Matrix(Ymatrix) ABA = calculate_ABA_matrix(A.data, BA.data, Set(ref_bus_positions)) - K = klu_factorize(ABA) + K = _create_factorization(solver, ABA) bus_ax = get_bus_axis(A) valid_ix = setdiff(1:length(bus_ax), ref_bus_positions) @@ -281,6 +297,7 @@ function VirtualLODF( # Single scratch slot — solves serialize via `solver_lock` + `_LIBKLU_LOCK`. temp_data = [zeros(length(bus_ax))] work_ba_col = [zeros(length(valid_ix))] + bus_to_valid_idx = _build_bus_to_valid_idx(length(bus_ax), valid_ix) return VirtualLODF( K, @@ -294,6 +311,7 @@ function VirtualLODF( axes, look_up, valid_ix, + bus_to_valid_idx, temp_data, empty_cache, ReentrantLock(), @@ -341,8 +359,17 @@ function _compute_lodf_row(vlodf::VirtualLODF, row::Int)::Vector{Float64} return with_solver( vlodf.K, vlodf.work_ba_col, vlodf.temp_data, vlodf.solver_lock, ) do K_solver, work_ba_col, temp_data - @inbounds for i in eachindex(vlodf.valid_ix) - work_ba_col[i] = vlodf.BA[vlodf.valid_ix[i], row] + # Sparse-only extraction: iterate BA[:, row] non-zeros (typically + # 2 per arc) instead of scanning the full bus axis. + fill!(work_ba_col, 0.0) + BA = vlodf.BA + bus_to_valid_idx = vlodf.bus_to_valid_idx + ba_rv = SparseArrays.rowvals(BA) + ba_nz = SparseArrays.nonzeros(BA) + @inbounds for k in SparseArrays.nzrange(BA, row) + valid_i = bus_to_valid_idx[ba_rv[k]] + valid_i > 0 || continue + work_ba_col[valid_i] = ba_nz[k] end lin_solve = _solve_factorization(K_solver, work_ba_col) @@ -472,9 +499,17 @@ function _getindex_partial( return with_solver( vlodf.K, vlodf.work_ba_col, vlodf.temp_data, vlodf.solver_lock, ) do K_solver, work_ba_col, temp_data - # Steps 1-2: Compute B⁻¹(b_e · ν_e) via KLU solve. - @inbounds for i in eachindex(vlodf.valid_ix) - work_ba_col[i] = vlodf.BA[vlodf.valid_ix[i], arc_idx] + # Steps 1-2: Compute B⁻¹(b_e · ν_e) via sparse-only BA-column + # extraction + solve. + fill!(work_ba_col, 0.0) + BA = vlodf.BA + bus_to_valid_idx = vlodf.bus_to_valid_idx + ba_rv = SparseArrays.rowvals(BA) + ba_nz = SparseArrays.nonzeros(BA) + @inbounds for k in SparseArrays.nzrange(BA, arc_idx) + valid_i = bus_to_valid_idx[ba_rv[k]] + valid_i > 0 || continue + work_ba_col[valid_i] = ba_nz[k] end lin_solve = _solve_factorization(K_solver, work_ba_col) diff --git a/src/virtual_modf_calculations.jl b/src/virtual_modf_calculations.jl index cbe596087..13c18cd9e 100644 --- a/src/virtual_modf_calculations.jl +++ b/src/virtual_modf_calculations.jl @@ -45,6 +45,10 @@ cache and skips the recomputation. Tuple of lookup dictionaries for indexing. - `valid_ix::Vector{Int}`: Indices of non-reference buses. +- `bus_to_valid_idx::Vector{Int}`: + Inverse of `valid_ix`: `bus_to_valid_idx[b]` is the position of bus + `b` inside `valid_ix`, or 0 if `b` is a reference bus. Lets the + Woodbury kernel iterate the nonzeros of a `BA` column directly. - `contingency_cache::Dict{Base.UUID, ContingencySpec}`: Resolved contingencies keyed by outage UUID. - `woodbury_cache::Dict{NetworkModification, WoodburyFactors}`: @@ -85,6 +89,7 @@ struct VirtualMODF{Ax <: NTuple{2, Vector}, L <: NTuple{2, Dict}, K} <: axes::Ax lookup::L valid_ix::Vector{Int} + bus_to_valid_idx::Vector{Int} contingency_cache::Dict{Base.UUID, ContingencySpec} woodbury_cache::Dict{NetworkModification, WoodburyFactors} row_caches::Dict{NetworkModification, RowCache} @@ -134,6 +139,7 @@ function _compute_woodbury_factors( mat.BA, mat.arc_susceptances, mat.valid_ix, + mat.bus_to_valid_idx, modifications, ) end @@ -157,6 +163,7 @@ function _apply_woodbury_correction( mat.BA, mat.arc_susceptances, mat.valid_ix, + mat.bus_to_valid_idx, monitored_idx, wf, ) @@ -206,6 +213,9 @@ Outage supplemental attributes found in the system. # Keyword Arguments - `dist_slack::Vector{Float64}`: Distributed slack weights (default: empty) +- `linear_solver::String = _default_linear_solver()`: Linear solver for the + ABA factorization. Options: "KLU", "AppleAccelerate". Defaults to + "AppleAccelerate" on macOS and "KLU" elsewhere. - `tol::Float64`: Tolerance for row sparsification (default: eps()) - `max_cache_size::Int`: Max cache size in MiB per contingency (default: MAX_CACHE_SIZE_MiB) - `network_reductions::Vector{NetworkReduction}`: Network reductions to apply @@ -213,6 +223,7 @@ Outage supplemental attributes found in the system. function VirtualMODF( sys::PSY.System; dist_slack::Vector{Float64} = Float64[], + linear_solver::String = _default_linear_solver(), tol::Float64 = eps(), max_cache_size::Int = MAX_CACHE_SIZE_MiB, network_reductions::Vector{NetworkReduction} = NetworkReduction[], @@ -222,6 +233,7 @@ function VirtualMODF( if length(dist_slack) != 0 @info "Distributed bus" end + solver = resolve_linear_solver(linear_solver) # Build network matrices (same path as VirtualLODF) Ymatrix = Ybus(sys; network_reductions = network_reductions, kwargs...) @@ -238,9 +250,10 @@ function VirtualMODF( BA = BA_Matrix(Ymatrix) ABA = calculate_ABA_matrix(A.data, BA.data, Set(ref_bus_positions)) - K = klu_factorize(ABA) + K = _create_factorization(solver, ABA) valid_ix = setdiff(1:length(bus_ax), ref_bus_positions) + bus_to_valid_idx = _build_bus_to_valid_idx(length(bus_ax), valid_ix) # PTDF diagonal precompute runs serially on the dispatcher thread. PTDF_A_diag = _get_PTDF_A_diag(K, BA.data, A.data, Set(ref_bus_positions)) @@ -264,6 +277,7 @@ function VirtualMODF( axes, look_up, valid_ix, + bus_to_valid_idx, Dict{Base.UUID, ContingencySpec}(), Dict{NetworkModification, WoodburyFactors}(), Dict{NetworkModification, RowCache}(), diff --git a/src/virtual_ptdf_calculations.jl b/src/virtual_ptdf_calculations.jl index 89b15df62..0396009d9 100644 --- a/src/virtual_ptdf_calculations.jl +++ b/src/virtual_ptdf_calculations.jl @@ -21,8 +21,8 @@ JuMP-side work (in callers) parallelizes freely. # Arguments - `K`: LU factorization of the ABA matrix. A `KLULinSolveCache{Float64}` for - the default KLU solver, or an `AppleAccelerate.AAFactorization{Float64}` - when the AppleAccelerate extension is loaded. + the default KLU solver, or an `AccelerateWrapper.AAFactorCache` when the + AppleAccelerate backend is selected on macOS. - `BA::SparseArrays.SparseMatrixCSC{Float64, Int}`: BA matrix - `ref_bus_positions::Set{Int}`: @@ -48,6 +48,11 @@ JuMP-side work (in callers) parallelizes freely. - `valid_ix::Vector{Int}`: Vector containing the row/columns indices of matrices related the buses which are not slack ones. +- `bus_to_valid_idx::Vector{Int}`: + Inverse of `valid_ix`: `bus_to_valid_idx[b]` is the position of bus + `b` inside `valid_ix`, or 0 if `b` is a reference bus. Lets the hot + path iterate the nonzeros of a `BA` column instead of scanning the + full bus axis. - `cache::RowCache`: Cache where PTDF rows are stored. - `cache_lock::ReentrantLock`: @@ -80,6 +85,7 @@ struct VirtualPTDF{Ax, L <: NTuple{2, Dict}, K} <: lookup::L temp_data::Vector{Vector{Float64}} valid_ix::Vector{Int} + bus_to_valid_idx::Vector{Int} cache::RowCache cache_lock::ReentrantLock subnetwork_axes::Dict{Int, Ax} @@ -120,8 +126,9 @@ struct with an empty cache. - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`: Dictionary of weights to be used as distributed slack bus. The distributed slack dictionary must have the same number of entries as the number of buses. -- `linear_solver::String = "KLU"`: - Linear solver to use for factorization. Options: "KLU", "AppleAccelerate" +- `linear_solver::String = _default_linear_solver()`: + Linear solver to use for factorization. Options: "KLU", "AppleAccelerate". + Defaults to "AppleAccelerate" on macOS and "KLU" elsewhere. - `tol::Float64 = eps()`: Tolerance related to sparsification and values to drop. - `max_cache_size::Int`: @@ -136,7 +143,7 @@ struct with an empty cache. function VirtualPTDF( sys::PSY.System; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), - linear_solver::String = "KLU", + linear_solver::String = _default_linear_solver(), tol::Float64 = eps(), max_cache_size::Int = MAX_CACHE_SIZE_MiB, persistent_arcs::Vector{Tuple{Int, Int}} = Vector{Tuple{Int, Int}}(), @@ -172,8 +179,9 @@ function _create_factorization( ::AppleAccelerateSolver, ABA::SparseArrays.SparseMatrixCSC{Float64, Int}, ) - _has_apple_accelerate_ext() || error(_apple_accelerate_install_error()) - return _create_apple_accelerate_factorization(ABA) + _has_apple_accelerate_backend() || error(_apple_accelerate_unavailable_error()) + # ABA = Aᵀ B A is symmetric by construction; skip the structural check. + return AccelerateWrapper.aa_factorize(ABA; check_symmetry = false) end function _create_factorization( @@ -197,8 +205,9 @@ The return is a VirtualPTDF struct with an empty cache. - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`: Dictionary of weights to be used as distributed slack bus. The distributed slack dictionary must have the same number of entries as the number of buses. -- `linear_solver::String = "KLU"`: - Linear solver to use for factorization. Options: "KLU", "AppleAccelerate" +- `linear_solver::String = _default_linear_solver()`: + Linear solver to use for factorization. Options: "KLU", "AppleAccelerate". + Defaults to "AppleAccelerate" on macOS and "KLU" elsewhere. - `tol::Float64 = eps()`: Tolerance related to sparsification and values to drop. - `max_cache_size::Int`: @@ -209,7 +218,7 @@ The return is a VirtualPTDF struct with an empty cache. function VirtualPTDF( ybus::Ybus; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), - linear_solver::String = "KLU", + linear_solver::String = _default_linear_solver(), tol::Float64 = eps(), max_cache_size::Int = MAX_CACHE_SIZE_MiB, persistent_arcs::Vector{Tuple{Int, Int}} = Vector{Tuple{Int, Int}}(), @@ -261,6 +270,7 @@ function VirtualPTDF( # `Vector{Vector{Float64}}` so `with_solver`'s callback signature # stays uniform across solver backends. valid_ix = setdiff(1:length(bus_ax), ref_bus_positions) + bus_to_valid_idx = _build_bus_to_valid_idx(length(bus_ax), valid_ix) temp_data = [zeros(length(bus_ax))] work_ba_col = [zeros(length(valid_ix))] @@ -277,6 +287,7 @@ function VirtualPTDF( look_up, temp_data, valid_ix, + bus_to_valid_idx, empty_cache, ReentrantLock(), subnetwork_axes, @@ -321,17 +332,18 @@ if isdefined(Base, :print_array) # 0.7 and later Base.print_array(io::IO, X::VirtualPTDF) = "VirtualPTDF" end -# Helper function to solve with different factorization types. The -# `KLULinSolveCache` overload solves in place (zero-allocation hot path); -# the generic fallback delegates to `\` and is extended by the -# AppleAccelerate extension for `AAFactorization`. +# Helper function to solve with different factorization types. Both +# overloads solve in place (zero-allocation hot path). The KLU and Apple +# Accelerate backends are the only solvers supported here; adding a new +# backend requires extending this method. function _solve_factorization(K::KLULinSolveCache{Float64}, b::Vector{Float64}) solve!(K, b) return b end -function _solve_factorization(K, b::Vector{Float64}) - return K \ b +function _solve_factorization(K::AAFactorCache, b::Vector{Float64}) + AccelerateWrapper.solve!(K, b) + return b end function _compute_ptdf_row(vptdf::VirtualPTDF, row::Int)::Vector{Float64} @@ -350,12 +362,23 @@ function _compute_ptdf_row(vptdf::VirtualPTDF, row::Int)::Vector{Float64} return with_solver( vptdf.K, vptdf.work_ba_col, vptdf.temp_data, vptdf.solver_lock, ) do K_solver, work_ba_col, temp_data - valid_ix = vptdf.valid_ix - @inbounds for i in eachindex(valid_ix) - work_ba_col[i] = vptdf.BA[valid_ix[i], row] + # Extract BA[:, row] non-zeros into work_ba_col at non-ref-bus + # positions. Iterates only the nonzeros of the BA column (typically + # 2 per arc) instead of scanning the full bus axis and bisecting + # the CSC for each entry. + fill!(work_ba_col, 0.0) + BA = vptdf.BA + bus_to_valid_idx = vptdf.bus_to_valid_idx + ba_rv = SparseArrays.rowvals(BA) + ba_nz = SparseArrays.nonzeros(BA) + @inbounds for k in SparseArrays.nzrange(BA, row) + valid_i = bus_to_valid_idx[ba_rv[k]] + valid_i > 0 || continue + work_ba_col[valid_i] = ba_nz[k] end lin_solve = _solve_factorization(K_solver, work_ba_col) fill!(temp_data, 0.0) + valid_ix = vptdf.valid_ix @inbounds for i in eachindex(valid_ix) temp_data[valid_ix[i]] = lin_solve[i] end diff --git a/src/woodbury_kernel.jl b/src/woodbury_kernel.jl index ad29ba3b4..7f183e125 100644 --- a/src/woodbury_kernel.jl +++ b/src/woodbury_kernel.jl @@ -83,6 +83,7 @@ function _compute_woodbury_factors_impl( BA::SparseArrays.SparseMatrixCSC{Float64, Int}, arc_sus::Vector{Float64}, valid_ix::Vector{Int}, + bus_to_valid_idx::Vector{Int}, modifications::Tuple{Vararg{ArcModification}}, )::WoodburyFactors M = length(modifications) @@ -97,13 +98,19 @@ function _compute_woodbury_factors_impl( # Compute Z[:,j] = B⁻¹ν_j for each modified arc Z = Matrix{Float64}(undef, n_bus, M) + ba_rv_outer = SparseArrays.rowvals(BA) + ba_nz_outer = SparseArrays.nonzeros(BA) for (j, mod) in enumerate(modifications) e = mod.arc_index b_e = arc_sus[e] - @inbounds for i in eachindex(valid_ix) - work_ba_col[i] = BA[valid_ix[i], e] + # Sparse-only extraction of BA[:, e] into work_ba_col. + fill!(work_ba_col, 0.0) + @inbounds for k in SparseArrays.nzrange(BA, e) + valid_i = bus_to_valid_idx[ba_rv_outer[k]] + valid_i > 0 || continue + work_ba_col[valid_i] = ba_nz_outer[k] end lin_solve = _solve_factorization(K, work_ba_col) @@ -159,6 +166,7 @@ function _apply_woodbury_correction_impl( BA::SparseArrays.SparseMatrixCSC{Float64, Int}, arc_sus::Vector{Float64}, valid_ix::Vector{Int}, + bus_to_valid_idx::Vector{Int}, monitored_idx::Int, wf::WoodburyFactors, )::Vector{Float64} @@ -177,10 +185,15 @@ function _apply_woodbury_correction_impl( return zeros(n_bus) end - # z_m = B⁻¹ν_m / b_mon_pre via KLU solve on BA column + # z_m = B⁻¹ν_m / b_mon_pre via sparse-only BA-column extraction + solve. b_mon_pre = arc_sus[monitored_idx] - @inbounds for i in eachindex(valid_ix) - work_ba_col[i] = BA[valid_ix[i], monitored_idx] + fill!(work_ba_col, 0.0) + ba_rv_mon = SparseArrays.rowvals(BA) + ba_nz_mon = SparseArrays.nonzeros(BA) + @inbounds for k in SparseArrays.nzrange(BA, monitored_idx) + valid_i = bus_to_valid_idx[ba_rv_mon[k]] + valid_i > 0 || continue + work_ba_col[valid_i] = ba_nz_mon[k] end lin_solve = _solve_factorization(K, work_ba_col) @@ -226,7 +239,8 @@ function _compute_woodbury_factors( ) do K_solver, work_ba_col, temp_data _compute_woodbury_factors_impl( K_solver, work_ba_col, temp_data, - mat.BA, mat.arc_susceptances, mat.valid_ix, modifications, + mat.BA, mat.arc_susceptances, mat.valid_ix, mat.bus_to_valid_idx, + modifications, ) end end @@ -241,7 +255,8 @@ function _apply_woodbury_correction( ) do K_solver, work_ba_col, temp_data _apply_woodbury_correction_impl( K_solver, work_ba_col, temp_data, - mat.BA, mat.arc_susceptances, mat.valid_ix, monitored_idx, wf, + mat.BA, mat.arc_susceptances, mat.valid_ix, mat.bus_to_valid_idx, + monitored_idx, wf, ) end end diff --git a/test/PowerNetworkMatricesTests.jl b/test/PowerNetworkMatricesTests.jl index b3857fe30..d3036822c 100644 --- a/test/PowerNetworkMatricesTests.jl +++ b/test/PowerNetworkMatricesTests.jl @@ -21,7 +21,7 @@ import Aqua Aqua.test_unbound_args(PowerNetworkMatrices) Aqua.test_undefined_exports(PowerNetworkMatrices) Aqua.test_ambiguities(PowerNetworkMatrices) -Aqua.test_stale_deps(PowerNetworkMatrices; ignore = [:AppleAccelerate, :Pardiso]) +Aqua.test_stale_deps(PowerNetworkMatrices; ignore = [:Pardiso]) Aqua.test_deps_compat(PowerNetworkMatrices) Aqua.find_persistent_tasks_deps(PowerNetworkMatrices) Aqua.test_persistent_tasks(PowerNetworkMatrices) diff --git a/test/runtests.jl b/test/runtests.jl index 1afa42b7c..ae21ff04f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,12 +1,11 @@ -# load system-specific packages before PNM: ensures relevant PNM extensions will be loaded. +# Load system-specific packages before PNM so relevant PNM extensions +# precompile. The Apple Accelerate backend is now built in to PNM and does +# not need a trigger package; only the MKL/Pardiso extension is gated on a +# `using` call. @static if (Sys.ARCH === :x86_64 || Sys.ARCH === :i686) && !Sys.isapple() import Pkg Pkg.add("Pardiso") using Pardiso -else - import Pkg - Pkg.add("AppleAccelerate") - using AppleAccelerate end using PowerNetworkMatrices diff --git a/test/test_accelerate_wrapper.jl b/test/test_accelerate_wrapper.jl new file mode 100644 index 000000000..ad6d8f5a2 --- /dev/null +++ b/test/test_accelerate_wrapper.jl @@ -0,0 +1,188 @@ +import LinearAlgebra +import SparseArrays +import Random + +const _AA_TEST_SEED = 0xABCDEF + +function _random_spd(n::Int; density::Float64 = 0.05, scale::Real = 4) + M = SparseArrays.sprandn(n, n, density) + return SparseArrays.sparse(M + M' + scale * n * LinearAlgebra.I) +end + +@testset "AccelerateWrapper: smoke" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED) + n = 200 + ABA = _random_spd(n) + + cache = PNM.AccelerateWrapper.aa_factorize(ABA) + @test PNM.AccelerateWrapper.is_factored(cache) + + b = randn(n) + x = copy(b) + PNM.AccelerateWrapper.solve!(cache, x) + @test isapprox(ABA * x, b; atol = 1e-9) + + B = SparseArrays.sprandn(n, 120, 0.03) + for j in 1:5:120 + B[:, j] .= 0 + end + SparseArrays.dropzeros!(B) + out = zeros(n, 120) + PNM.AccelerateWrapper.solve_sparse!(cache, B, out) + @test isapprox(out, ABA \ Matrix(B); atol = 1e-9) +end + +@testset "AccelerateWrapper: _create_factorization dispatch" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED) + n = 200 + ABA = _random_spd(n) + + K_klu = PNM._create_factorization(PNM.KLUSolver(), ABA) + K_aa = PNM._create_factorization(PNM.AppleAccelerateSolver(), ABA) + @test K_klu isa PNM.KLULinSolveCache{Float64} + @test K_aa isa PNM.AAFactorCache + + b_klu = randn(n) + b_aa = copy(b_klu) + PNM._solve_factorization(K_klu, b_klu) + PNM._solve_factorization(K_aa, b_aa) + @test isapprox(b_klu, b_aa; atol = 1e-9) +end + +@testset "AccelerateWrapper: with_solver resolves to concrete method" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED) + n = 200 + ABA = _random_spd(n) + K_aa = PNM._create_factorization(PNM.AppleAccelerateSolver(), ABA) + + work = [zeros(n)] + tmp = [zeros(n)] + lk = ReentrantLock() + b = randn(n) + copyto!(work[1], b) + + result = PNM.with_solver(K_aa, work, tmp, lk) do K, wba, _td + PNM._solve_factorization(K, wba) + return copy(wba) + end + @test isapprox(ABA * result, b; atol = 1e-9) + + m = which( + PNM.with_solver, + ( + typeof(identity), + PNM.AAFactorCache, + Vector{Vector{Float64}}, + Vector{Vector{Float64}}, + ReentrantLock, + ), + ) + @test Base.unwrap_unionall(m.sig).parameters[3] === PNM.AAFactorCache +end + +@testset "AccelerateWrapper: structural-symmetry check" begin + PNM._has_apple_accelerate_backend() || return + n = 8 + # Build a clearly non-symmetric matrix with a missing mirror entry. + A = SparseArrays.sparse(Float64.(LinearAlgebra.I(n))) + A[2, 5] = 1.0 + SparseArrays.dropzeros!(A) + + @test_throws ArgumentError PNM.AAFactorCache(A) + @test_throws ArgumentError PNM.AccelerateWrapper.aa_factorize(A) + + # Bypass: caller asserts symmetry; AAFactorCache returns without throwing. + cache = PNM.AAFactorCache(A; check_symmetry = false) + @test cache isa PNM.AAFactorCache + + # symbolic_factor! must also honor the cache's check_symmetry flag. + Random.seed!(_AA_TEST_SEED) + ABA = _random_spd(n; density = 0.4) + cache_strict = PNM.AAFactorCache(ABA) + @test_throws ArgumentError PNM.AccelerateWrapper.symbolic_factor!(cache_strict, A) +end + +@testset "AccelerateWrapper: KLU vs AA per-column solve parity" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED) + n = 200 + nbranches = 40 + ABA = _random_spd(n) + BA = SparseArrays.sprandn(n, nbranches, 0.03) + + K_klu = PNM._create_factorization(PNM.KLUSolver(), ABA) + K_aa = PNM._create_factorization(PNM.AppleAccelerateSolver(), ABA) + + work_klu = zeros(n) + work_aa = zeros(n) + for col in 1:nbranches + work_klu .= Vector(BA[:, col]) + copyto!(work_aa, work_klu) + PNM._solve_factorization(K_klu, work_klu) + PNM._solve_factorization(K_aa, work_aa) + @test isapprox(work_klu, work_aa; atol = 1e-9) + end +end + +@testset "AccelerateWrapper: solve_w_refinement matches direct solve on AA cache" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED) + n = 60 + ABA = _random_spd(n) + cache = PNM.AccelerateWrapper.aa_factorize(ABA; check_symmetry = false) + + x_true = randn(n) + b = ABA * x_true + X = PNM.solve_w_refinement(cache, ABA, b) + @test isapprox(X, x_true; atol = 1e-10) +end + +@testset "AccelerateWrapper: solve_w_refinement! recovers ill-conditioned AA solve" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED + 1) + n = 80 + # Symmetric ill-conditioned tridiagonal — back-solve leaves residual that + # refinement closes. Less pathological than the KLU equivalent because + # AA's pivoted-LDLT has a higher pivot floor than KLU's BTF + LU on a + # nearly-defective tridiagonal. + A = SparseArrays.spdiagm( + 0 => fill(1.0, n), + 1 => fill(1.0 - 1e-6, n - 1), + -1 => fill(1.0 - 1e-6, n - 1), + ) + cache = PNM.AccelerateWrapper.aa_factorize(A; check_symmetry = false) + + x_true = randn(n) + b = A * x_true + X = zeros(n) + PNM.solve_w_refinement!(cache, A, X, b; tol = 1e-12) + @test isapprox(X, x_true; atol = 1e-8) +end + +@testset "AccelerateWrapper: solve_w_refinement requires a factored AA cache" begin + PNM._has_apple_accelerate_backend() || return + n = 12 + A = SparseArrays.sparse(Float64.(LinearAlgebra.I(n))) + cache = PNM.AAFactorCache(A; check_symmetry = false) # not factored + b = randn(n) + @test_throws ErrorException PNM.solve_w_refinement(cache, A, b) +end + +@testset "AccelerateWrapper: solve_w_refinement KLU vs AA parity" begin + PNM._has_apple_accelerate_backend() || return + Random.seed!(_AA_TEST_SEED + 2) + n = 50 + A = _random_spd(n) + x_true = randn(n) + b = A * x_true + + K_klu = PNM.klu_factorize(A) + K_aa = PNM.AccelerateWrapper.aa_factorize(A; check_symmetry = false) + + X_klu = PNM.solve_w_refinement(K_klu, A, b) + X_aa = PNM.solve_w_refinement(K_aa, A, b) + @test isapprox(X_klu, X_aa; atol = 1e-10) +end diff --git a/test/test_klu_wrapper.jl b/test/test_klu_wrapper.jl index a4be6d48c..902c4a088 100644 --- a/test/test_klu_wrapper.jl +++ b/test/test_klu_wrapper.jl @@ -1,4 +1,6 @@ import SparseArrays +import LinearAlgebra +import Random @testset "KLU wrapper: real round-trip and refactor" begin n = 50 @@ -232,3 +234,168 @@ end alloc_warm = @allocated PNM.solve_sparse!(cache, B, out; block = 32) @test alloc_warm < n * size(B, 2) * sizeof(Float64) ÷ 4 end + +# --------------------------------------------------------------------------- +# Int32 index-type path +# --------------------------------------------------------------------------- + +@testset "KLU wrapper: Int32 real round-trip and refactor" begin + n = 50 + rng_vals = collect(1.0:n) + A64 = SparseArrays.spdiagm(0 => rng_vals .+ 1.0, + 1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1)) + A = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64) + x = collect(1.0:n) + b = A64 * x + + cache = PNM.klu_factorize(A) + @test PNM.is_factored(cache) + @test cache isa PNM.KLUWrapper.KLULinSolveCache{Float64, Int32} + @test size(cache) == (n, n) + + y = copy(b) + PNM.solve!(cache, y) + @test isapprox(y, x, atol = 1e-10) + + # Refactor with new values, same pattern. + A2_64 = SparseArrays.spdiagm(0 => rng_vals .+ 2.0, + 1 => fill(0.2, n - 1), -1 => fill(0.2, n - 1)) + A2 = SparseArrays.SparseMatrixCSC{Float64, Int32}(A2_64) + x2 = randn(n) + b2 = A2_64 * x2 + PNM.numeric_refactor!(cache, A2) + y2 = copy(b2) + PNM.solve!(cache, y2) + @test isapprox(y2, x2, atol = 1e-9) +end + +@testset "KLU wrapper: Int32 vs Int64 solves agree bit-for-bit" begin + # KLU is deterministic; identical entries and identical permutation give + # identical solves regardless of the C entry point's index width. + Random.seed!(0xC0FFEE) + n = 200 + A64 = SparseArrays.spdiagm(0 => 4.0 .+ rand(n), 1 => 0.1 .* rand(n - 1), + -1 => 0.1 .* rand(n - 1)) + A32 = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64) + + c64 = PNM.klu_factorize(A64) + c32 = PNM.klu_factorize(A32) + @test c64 isa PNM.KLUWrapper.KLULinSolveCache{Float64, Int64} + @test c32 isa PNM.KLUWrapper.KLULinSolveCache{Float64, Int32} + + b = randn(n) + y64 = copy(b) + y32 = copy(b) + PNM.solve!(c64, y64) + PNM.solve!(c32, y32) + @test y64 == y32 # bit-equal +end + +@testset "KLU wrapper: Int32 solve_sparse! agrees with dense reference" begin + Random.seed!(0xBEEF) + n = 80 + A_dense = Matrix(SparseArrays.sprandn(n, n, 0.1)) + 4 * LinearAlgebra.I + A = SparseArrays.SparseMatrixCSC{Float64, Int32}(SparseArrays.sparse(A_dense)) + cache = PNM.klu_factorize(A) + + B_dense = SparseArrays.sprandn(n, 20, 0.1) + B = SparseArrays.SparseMatrixCSC{Float64, Int32}(B_dense) + + out = PNM.solve_sparse(cache, B) + @test isapprox(out, A_dense \ Matrix(B_dense); atol = 1e-9) +end + +@testset "KLU wrapper: Int32 numeric_refactor! is allocation-free" begin + n = 100 + A64 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0, + 1 => fill(0.05, n - 1), -1 => fill(0.05, n - 1)) + A = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64) + cache = PNM.klu_factorize(A) + + A2_64 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.5, + 1 => fill(0.07, n - 1), -1 => fill(0.07, n - 1)) + A2 = SparseArrays.SparseMatrixCSC{Float64, Int32}(A2_64) + + # Warm up specialization on the Int32 path. + PNM.numeric_refactor!(cache, A2) + alloc = @allocated PNM.numeric_refactor!(cache, A2) + @test alloc == 0 +end + +# --------------------------------------------------------------------------- +# Performance-knob surface +# --------------------------------------------------------------------------- + +@testset "KLU wrapper: sort_factors!/condest!/rcond! work on both index types" begin + n = 60 + A64 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0, + 1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1)) + A32 = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64) + + for cache in (PNM.klu_factorize(A64), PNM.klu_factorize(A32)) + PNM.KLUWrapper.sort_factors!(cache) + # Subsequent solve still returns the right answer. + b = randn(n) + y = copy(b) + PNM.solve!(cache, y) + @test isapprox(y, A64 \ b, atol = 1e-10) + + c = PNM.KLUWrapper.condest!(cache) + r = PNM.KLUWrapper.rcond!(cache) + @test c > 0 + @test 0 < r <= 1.0 + end +end + +# --------------------------------------------------------------------------- +# Iterative refinement +# --------------------------------------------------------------------------- + +@testset "KLU wrapper: solve_w_refinement matches direct solve on well-conditioned A" begin + Random.seed!(2) + n = 40 + A = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0, + 1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1)) + cache = PNM.klu_factorize(A) + + x_true = randn(n) + b = A * x_true + X = PNM.solve_w_refinement(cache, A, b) + @test isapprox(X, x_true, atol = 1e-12) +end + +@testset "KLU wrapper: solve_w_refinement! recovers ill-conditioned solve" begin + # Build a noticeably ill-conditioned tridiagonal so a single back-solve + # leaves residual headroom that refinement actually closes. + Random.seed!(3) + n = 50 + A = SparseArrays.spdiagm(0 => fill(1.0, n), 1 => fill(1.0 - 1e-10, n - 1)) + cache = PNM.klu_factorize(A) + + x_true = randn(n) + b = A * x_true + X = zeros(n) + PNM.solve_w_refinement!(cache, A, X, b; tol = 1e-12) + @test isapprox(X, x_true, atol = 1e-8) +end + +@testset "KLU wrapper: solve_w_refinement requires a factored cache" begin + n = 10 + A = SparseArrays.spdiagm(0 => 1.0:n) + cache = PNM.KLUWrapper.KLULinSolveCache(A) # not factored + b = randn(n) + @test_throws ErrorException PNM.solve_w_refinement(cache, A, b) +end + +@testset "KLU wrapper: solve_w_refinement works on Int32 cache" begin + Random.seed!(4) + n = 30 + A64 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0, + 1 => fill(0.05, n - 1), -1 => fill(0.05, n - 1)) + A = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64) + cache = PNM.klu_factorize(A) + x_true = randn(n) + b = A64 * x_true + X = PNM.solve_w_refinement(cache, A, b) + @test isapprox(X, x_true, atol = 1e-10) +end diff --git a/test/test_lodf.jl b/test/test_lodf.jl index a5a52d614..bc7d4a6a0 100644 --- a/test/test_lodf.jl +++ b/test/test_lodf.jl @@ -56,7 +56,7 @@ @test isapprox(sum(total_error3), 0.0, atol = 1e-3) end - if PowerNetworkMatrices._has_apple_accelerate_ext() + if PowerNetworkMatrices._has_apple_accelerate_backend() L5NS_from_ptdf4 = LODF(A, P5; linear_solver = "AppleAccelerate") @test getindex(L5NS_from_ptdf4, "5", "6") - -0.3071 <= 1e-4 total_error4 = 0.0 diff --git a/test/test_network_modification.jl b/test/test_network_modification.jl index 7bdb278ad..2b158fd4a 100644 --- a/test/test_network_modification.jl +++ b/test/test_network_modification.jl @@ -258,8 +258,8 @@ end # serial baseline. The KLU path is exercised by the threaded testsets in # `test/test_virtual_modf.jl`; this complements that coverage on the # AppleAccelerate path. - if !PowerNetworkMatrices._has_apple_accelerate_ext() - @info "Skipping: AppleAccelerate extension not loaded." + if !PowerNetworkMatrices._has_apple_accelerate_backend() + @info "Skipping: AppleAccelerate backend not available on this platform." return end if Threads.nthreads() < 2 diff --git a/test/test_partial_lodf.jl b/test/test_partial_lodf.jl index 53363593f..e34eb8d0a 100644 --- a/test/test_partial_lodf.jl +++ b/test/test_partial_lodf.jl @@ -89,7 +89,9 @@ end @testset "Partial LODF: half-susceptance matches rebuilt ground truth" begin sys5 = PSB.build_system(PSB.PSITestSystems, "c_sys5") - vlodf = VirtualLODF(sys5) + # Pin KLU here: the ground-truth path below calls `PNM.solve!` directly on + # `vlodf.K`, which is dispatched on `KLULinSolveCache` only. + vlodf = VirtualLODF(sys5; linear_solver = "KLU") e = 1 # test arc index b_e = vlodf.arc_susceptances[e] diff --git a/test/test_powerflow_matrix_types.jl b/test/test_powerflow_matrix_types.jl index 870f650c5..33e634208 100644 --- a/test/test_powerflow_matrix_types.jl +++ b/test/test_powerflow_matrix_types.jl @@ -2,7 +2,7 @@ sys = PSB.build_system(PSB.PSITestSystems, "c_sys5") @testset "DC_ABA_Matrix_Factorized" begin - if PNM._has_apple_accelerate_ext() + if PNM._has_apple_accelerate_backend() M = ABA_Matrix(sys; factorize = true) @test M isa PNM.DC_ABA_Matrix_Factorized end @@ -14,7 +14,7 @@ end @testset "DC_PTDF_Matrix" begin - if PNM._has_apple_accelerate_ext() + if PNM._has_apple_accelerate_backend() @test PTDF(sys; linear_solver = "AppleAccelerate") isa PNM.DC_PTDF_Matrix end @test PTDF(sys; linear_solver = "KLU") isa PNM.DC_PTDF_Matrix @@ -22,7 +22,7 @@ end @testset "DC_vPTDF_Matrix" begin - if PNM._has_apple_accelerate_ext() + if PNM._has_apple_accelerate_backend() @test VirtualPTDF(sys; linear_solver = "AppleAccelerate") isa PNM.DC_vPTDF_Matrix end diff --git a/test/test_ptdf.jl b/test/test_ptdf.jl index 749cbf7a0..fb427491f 100644 --- a/test/test_ptdf.jl +++ b/test/test_ptdf.jl @@ -8,8 +8,8 @@ @info "Skipped MKLPardiso tests (extension not loaded)" continue end - if !PowerNetworkMatrices._has_apple_accelerate_ext() && solver == "AppleAccelerate" - @info "Skipped AppleAccelerate tests (extension not loaded)" + if !PowerNetworkMatrices._has_apple_accelerate_backend() && solver == "AppleAccelerate" + @info "Skipped AppleAccelerate tests (backend unavailable on this platform)" continue end sys5 = PSB.build_system(PSB.PSITestSystems, "c_sys5") diff --git a/test/test_virtual_lodf.jl b/test/test_virtual_lodf.jl index 1c70d05d6..ad4b46d58 100644 --- a/test/test_virtual_lodf.jl +++ b/test/test_virtual_lodf.jl @@ -147,3 +147,29 @@ end end @test test_value end + +@testset "Test Virtual LODF with Apple Accelerate" begin + if !PNM._has_apple_accelerate_backend() + @info "Skipped AppleAccelerate VirtualLODF tests (backend unavailable on this platform)" + else + sys = PSB.build_system(PSB.PSITestSystems, "c_sys14") + + vlodf_aa = VirtualLODF(sys; linear_solver = "AppleAccelerate") + vlodf_klu = VirtualLODF(sys; linear_solver = "KLU") + + @test contains(string(typeof(vlodf_aa.K)), "AAFactorCache") + @test vlodf_klu.K isa PNM.KLULinSolveCache{Float64} + + arc_axis = PNM.get_arc_axis(vlodf_aa) + @test arc_axis == PNM.get_arc_axis(vlodf_klu) + for arc in arc_axis + row_aa = vlodf_aa[arc, :] + row_klu = vlodf_klu[arc, :] + @test isapprox(row_aa, row_klu, atol = 1e-9) + end + + # macOS default should resolve to AppleAccelerate. + vlodf_default = VirtualLODF(sys) + @test contains(string(typeof(vlodf_default.K)), "AAFactorCache") + end +end diff --git a/test/test_virtual_modf.jl b/test/test_virtual_modf.jl index 4dc9464c8..d97c72ae0 100644 --- a/test/test_virtual_modf.jl +++ b/test/test_virtual_modf.jl @@ -436,6 +436,39 @@ end end end +@testset "VirtualMODF with Apple Accelerate backend matches KLU" begin + if !PNM._has_apple_accelerate_backend() + @info "Skipped AppleAccelerate VirtualMODF tests (backend unavailable on this platform)" + else + sys, _ = _build_c_sys14_with_outages() + + vmodf_aa = VirtualMODF(sys; linear_solver = "AppleAccelerate") + vmodf_klu = VirtualMODF(sys; linear_solver = "KLU") + + # Factorization should be the AA cache type. + @test contains(string(typeof(vmodf_aa.K)), "AAFactorCache") + @test vmodf_klu.K isa PNM.KLULinSolveCache{Float64} + + registered_aa = get_registered_contingencies(vmodf_aa) + registered_klu = get_registered_contingencies(vmodf_klu) + @test !isempty(registered_aa) + @test keys(registered_aa) == keys(registered_klu) + + # Compare post-contingency rows for every registered contingency + # against the KLU build, sweeping all monitored arcs. + arc_axis = PNM.get_arc_axis(vmodf_aa) + @test arc_axis == PNM.get_arc_axis(vmodf_klu) + for (uuid, ctg_aa) in registered_aa + ctg_klu = registered_klu[uuid] + for arc in arc_axis + row_aa = vmodf_aa[arc, ctg_aa] + row_klu = vmodf_klu[arc, ctg_klu] + @test isapprox(row_aa, row_klu, atol = 1e-9) + end + end + end +end + @testset "VirtualMODF concurrent getindex on the SAME (arc, ctg) is consistent" begin # Complements the previous testset: there, each (arc, ctg) pair appears # once in the work list, so only the `woodbury_cache` first-call race is diff --git a/test/test_virtual_ptdf.jl b/test/test_virtual_ptdf.jl index 5993d4d03..83d7ebb9e 100644 --- a/test/test_virtual_ptdf.jl +++ b/test/test_virtual_ptdf.jl @@ -1,8 +1,8 @@ # if it fails, we don't want the terminal to be flooded with errors, therefore failfast=true @testset "Test Virtual PTDF matrices" for solver in ("KLU", "AppleAccelerate") - if !PowerNetworkMatrices._has_apple_accelerate_ext() && solver == "AppleAccelerate" - @info "Skipped AppleAccelerate tests (extension not loaded)" + if !PowerNetworkMatrices._has_apple_accelerate_backend() && solver == "AppleAccelerate" + @info "Skipped AppleAccelerate tests (backend unavailable on this platform)" continue end sys = PSB.build_system( @@ -185,8 +185,8 @@ end end @testset "Test Virtual PTDF with Apple Accelerate" begin - if !PowerNetworkMatrices._has_apple_accelerate_ext() - @info "Skipped AppleAccelerate tests (extension not loaded)" + if !PowerNetworkMatrices._has_apple_accelerate_backend() + @info "Skipped AppleAccelerate tests (backend unavailable on this platform)" else # Test VirtualPTDF with Apple Accelerate solver sys = PSB.build_system(PSB.PSITestSystems, "c_sys14") @@ -194,8 +194,8 @@ end # Create VirtualPTDF with AppleAccelerate vptdf_aa = VirtualPTDF(sys; linear_solver = "AppleAccelerate") - # Verify the factorization type is AAFactorization - @test contains(string(typeof(vptdf_aa.K)), "AAFactorization") + # Verify the factorization type is AAFactorCache + @test contains(string(typeof(vptdf_aa.K)), "AAFactorCache") # Create reference VirtualPTDF with KLU vptdf_klu = VirtualPTDF(sys; linear_solver = "KLU") @@ -209,7 +209,7 @@ end # Test with tolerance vptdf_aa_tol = VirtualPTDF(sys; linear_solver = "AppleAccelerate", tol = 1e-2) - @test contains(string(typeof(vptdf_aa_tol.K)), "AAFactorization") + @test contains(string(typeof(vptdf_aa_tol.K)), "AAFactorCache") # Test with distributed slack buscount = length(PSY.get_available_components(PSY.ACBus, sys)) @@ -217,7 +217,7 @@ end dist_slack = Dict(i => dist_slack_factor for i in 1:buscount) vptdf_aa_slack = VirtualPTDF(sys; linear_solver = "AppleAccelerate", dist_slack = dist_slack) - @test contains(string(typeof(vptdf_aa_slack.K)), "AAFactorization") + @test contains(string(typeof(vptdf_aa_slack.K)), "AAFactorCache") # Compare with KLU for distributed slack case vptdf_klu_slack = VirtualPTDF(sys; linear_solver = "KLU", dist_slack = dist_slack)