diff --git a/Project.toml b/Project.toml
index fcd1eee47..f853d714c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -15,15 +15,12 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 SuiteSparse_jll = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
 
 [weakdeps]
-AppleAccelerate = "13e28ba4-7ad8-5781-acae-3021b1ed3924"
 Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2"
 
 [extensions]
-AppleAccelerateExt = "AppleAccelerate"
 MKLPardisoExt = "Pardiso"
 
 [compat]
-AppleAccelerate = "^0.4"
 DataStructures = "^0.19"
 DocStringExtensions = "~0.8, ~0.9"
 HDF5 = "0.17"
diff --git a/docs/src/reference/internals.md b/docs/src/reference/internals.md
index cf5b63078..e3cc12b85 100644
--- a/docs/src/reference/internals.md
+++ b/docs/src/reference/internals.md
@@ -14,3 +14,14 @@ symbols are exported from `PowerNetworkMatrices`.
 ```@autodocs
 Modules = [PowerNetworkMatrices.KLUWrapper]
 ```
+
+## `AccelerateWrapper`
+
+`PowerNetworkMatrices.AccelerateWrapper` is a thin, allocation-aware wrapper over Apple's
+`libSparse.dylib` (provided by the system Accelerate framework) used internally for sparse
+linear solves on macOS. Non-Apple builds load stub fallbacks that throw on use. None of
+these symbols are exported from `PowerNetworkMatrices`.
+
+```@autodocs
+Modules = [PowerNetworkMatrices.AccelerateWrapper]
+```
diff --git a/ext/AppleAccelerateExt.jl b/ext/AppleAccelerateExt.jl
deleted file mode 100644
index 8b2e48345..000000000
--- a/ext/AppleAccelerateExt.jl
+++ /dev/null
@@ -1,112 +0,0 @@
-module AppleAccelerateExt
-
-import PowerNetworkMatrices as PNM
-using AppleAccelerate
-import SparseArrays
-import LinearAlgebra
-
-# Extend the factorization creation function
-function PNM._create_apple_accelerate_factorization(ABA)
-    K = AppleAccelerate.AAFactorization(ABA)
-    AppleAccelerate.factor!(K, AppleAccelerate.SparseFactorizationLDLT)
-    return K
-end
-
-# Extend the solve function for AppleAccelerate factorizations
-function PNM._solve_factorization(
-    K::AppleAccelerate.AAFactorization{Float64},
-    b::Vector{Float64},
-)
-    return AppleAccelerate.solve(K, b)
-end
-
-"""
-Function for internal use only.
-
-Computes the PTDF matrix by means of AppleAccelerate for sparse matrices.
-
-# Arguments
-- `A::SparseArrays.SparseMatrixCSC{Int8, Int}`:
-        Incidence Matrix
-- `BA::SparseArrays.SparseMatrixCSC{Float64, Int}`:
-        BA matrix
-- `ref_bus_positions::Set{Int}`:
-        vector containing the indexes of the reference slack buses.
-- `dist_slack::Vector{Float64}`:
-        vector containing the weights for the distributed slacks.
-"""
-function PNM._calculate_PTDF_matrix_AppleAccelerate(
-    A::SparseArrays.SparseMatrixCSC{Int8, Int},
-    BA::SparseArrays.SparseMatrixCSC{Float64, Int},
-    ref_bus_positions::Set{Int},
-    dist_slack::Vector{Float64})
-    @warn "AppleAccelerate solver is experimental and may produce unexpected results. If you need high confidence use KLU"
-    linecount = size(BA, 2)
-    buscount = size(BA, 1)
-
-    ABA = PNM.calculate_ABA_matrix(A, BA, ref_bus_positions)
-    K = AppleAccelerate.AAFactorization(ABA)
-    AppleAccelerate.factor!(K, AppleAccelerate.SparseFactorizationLDLT)
-
-    # initialize matrices for evaluation
-    valid_ix = setdiff(1:buscount, ref_bus_positions)
-    PTDFm_t = zeros(buscount, linecount)
-    copyto!(PTDFm_t, BA)
-    if !isempty(dist_slack) && length(ref_bus_positions) != 1
-        error(
-            "Distributed slack is not supported for systems with multiple reference buses.",
-        )
-    elseif isempty(dist_slack) && length(ref_bus_positions) < buscount
-        PTDFm_t[valid_ix, :] = AppleAccelerate.solve(K, PTDFm_t[valid_ix, :])
-        PTDFm_t[collect(ref_bus_positions), :] .= 0.0
-        return PTDFm_t
-    elseif length(dist_slack) == buscount
-        @info "Distributed bus"
-        PTDFm_t[valid_ix, :] = AppleAccelerate.solve(K, PTDFm_t[valid_ix, :])
-        PTDFm_t[collect(ref_bus_positions), :] .= 0.0
-        slack_array = dist_slack / sum(dist_slack)
-        slack_array = reshape(slack_array, 1, buscount)
-        return PTDFm_t .- (slack_array * PTDFm_t)
-    else
-        error("Distributed bus specification doesn't match the number of buses.")
-    end
-
-    return
-end
-
-"""
-Function for internal use only.
-
-Computes the LODF matrix by means of AppleAccelerate for sparse matrices.
-
-# Arguments
-- `a::SparseArrays.SparseMatrixCSC{Int8, Int}`:
-        Incidence Matrix
-- `ptdf::Matrix{Float64}`:
-        PTDF matrix
-"""
-function PNM._calculate_LODF_matrix_AppleAccelerate(
-    a::SparseArrays.SparseMatrixCSC{Int8, Int},
-    ptdf::Matrix{Float64},
-)
-    linecount = size(ptdf, 2)
-    ptdf_denominator_t = a * ptdf
-    m_I = Int[]
-    m_V = Float64[]
-    for iline in 1:linecount
-        if (1.0 - ptdf_denominator_t[iline, iline]) < PNM.LODF_ENTRY_TOLERANCE
-            push!(m_I, iline)
-            push!(m_V, 1.0)
-        else
-            push!(m_I, iline)
-            push!(m_V, 1 - ptdf_denominator_t[iline, iline])
-        end
-    end
-    Dem_LU = AppleAccelerate.AAFactorization(SparseArrays.sparse(m_I, m_I, m_V))
-    lodf_t = AppleAccelerate.solve(Dem_LU, ptdf_denominator_t)
-    lodf_t[LinearAlgebra.diagind(lodf_t)] .= -1.0
-
-    return lodf_t
-end
-
-end # module
diff --git a/ext/MKLPardisoExt.jl b/ext/MKLPardisoExt.jl
index ab9d02e04..1f124a47d 100644
--- a/ext/MKLPardisoExt.jl
+++ b/ext/MKLPardisoExt.jl
@@ -171,28 +171,15 @@ function PNM._calculate_LODF_matrix_MKLPardiso(
     a::SparseArrays.SparseMatrixCSC{Int8, Int},
     ptdf::Matrix{Float64},
 )
+    # The demand matrix `diag(1 - PTDF·A[i,i])` is diagonal, so the
+    # "solve" is a row-wise element-wise division. The Pardiso path that
+    # used to factor it and back-solve via `_pardiso_sequential_LODF!` /
+    # `_pardiso_single_LODF!` is no longer needed for this stage.
     linecount = size(ptdf, 2)
     ptdf_denominator_t = a * ptdf
-    m_I = Int[]
-    m_V = Float64[]
-    for iline in 1:linecount
-        if (1.0 - ptdf_denominator_t[iline, iline]) < PNM.LODF_ENTRY_TOLERANCE
-            push!(m_I, iline)
-            push!(m_V, 1.0)
-        else
-            push!(m_I, iline)
-            push!(m_V, 1 - ptdf_denominator_t[iline, iline])
-        end
-    end
-    lodf_t = zeros(linecount, linecount)
-    A = SparseArrays.sparse(m_I, m_I, m_V)
-    if linecount > PNM.DEFAULT_LODF_CHUNK_SIZE
-        PNM._pardiso_sequential_LODF!(lodf_t, A, ptdf_denominator_t)
-    else
-        PNM._pardiso_single_LODF!(lodf_t, A, ptdf_denominator_t)
-    end
-    lodf_t[LinearAlgebra.diagind(lodf_t)] .= -1.0
-    return lodf_t
+    m_V = PNM._build_lodf_demand(ptdf_denominator_t, linecount)
+    PNM._apply_lodf_demand!(ptdf_denominator_t, m_V)
+    return ptdf_denominator_t
 end
 
 end # module
diff --git a/src/AccelerateWrapper/AccelerateWrapper.jl b/src/AccelerateWrapper/AccelerateWrapper.jl
new file mode 100644
index 000000000..96d4f225a
--- /dev/null
+++ b/src/AccelerateWrapper/AccelerateWrapper.jl
@@ -0,0 +1,72 @@
+"""
+    AccelerateWrapper
+
+A small, allocation-aware wrapper over Apple's `libSparse.dylib`
+(`/System/Library/Frameworks/Accelerate.framework/.../libSparse.dylib`)
+designed for the access patterns of `PowerNetworkMatrices`:
+
+- Cache the symbolic and numeric factorizations of a symmetric sparse matrix
+  (LDLT by default) and reuse them across many solves.
+- Refresh the numeric factor (`numeric_refactor!`) while keeping the symbolic
+  analysis, without re-allocating the structural arrays.
+- Solve dense and **sparse** right-hand sides in place, with the sparse path
+  packing only non-empty RHS columns into a bounded scratch block.
+- Compute `A·X` and `A·x` directly via libSparse's `SparseMultiply`.
+
+This module is intentionally lighter than the upstream `AppleAccelerate.jl`
+package: it owns no high-level Julia wrappers over libSparse, exposes the
+symbolic/numeric split directly, binds only the entry points used by PNM,
+and is compile-gated to macOS so non-Apple builds never codegen the
+`@ccall` sites.
+"""
+module AccelerateWrapper
+
+import SparseArrays
+import SparseArrays: SparseMatrixCSC, getcolptr, rowvals, nonzeros, nzrange
+import LinearAlgebra
+
+export AAFactorCache,
+    aa_factorize,
+    symbolic_factor!,
+    numeric_refactor!,
+    full_factor!,
+    full_refactor!,
+    solve!,
+    solve_sparse!,
+    solve_sparse,
+    is_factored,
+    aa_spmm!,
+    aa_spmv!
+
+@static if Sys.isapple()
+    include("libsparse_bindings.jl")
+    include("aa_cache.jl")
+    include("solve_dense.jl")
+    include("solve_sparse_rhs.jl")
+    include("spmm.jl")
+else
+    # Stub layer. Non-Apple builds never bind libSparse symbols, never codegen
+    # the `@ccall` sites, and never instantiate `SparseOpaqueFactorization`.
+    # The whole submodule reduces to these short bodies on Linux/Windows.
+    struct AAFactorCache end
+
+    _unavailable() = error(
+        "AccelerateWrapper is macOS-only (Sys.isapple() returned false). " *
+        "Use the KLU backend on non-Apple platforms.",
+    )
+
+    AAFactorCache(args...; kwargs...) = _unavailable()
+    aa_factorize(args...; kwargs...) = _unavailable()
+    symbolic_factor!(args...) = _unavailable()
+    numeric_refactor!(args...) = _unavailable()
+    full_factor!(args...) = _unavailable()
+    full_refactor!(args...) = _unavailable()
+    solve!(args...) = _unavailable()
+    solve_sparse!(args...; kwargs...) = _unavailable()
+    solve_sparse(args...; kwargs...) = _unavailable()
+    is_factored(::AAFactorCache) = false
+    aa_spmm!(args...) = _unavailable()
+    aa_spmv!(args...) = _unavailable()
+end
+
+end # module
diff --git a/src/AccelerateWrapper/aa_cache.jl b/src/AccelerateWrapper/aa_cache.jl
new file mode 100644
index 000000000..3723c62d3
--- /dev/null
+++ b/src/AccelerateWrapper/aa_cache.jl
@@ -0,0 +1,409 @@
+"""
+A cached libSparse linear solver for repeated solves against the same
+symmetric sparse matrix structure. `numeric_refactor!` and `solve!` allocate
+nothing once the cache is built.
+
+Float64 only. The factor type defaults to `SparseFactorizationLDLT` (pivoted
+Bunch-Kaufman), which matches what the old `AppleAccelerateExt.jl` used.
+
+`reuse_symbolic` controls whether `symbolic_refactor!` keeps the analysis;
+`check_pattern` adds a structural-equality check on refactor calls and is
+only consulted when reusing. `check_symmetry` enforces structural symmetry of
+the input pattern on the analysis path (constructor and `symbolic_factor!`);
+disable only when the caller can guarantee symmetry by construction.
+"""
+mutable struct AAFactorCache
+    # Apple-side 0-based, narrower-integer copies of the input CSC pattern.
+    # Strip to lower triangle once during `symbolic_factor!`; subsequent
+    # `numeric_refactor!` calls reuse these arrays as-is.
+    columnStarts::Vector{Clong}
+    rowIndices::Vector{Cint}
+    nzval::Vector{Cdouble}
+    n::Int
+    nnz_tri::Int
+    factorization_type::SparseFactorization_t
+    symbolic::SparseOpaqueSymbolicFactorization
+    numeric::SparseOpaqueFactorization_t
+    reuse_symbolic::Bool
+    check_pattern::Bool
+    check_symmetry::Bool
+    # Bounded reusable scratch for `solve_sparse!`. Lazy-grown on first call.
+    scratch::Matrix{Cdouble}
+    col_map::Vector{Int}
+end
+
+@inline _dim(cache::AAFactorCache) = cache.n
+
+Base.size(cache::AAFactorCache) = (cache.n, cache.n)
+Base.size(cache::AAFactorCache, d::Integer) = d <= 2 ? cache.n : 1
+Base.eltype(::Type{AAFactorCache}) = Cdouble
+
+"""
+    is_factored(cache::AAFactorCache) -> Bool
+
+`true` when `cache` holds a valid numeric factorization ready for `solve!` /
+`solve_sparse!`. `false` after construction (before `full_factor!`) or after
+the libSparse handles have been finalized.
+"""
+function is_factored(cache::AAFactorCache)
+    return cache.numeric.status == SparseStatusOk && cache.symbolic.status == SparseStatusOk
+end
+
+"""
+    AAFactorCache(A; reuse_symbolic=true, check_pattern=true,
+                  check_symmetry=true,
+                  factorization_type=SparseFactorizationLDLT)
+
+Build a cache for the symmetric sparse matrix `A`. Allocates the Apple-side
+structural arrays (`columnStarts`, `rowIndices`, `nzval`) sized to the lower
+triangle of `A` but does **not** factorize. Call `full_factor!` (or
+`symbolic_factor!` followed by `numeric_refactor!`) before `solve!`.
+
+A finalizer frees libSparse handles on GC; call `Base.finalize(cache)` to
+release them eagerly.
+
+`check_symmetry=true` (default) verifies that `A`'s nonzero pattern is
+structurally symmetric on construction and on each `symbolic_factor!`. Pass
+`check_symmetry=false` only when the caller can guarantee symmetry by
+construction (e.g. `ABA = Aᵀ B A`).
+"""
+function AAFactorCache(
+    A::SparseMatrixCSC{Float64, Int};
+    reuse_symbolic::Bool = true,
+    check_pattern::Bool = true,
+    check_symmetry::Bool = true,
+    factorization_type::SparseFactorization_t = SparseFactorizationLDLT,
+)
+    n = size(A, 1)
+    n == size(A, 2) || throw(DimensionMismatch("matrix must be square; got $(size(A))"))
+    if check_symmetry
+        _assert_structural_symmetry(A)
+    end
+
+    nnz_tri = _count_lower_triangle(A)
+    cache = AAFactorCache(
+        Vector{Clong}(undef, n + 1),
+        Vector{Cint}(undef, nnz_tri),
+        Vector{Cdouble}(undef, 0),
+        n,
+        nnz_tri,
+        factorization_type,
+        _null_symbolic(),
+        _null_factorization(),
+        reuse_symbolic,
+        check_pattern,
+        check_symmetry,
+        Matrix{Cdouble}(undef, 0, 0),
+        Int[],
+    )
+    _populate_lower_triangle_pattern!(cache, A)
+    finalizer(_free_handles!, cache)
+    return cache
+end
+
+# Structural-symmetry check: for every nonzero at (i, j) with i != j, require
+# a mirror nonzero at (j, i). Throws on first violation. O(nnz · avg-col-len).
+function _assert_structural_symmetry(A::SparseMatrixCSC{Float64, Int})
+    n = size(A, 1)
+    rowval = rowvals(A)
+    @inbounds for j in 1:n
+        for p in nzrange(A, j)
+            i = rowval[p]
+            i == j && continue
+            mirror = false
+            for q in nzrange(A, i)
+                if rowval[q] == j
+                    mirror = true
+                    break
+                end
+            end
+            if !mirror
+                throw(
+                    ArgumentError(
+                        "AAFactorCache: input matrix is not structurally symmetric " *
+                        "(nonzero at ($i, $j) has no mirror at ($j, $i)). " *
+                        "Pass `check_symmetry=false` to bypass if the caller " *
+                        "guarantees symmetry by construction.",
+                    ),
+                )
+            end
+        end
+    end
+    return nothing
+end
+
+# Count nonzeros in the lower triangle (i >= j) of A.
+function _count_lower_triangle(A::SparseMatrixCSC{Float64, Int})
+    cnt = 0
+    rowval = rowvals(A)
+    @inbounds for j in 1:size(A, 2)
+        for p in nzrange(A, j)
+            rowval[p] >= j && (cnt += 1)
+        end
+    end
+    return cnt
+end
+
+# Strip A to its lower triangle and fill `cache.columnStarts` / `cache.rowIndices`
+# with 0-based indices. Caller must have sized these to (n+1) and nnz_tri.
+function _populate_lower_triangle_pattern!(
+    cache::AAFactorCache,
+    A::SparseMatrixCSC{Float64, Int},
+)
+    rowval = rowvals(A)
+    n = cache.n
+    pos = 0
+    cache.columnStarts[1] = 0
+    @inbounds for j in 1:n
+        for p in nzrange(A, j)
+            if rowval[p] >= j
+                pos += 1
+                cache.rowIndices[pos] = Cint(rowval[p] - 1)
+            end
+        end
+        cache.columnStarts[j + 1] = Clong(pos)
+    end
+    return cache
+end
+
+# Snapshot the lower-triangle values into `cache.nzval`, growing if needed.
+function _populate_lower_triangle_values!(
+    cache::AAFactorCache,
+    A::SparseMatrixCSC{Float64, Int},
+)
+    if length(cache.nzval) != cache.nnz_tri
+        resize!(cache.nzval, cache.nnz_tri)
+    end
+    rowval = rowvals(A)
+    nzv = nonzeros(A)
+    pos = 0
+    @inbounds for j in 1:size(A, 2)
+        for p in nzrange(A, j)
+            if rowval[p] >= j
+                pos += 1
+                cache.nzval[pos] = nzv[p]
+            end
+        end
+    end
+    return cache
+end
+
+# Pattern-match guard for `numeric_refactor!`: assert the incoming CSC's
+# lower-triangle pattern is identical to what was analyzed. Same shape as
+# KLU's `_check_pattern_match`, just without the off-by-one increment dance
+# (Apple's arrays are 0-based but we always store them 0-based — no flipping).
+function _check_pattern_match(
+    cache::AAFactorCache,
+    A::SparseMatrixCSC{Float64, Int},
+    op::AbstractString,
+)
+    n = cache.n
+    if size(A, 1) != n || size(A, 2) != n
+        throw(DimensionMismatch("Cannot $op: cache is $(n)×$(n) but A is $(size(A))."))
+    end
+    rowval = rowvals(A)
+    pos = 0
+    @inbounds for j in 1:n
+        cache.columnStarts[j] == Clong(pos) || return _pattern_mismatch(op)
+        for p in nzrange(A, j)
+            if rowval[p] >= j
+                pos += 1
+                pos > cache.nnz_tri && return _pattern_mismatch(op)
+                cache.rowIndices[pos] == Cint(rowval[p] - 1) || return _pattern_mismatch(op)
+            end
+        end
+        cache.columnStarts[j + 1] == Clong(pos) || return _pattern_mismatch(op)
+    end
+    pos == cache.nnz_tri || return _pattern_mismatch(op)
+    return nothing
+end
+
+_pattern_mismatch(op::AbstractString) =
+    throw(ArgumentError("Cannot $op: matrix has different sparsity structure."))
+
+"""
+Release the libSparse numeric and symbolic handles held by `cache`, leaving
+Julia-side state intact. Idempotent.
+"""
+function _free_handles!(cache::AAFactorCache)
+    if cache.numeric.status == SparseStatusOk
+        _sparse_cleanup_factor!(cache.numeric)
+        cache.numeric = _null_factorization()
+    end
+    if cache.symbolic.status == SparseStatusOk
+        _sparse_cleanup_symbolic!(cache.symbolic)
+        cache.symbolic = _null_symbolic()
+    end
+    return nothing
+end
+
+Base.finalize(cache::AAFactorCache) = _free_handles!(cache)
+
+# Build the libSparse `SparseMatrixStructure` view that points into the
+# cache's owned arrays. Marked symmetric + lower triangle so the
+# `SparseFactorizationLDLT` path treats `cache.nzval` as the lower-triangle
+# entries of a symmetric matrix.
+function _structure_view(cache::AAFactorCache)
+    return SparseMatrixStructure(
+        Cint(cache.n),
+        Cint(cache.n),
+        pointer(cache.columnStarts),
+        pointer(cache.rowIndices),
+        ATT_SYMMETRIC | ATT_LOWER_TRIANGLE,
+        UInt8(1),
+    )
+end
+
+function _matrix_view(cache::AAFactorCache)
+    return SparseMatrix_t(_structure_view(cache), pointer(cache.nzval))
+end
+
+"""
+    symbolic_factor!(cache, A)
+
+Free any cached symbolic/numeric factor, replace the structural arrays with
+`A`'s lower-triangle pattern, and analyze. Subsequent `numeric_refactor!`
+calls reuse the analysis.
+"""
+function symbolic_factor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int})
+    n = cache.n
+    if size(A, 1) != n || size(A, 2) != n
+        throw(DimensionMismatch("Cannot factor: cache is $(n)×$(n) but A is $(size(A))."))
+    end
+    if cache.check_symmetry
+        _assert_structural_symmetry(A)
+    end
+    _free_handles!(cache)
+    new_nnz_tri = _count_lower_triangle(A)
+    if new_nnz_tri != cache.nnz_tri
+        resize!(cache.rowIndices, new_nnz_tri)
+        cache.nnz_tri = new_nnz_tri
+    end
+    if length(cache.columnStarts) != n + 1
+        resize!(cache.columnStarts, n + 1)
+    end
+    _populate_lower_triangle_pattern!(cache, A)
+    sym = _sparse_symbolic_factor(
+        cache.factorization_type,
+        _structure_view(cache),
+        SparseSymbolicFactorOptions(),
+    )
+    if sym.status != SparseStatusOk
+        # libSparse may have allocated C-side state before deciding to fail —
+        # release it before throwing so we don't leak per failed factor.
+        _sparse_cleanup_symbolic!(sym)
+        _libsparse_throw(sym.status, "symbolic factor")
+    end
+    cache.symbolic = sym
+    return cache
+end
+
+"""
+    numeric_refactor!(cache, A)
+
+Refresh the numeric factor on top of the existing symbolic analysis. Errors
+if `symbolic_factor!` has not been called yet.
+"""
+function numeric_refactor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int})
+    cache.symbolic.status == SparseStatusOk ||
+        error("AAFactorCache: call symbolic_factor! before numeric_refactor!.")
+    cache.check_pattern && _check_pattern_match(cache, A, "numeric_refactor")
+    _populate_lower_triangle_values!(cache, A)
+    if cache.numeric.status == SparseStatusOk
+        _sparse_cleanup_factor!(cache.numeric)
+        cache.numeric = _null_factorization()
+    end
+    num = _sparse_numeric_factor(
+        cache.symbolic,
+        _matrix_view(cache),
+        SparseNumericFactorOptions(),
+    )
+    if num.status != SparseStatusOk
+        # Same rationale as in symbolic_factor!: release before throwing.
+        _sparse_cleanup_factor!(num)
+        _libsparse_throw(num.status, "numeric factor")
+    end
+    cache.numeric = num
+    return cache
+end
+
+"""
+    symbolic_refactor!(cache, A)
+
+If `cache.reuse_symbolic`, optionally verify the structure matches and reuse
+the existing analysis. Otherwise, rerun `symbolic_factor!`.
+"""
+function symbolic_refactor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int})
+    if !cache.reuse_symbolic
+        return symbolic_factor!(cache, A)
+    end
+    cache.check_pattern && _check_pattern_match(cache, A, "symbolic_refactor")
+    return cache
+end
+
+"""
+    full_factor!(cache, A) -> cache
+
+Run a fresh symbolic analysis followed by a numeric factorization on `A`.
+"""
+function full_factor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int})
+    symbolic_factor!(cache, A)
+    numeric_refactor!(cache, A)
+    return cache
+end
+
+"""
+    full_refactor!(cache, A) -> cache
+
+Refresh both factorizations on `A`. Defers to `symbolic_refactor!` (which
+reuses the existing analysis when `cache.reuse_symbolic` is set) followed by
+`numeric_refactor!`.
+"""
+function full_refactor!(cache::AAFactorCache, A::SparseMatrixCSC{Float64, Int})
+    symbolic_refactor!(cache, A)
+    numeric_refactor!(cache, A)
+    return cache
+end
+
+"""
+    aa_factorize(A; reuse_symbolic=true, check_pattern=true,
+                  check_symmetry=true,
+                  factorization_type=SparseFactorizationLDLT) -> AAFactorCache
+
+Build a cache for `A` and immediately compute the full factorization. See
+`AAFactorCache` for the kwarg semantics, including `check_symmetry`.
+"""
+function aa_factorize(
+    A::SparseMatrixCSC{Float64, Int};
+    reuse_symbolic::Bool = true,
+    check_pattern::Bool = true,
+    check_symmetry::Bool = true,
+    factorization_type::SparseFactorization_t = SparseFactorizationLDLT,
+)
+    cache = AAFactorCache(
+        A;
+        reuse_symbolic = reuse_symbolic,
+        check_pattern = check_pattern,
+        check_symmetry = check_symmetry,
+        factorization_type = factorization_type,
+    )
+    return full_factor!(cache, A)
+end
+
+"""
+    _ensure_scratch!(cache, block) -> Nothing
+
+Ensure `cache.scratch` is at least `n × block` and `cache.col_map` length
+`block`. Used by `solve_sparse!`.
+"""
+@inline function _ensure_scratch!(cache::AAFactorCache, block::Int)
+    n = cache.n
+    s = cache.scratch
+    if size(s, 1) != n || size(s, 2) < block
+        cache.scratch = Matrix{Cdouble}(undef, n, block)
+    end
+    if length(cache.col_map) < block
+        resize!(cache.col_map, block)
+    end
+    return nothing
+end
diff --git a/src/AccelerateWrapper/libsparse_bindings.jl b/src/AccelerateWrapper/libsparse_bindings.jl
new file mode 100644
index 000000000..1c85491e4
--- /dev/null
+++ b/src/AccelerateWrapper/libsparse_bindings.jl
@@ -0,0 +1,320 @@
+# Direct bindings into Apple's libSparse.dylib (part of the Accelerate
+# framework). Only the entry points actually consumed by PowerNetworkMatrices
+# are wrapped — Float64-only, no Float32 mangled aliases, no QR / Cholesky-AtA
+# variants. Mangled names match what AppleAccelerate.jl uses; see
+# `/Library/Developer/CommandLineTools/SDKs/MacOSX*.sdk/System/Library/
+# Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/
+# Versions/A/Headers/Sparse/Solve.h` for the C declarations.
+
+const LIBSPARSE =
+    "/System/Library/Frameworks/Accelerate.framework/Versions/A/" *
+    "Frameworks/vecLib.framework/libSparse.dylib"
+
+@enum SparseFactorization_t::UInt8 begin
+    SparseFactorizationCholesky = 0
+    SparseFactorizationLDLT = 1
+    SparseFactorizationLDLTUnpivoted = 2
+    SparseFactorizationLDLTSBK = 3
+    SparseFactorizationLDLTTPP = 4
+end
+
+@enum SparseOrder_t::UInt8 begin
+    SparseOrderDefault = 0
+    SparseOrderUser = 1
+    SparseOrderAMD = 2
+    SparseOrderMetis = 3
+    SparseOrderCOLAMD = 4
+end
+
+@enum SparseScaling_t::UInt8 begin
+    SparseScalingDefault = 0
+    SparseScalingUser = 1
+    SparseScalingEquilibriationInf = 2
+end
+
+@enum SparseStatus_t::Int32 begin
+    SparseStatusOk = 0
+    SparseStatusFailed = -1
+    SparseMatrixIsSingular = -2
+    SparseInternalError = -3
+    SparseParameterError = -4
+    SparseStatusReleased = -2147483647
+end
+
+@enum SparseControl_t::UInt32 begin
+    SparseDefaultControl = 0
+end
+
+# `SparseAttributes_t` is a packed bitfield in C; Julia can't express that
+# cleanly, so we model it as `Cuint` and assemble the bits ourselves.
+const att_type = Cuint
+const ATT_TRANSPOSE = att_type(1)
+const ATT_UPPER_TRIANGLE = att_type(0)
+const ATT_LOWER_TRIANGLE = att_type(2)
+const ATT_ORDINARY = att_type(0)
+const ATT_TRIANGULAR = att_type(4)
+const ATT_UNIT_TRIANGULAR = att_type(8)
+const ATT_SYMMETRIC = att_type(12)
+
+struct SparseMatrixStructure
+    rowCount::Cint
+    columnCount::Cint
+    columnStarts::Ptr{Clong}
+    rowIndices::Ptr{Cint}
+    attributes::att_type
+    blockSize::UInt8
+end
+
+struct SparseNumericFactorOptions
+    control::SparseControl_t
+    scalingMethod::SparseScaling_t
+    scaling::Ptr{Cvoid}
+    pivotTolerance::Float64
+    zeroTolerance::Float64
+end
+
+# Defaults match Apple's SolveImplementation.h.
+SparseNumericFactorOptions() = SparseNumericFactorOptions(
+    SparseDefaultControl,
+    SparseScalingDefault,
+    C_NULL,
+    0.01,
+    eps(Cdouble) * 1e-4,
+)
+
+struct SparseSymbolicFactorOptions
+    control::SparseControl_t
+    orderMethod::SparseOrder_t
+    order::Ptr{Cvoid}
+    ignoreRowsAndColumns::Ptr{Cvoid}
+    malloc::Ptr{Cvoid}
+    free::Ptr{Cvoid}
+    reportError::Ptr{Cvoid}
+end
+
+# `reportError` is fired by libSparse before it returns a failure status. The
+# inline `error(unsafe_string(text))` matches AppleAccelerate.jl's pattern: it
+# propagates the libSparse message as a Julia exception that unwinds back
+# through the originating ccall. Avoids `@error`'s allocator/logger overhead,
+# which is fragile when libSparse invokes the callback from its own threads.
+# Passing libc malloc/free explicitly (the C_NULL "use Apple defaults" path is
+# documented but unreliable from a non-Obj-C caller).
+function SparseSymbolicFactorOptions()
+    return SparseSymbolicFactorOptions(
+        SparseDefaultControl,
+        SparseOrderDefault,
+        C_NULL,
+        C_NULL,
+        @cfunction(Libc.malloc, Ptr{Cvoid}, (Csize_t,)),
+        @cfunction(Libc.free, Cvoid, (Ptr{Cvoid},)),
+        @cfunction(text -> error(unsafe_string(text)), Cvoid, (Cstring,)),
+    )
+end
+
+struct DenseVector_t
+    count::Cint
+    data::Ptr{Cdouble}
+end
+
+struct DenseMatrix_t
+    rowCount::Cint
+    columnCount::Cint
+    columnStride::Cint
+    attributes::att_type
+    data::Ptr{Cdouble}
+end
+
+struct SparseMatrix_t
+    structure::SparseMatrixStructure
+    data::Ptr{Cdouble}
+end
+
+struct SparseOpaqueSymbolicFactorization
+    status::SparseStatus_t
+    rowCount::Cint
+    columnCount::Cint
+    attributes::att_type
+    blockSize::UInt8
+    type::SparseFactorization_t
+    factorization::Ptr{Cvoid}
+    workspaceSize_Float::Csize_t
+    workspaceSize_Double::Csize_t
+    factorSize_Float::Csize_t
+    factorSize_Double::Csize_t
+end
+
+# Placeholder used to initialize `AAFactorCache.symbolic` before the first
+# factor. Marked released so any accidental cleanup is a no-op.
+function _null_symbolic()
+    return SparseOpaqueSymbolicFactorization(
+        SparseStatusReleased,
+        0,
+        0,
+        ATT_ORDINARY,
+        0,
+        SparseFactorizationLDLT,
+        C_NULL,
+        0,
+        0,
+        0,
+        0,
+    )
+end
+
+struct SparseOpaqueFactorization_t
+    status::SparseStatus_t
+    attributes::att_type
+    symbolicFactorization::SparseOpaqueSymbolicFactorization
+    userFactorStorage::Bool
+    numericFactorization::Ptr{Cvoid}
+    solveWorkspaceRequiredStatic::Csize_t
+    solveWorkspaceRequiredPerRHS::Csize_t
+end
+
+function _null_factorization()
+    return SparseOpaqueFactorization_t(
+        SparseStatusReleased,
+        ATT_ORDINARY,
+        _null_symbolic(),
+        false,
+        C_NULL,
+        0,
+        0,
+    )
+end
+
+# Build the Apple-side dense views at the ccall boundary. `StridedMatrix`'s
+# first-dimension stride must be 1 (the libSparse contract); we assert at the
+# call site, not here.
+function _dense_matrix(B::StridedMatrix{Cdouble})
+    return DenseMatrix_t(
+        Cint(size(B, 1)),
+        Cint(size(B, 2)),
+        Cint(stride(B, 2)),
+        ATT_ORDINARY,
+        pointer(B),
+    )
+end
+
+function _dense_vector(b::StridedVector{Cdouble})
+    return DenseVector_t(Cint(length(b)), pointer(b))
+end
+
+# --- ccalls -----------------------------------------------------------------
+#
+# Mangled symbol names come from the C++ ABI of libSparse. They are stable on
+# the system framework and match what AppleAccelerate.jl binds. If Apple
+# breaks them in a future macOS release, the failure will be loud (dlopen of
+# a missing symbol at first call), which is the behavior we want.
+
+# Symbolic-only factor: analyzes the pattern, returns an opaque symbolic
+# factor. Can back many numeric factors on the same pattern.
+function _sparse_symbolic_factor(
+    ftype::SparseFactorization_t,
+    structure::SparseMatrixStructure,
+    sym_opts::SparseSymbolicFactorOptions,
+)::SparseOpaqueSymbolicFactorization
+    return @ccall LIBSPARSE._Z12SparseFactorh21SparseMatrixStructure27SparseSymbolicFactorOptions(
+        ftype::Cuint,
+        structure::SparseMatrixStructure,
+        sym_opts::SparseSymbolicFactorOptions,
+    )::SparseOpaqueSymbolicFactorization
+end
+
+# Numeric factor on top of an existing symbolic factor. Reusable: the
+# symbolic handle is not consumed.
+function _sparse_numeric_factor(
+    symbolic::SparseOpaqueSymbolicFactorization,
+    matrix::SparseMatrix_t,
+    num_opts::SparseNumericFactorOptions,
+)::SparseOpaqueFactorization_t
+    return @ccall LIBSPARSE._Z12SparseFactor33SparseOpaqueSymbolicFactorization19SparseMatrix_Double26SparseNumericFactorOptions(
+        symbolic::SparseOpaqueSymbolicFactorization,
+        matrix::SparseMatrix_t,
+        num_opts::SparseNumericFactorOptions,
+    )::SparseOpaqueFactorization_t
+end
+
+# In-place solve `A · X = B`, where B is a column-major dense matrix and X
+# overwrites B's storage.
+function _sparse_solve_matrix!(
+    factor::SparseOpaqueFactorization_t,
+    B::DenseMatrix_t,
+)
+    @ccall LIBSPARSE._Z11SparseSolve32SparseOpaqueFactorization_Double18DenseMatrix_Double(
+        factor::SparseOpaqueFactorization_t,
+        B::DenseMatrix_t,
+    )::Cvoid
+    return nothing
+end
+
+function _sparse_solve_vector!(
+    factor::SparseOpaqueFactorization_t,
+    b::DenseVector_t,
+)
+    @ccall LIBSPARSE._Z11SparseSolve32SparseOpaqueFactorization_Double18DenseVector_Double(
+        factor::SparseOpaqueFactorization_t,
+        b::DenseVector_t,
+    )::Cvoid
+    return nothing
+end
+
+# `Y = A · X`, dense multi-column. `Y` must be allocated to (rowCount, ncols)
+# by the caller. libSparse overwrites — does not accumulate.
+function _sparse_multiply_matrix!(
+    A::SparseMatrix_t,
+    X::DenseMatrix_t,
+    Y::DenseMatrix_t,
+)
+    @ccall LIBSPARSE._Z14SparseMultiply19SparseMatrix_Double18DenseMatrix_DoubleS0_(
+        A::SparseMatrix_t,
+        X::DenseMatrix_t,
+        Y::DenseMatrix_t,
+    )::Cvoid
+    return nothing
+end
+
+function _sparse_multiply_vector!(
+    A::SparseMatrix_t,
+    x::DenseVector_t,
+    y::DenseVector_t,
+)
+    @ccall LIBSPARSE._Z14SparseMultiply19SparseMatrix_Double18DenseVector_DoubleS0_(
+        A::SparseMatrix_t,
+        x::DenseVector_t,
+        y::DenseVector_t,
+    )::Cvoid
+    return nothing
+end
+
+# Frees the libSparse-side numeric / symbolic storage attached to an opaque
+# factor. Idempotent: a second call with a `SparseStatusReleased` handle is
+# a no-op on libSparse's side.
+function _sparse_cleanup_factor!(factor::SparseOpaqueFactorization_t)
+    @ccall LIBSPARSE._Z13SparseCleanup32SparseOpaqueFactorization_Double(
+        factor::SparseOpaqueFactorization_t,
+    )::Cvoid
+    return nothing
+end
+
+function _sparse_cleanup_symbolic!(symbolic::SparseOpaqueSymbolicFactorization)
+    @ccall LIBSPARSE._Z13SparseCleanup33SparseOpaqueSymbolicFactorization(
+        symbolic::SparseOpaqueSymbolicFactorization,
+    )::Cvoid
+    return nothing
+end
+
+# Translate libSparse status codes into Julia exceptions. Singular and
+# parameter-error are the most common; the rest fall through to a generic
+# `error`.
+function _libsparse_throw(status::SparseStatus_t, op::AbstractString)
+    status == SparseMatrixIsSingular &&
+        throw(LinearAlgebra.SingularException(0))
+    status == SparseParameterError &&
+        throw(ArgumentError("libSparse $(op) failed: parameter error"))
+    status == SparseInternalError &&
+        error("libSparse $(op) failed: internal error")
+    status == SparseStatusFailed &&
+        error("libSparse $(op) failed")
+    return error("libSparse $(op) failed: status=$(Int(status))")
+end
diff --git a/src/AccelerateWrapper/solve_dense.jl b/src/AccelerateWrapper/solve_dense.jl
new file mode 100644
index 000000000..dbf09d3a4
--- /dev/null
+++ b/src/AccelerateWrapper/solve_dense.jl
@@ -0,0 +1,40 @@
+"""
+    solve!(cache, B) -> B
+
+Solve `A · X = B` in place, dispatching on the shape of `B`:
+
+  - `B::StridedMatrix{Cdouble}`: multiple right-hand sides handled in a single
+    libSparse call; `size(B, 1)` must equal `cache.n`.
+  - `B::StridedVector{Cdouble}`: single right-hand side; `length(B)` must
+    equal `cache.n`.
+
+Both overloads require `B` to have unit stride in the first dimension and
+the cache to be factored (`is_factored(cache) == true`).
+"""
+function solve!(cache::AAFactorCache, B::StridedMatrix{Cdouble})
+    is_factored(cache) || error("AAFactorCache: not factored yet.")
+    n = cache.n
+    size(B, 1) == n ||
+        throw(DimensionMismatch("size(B, 1) = $(size(B, 1)), cache n = $(n)"))
+    stride(B, 1) == 1 ||
+        throw(ArgumentError("B must have unit stride in the first dimension."))
+    size(B, 2) == 0 && return B
+    _sparse_solve_matrix!(cache.numeric, _dense_matrix(B))
+    return B
+end
+
+function solve!(cache::AAFactorCache, b::StridedVector{Cdouble})
+    is_factored(cache) || error("AAFactorCache: not factored yet.")
+    n = cache.n
+    length(b) == n || throw(DimensionMismatch("length(b) = $(length(b)), cache n = $(n)"))
+    stride(b, 1) == 1 || throw(ArgumentError("b must have unit stride."))
+    _sparse_solve_vector!(cache.numeric, _dense_vector(b))
+    return b
+end
+
+"""
+    \\(cache::AAFactorCache, B) -> X
+
+Allocating solve, mirroring `LinearAlgebra.Factorization`'s API.
+"""
+Base.:\(cache::AAFactorCache, B::StridedVecOrMat{Cdouble}) = solve!(cache, copy(B))
diff --git a/src/AccelerateWrapper/solve_sparse_rhs.jl b/src/AccelerateWrapper/solve_sparse_rhs.jl
new file mode 100644
index 000000000..61edf9772
--- /dev/null
+++ b/src/AccelerateWrapper/solve_sparse_rhs.jl
@@ -0,0 +1,97 @@
+const SPARSE_RHS_DEFAULT_BLOCK = 64
+
+"""
+    solve_sparse!(cache, B, out; block=$(SPARSE_RHS_DEFAULT_BLOCK)) -> out
+
+Solve `A · X = B` for a `SparseMatrixCSC` right-hand side, writing the
+result into `out`. Empty columns of `B` are not solved — `out`'s
+corresponding columns are zeroed. Non-empty columns within each chunk of
+`block` consecutive RHS columns are packed into a dense scratch and solved
+in a single libSparse call.
+
+The `block` chunk size bounds the working set so that processing an
+`n × nrhs` sparse RHS requires only `O(n · block)` extra memory regardless
+of `nrhs`. The cache reuses its packing buffer across calls; warm calls
+allocate nothing in the solver.
+
+Not thread-safe (mutates per-cache scratch).
+"""
+function solve_sparse!(
+    cache::AAFactorCache,
+    B::SparseMatrixCSC{<:Number, Int},
+    out::AbstractMatrix{Cdouble};
+    block::Int = SPARSE_RHS_DEFAULT_BLOCK,
+)
+    is_factored(cache) || error("AAFactorCache: not factored yet.")
+    block >= 1 || throw(ArgumentError("block must be >= 1; got $(block)"))
+    n = cache.n
+    size(B, 1) == n || throw(DimensionMismatch(
+        "size(B, 1) = $(size(B, 1)), cache n = $(n)",
+    ))
+    size(out, 1) == n && size(out, 2) == size(B, 2) || throw(DimensionMismatch(
+        "out has size $(size(out)); expected $((n, size(B, 2))).",
+    ))
+
+    nb = size(B, 2)
+    nb == 0 && return out
+    fill!(out, zero(Cdouble))
+
+    Browval = rowvals(B)
+    Bnzval = nonzeros(B)
+
+    _ensure_scratch!(cache, block)
+    scratch = cache.scratch
+    col_map = cache.col_map
+
+    j_start = 1
+    @inbounds while j_start <= nb
+        j_end = min(j_start + block - 1, nb)
+
+        npack = 0
+        for j in j_start:j_end
+            rng = nzrange(B, j)
+            isempty(rng) && continue
+            npack += 1
+            col_map[npack] = j
+            fill!(view(scratch, :, npack), zero(Cdouble))
+            for p in rng
+                scratch[Browval[p], npack] = Bnzval[p]
+            end
+        end
+
+        if npack > 0
+            # Build a DenseMatrix view over scratch[:, 1:npack]. Column stride
+            # is `size(scratch, 1) = n` because scratch is column-major and
+            # full-height.
+            dm = DenseMatrix_t(
+                Cint(n),
+                Cint(npack),
+                Cint(size(scratch, 1)),
+                ATT_ORDINARY,
+                pointer(scratch),
+            )
+            _sparse_solve_matrix!(cache.numeric, dm)
+
+            for k in 1:npack
+                copyto!(view(out, :, col_map[k]), view(scratch, :, k))
+            end
+        end
+
+        j_start = j_end + 1
+    end
+    return out
+end
+
+"""Allocating wrapper around `solve_sparse!`."""
+function solve_sparse(
+    cache::AAFactorCache,
+    B::SparseMatrixCSC{<:Number, Int};
+    block::Int = SPARSE_RHS_DEFAULT_BLOCK,
+)
+    return solve_sparse!(
+        cache,
+        B,
+        Matrix{Cdouble}(undef, cache.n, size(B, 2));
+        block = block,
+    )
+end
diff --git a/src/AccelerateWrapper/spmm.jl b/src/AccelerateWrapper/spmm.jl
new file mode 100644
index 000000000..e1c5375ca
--- /dev/null
+++ b/src/AccelerateWrapper/spmm.jl
@@ -0,0 +1,105 @@
+# SparseMultiply bindings. These do not require a factorization cache; they
+# take a `SparseMatrixCSC{Float64, Int}` directly and build the Apple-side
+# `SparseMatrix_t` view at the ccall boundary. Per call this allocates two
+# transient arrays (0-based `Clong[]` colptr, narrowed `Cint[]` rowval); if
+# a future hot loop needs allocation-free SpMM, lift these into a dedicated
+# `AASparseView` cache type.
+
+"""
+    aa_spmm!(Y, A, X) -> Y
+
+Compute `Y ← A · X` in place, where `A` is a `SparseMatrixCSC{Float64, Int}`,
+`X` and `Y` are `StridedMatrix{Float64}` of compatible shape. libSparse
+overwrites `Y` (does not accumulate).
+"""
+function aa_spmm!(
+    Y::StridedMatrix{Cdouble},
+    A::SparseMatrixCSC{Float64, Int},
+    X::StridedMatrix{Cdouble},
+)
+    size(A, 2) == size(X, 1) || throw(
+        DimensionMismatch(
+            "A is $(size(A)), X is $(size(X)); inner dimensions must match.",
+        ),
+    )
+    size(Y, 1) == size(A, 1) && size(Y, 2) == size(X, 2) || throw(
+        DimensionMismatch(
+            "Y has size $(size(Y)); expected $((size(A, 1), size(X, 2))).",
+        ),
+    )
+    stride(X, 1) == 1 || throw(ArgumentError("X must have unit stride in dim 1."))
+    stride(Y, 1) == 1 || throw(ArgumentError("Y must have unit stride in dim 1."))
+    size(X, 2) == 0 && return Y
+    cs, ri = _csc_to_apple(A)
+    sp = SparseMatrix_t(
+        SparseMatrixStructure(
+            Cint(size(A, 1)),
+            Cint(size(A, 2)),
+            pointer(cs),
+            pointer(ri),
+            ATT_ORDINARY,
+            UInt8(1),
+        ),
+        pointer(nonzeros(A)),
+    )
+    GC.@preserve cs ri A X Y _sparse_multiply_matrix!(
+        sp,
+        _dense_matrix(X),
+        _dense_matrix(Y),
+    )
+    return Y
+end
+
+"""
+    aa_spmv!(y, A, x) -> y
+
+Compute `y ← A · x` in place.
+"""
+function aa_spmv!(
+    y::StridedVector{Cdouble},
+    A::SparseMatrixCSC{Float64, Int},
+    x::StridedVector{Cdouble},
+)
+    size(A, 2) == length(x) || throw(DimensionMismatch(
+        "A is $(size(A)), length(x) = $(length(x)).",
+    ))
+    length(y) == size(A, 1) || throw(DimensionMismatch(
+        "length(y) = $(length(y)); expected $(size(A, 1)).",
+    ))
+    stride(x, 1) == 1 || throw(ArgumentError("x must have unit stride."))
+    stride(y, 1) == 1 || throw(ArgumentError("y must have unit stride."))
+    cs, ri = _csc_to_apple(A)
+    sp = SparseMatrix_t(
+        SparseMatrixStructure(
+            Cint(size(A, 1)),
+            Cint(size(A, 2)),
+            pointer(cs),
+            pointer(ri),
+            ATT_ORDINARY,
+            UInt8(1),
+        ),
+        pointer(nonzeros(A)),
+    )
+    GC.@preserve cs ri A x y _sparse_multiply_vector!(
+        sp,
+        _dense_vector(x),
+        _dense_vector(y),
+    )
+    return y
+end
+
+# Convert CSC's 1-based `colptr::Vector{Int}` and `rowval::Vector{Int}` into
+# Apple's 0-based `Clong[]` colstarts and narrowed `Cint[]` rowindices.
+function _csc_to_apple(A::SparseMatrixCSC{Float64, Int})
+    cp = getcolptr(A)
+    rv = rowvals(A)
+    cs = Vector{Clong}(undef, length(cp))
+    ri = Vector{Cint}(undef, length(rv))
+    @inbounds for k in eachindex(cp)
+        cs[k] = Clong(cp[k] - 1)
+    end
+    @inbounds for k in eachindex(rv)
+        ri[k] = Cint(rv[k] - 1)
+    end
+    return cs, ri
+end
diff --git a/src/BA_ABA_matrices.jl b/src/BA_ABA_matrices.jl
index 0cec17048..7902317a3 100644
--- a/src/BA_ABA_matrices.jl
+++ b/src/BA_ABA_matrices.jl
@@ -179,7 +179,7 @@ power flow analysis, sensitivity calculations, and linear power system studies.
 struct ABA_Matrix{
     Ax <: NTuple{2, Vector},
     L <: NTuple{2, Dict},
-    F <: Union{Nothing, KLULinSolveCache{Float64}},
+    F <: Union{Nothing, KLULinSolveCache{Float64, Int64}},
 } <: PowerNetworkMatrix{Float64}
     data::SparseArrays.SparseMatrixCSC{Float64, Int}
     axes::Ax
@@ -358,7 +358,7 @@ Check if an ABA_Matrix has been factorized (i.e., contains LU factorization matr
 """
 is_factorized(ABA::ABA_Matrix{Ax, L, Nothing}) where {Ax, L <: NTuple{2, Dict}} = false
 is_factorized(
-    ABA::ABA_Matrix{Ax, L, <:KLULinSolveCache{Float64}},
+    ABA::ABA_Matrix{Ax, L, <:KLULinSolveCache{Float64, Int64}},
 ) where {Ax, L <: NTuple{2, Dict}} = true
 
 # get_index functions: BA_Matrix stores the transposed matrix, thus get index
diff --git a/src/KLUWrapper/KLUWrapper.jl b/src/KLUWrapper/KLUWrapper.jl
index 5e996dfdf..7d50b5914 100644
--- a/src/KLUWrapper/KLUWrapper.jl
+++ b/src/KLUWrapper/KLUWrapper.jl
@@ -74,8 +74,12 @@ export KLULinSolveCache,
     tsolve!,
     solve_sparse!,
     solve_sparse,
+    sort_factors!,
+    condest!,
+    rcond!,
     n_valid,
-    is_factored
+    is_factored,
+    get_reuse_symbolic
 
 include("klu_jll_bindings.jl")
 include("klu_cache.jl")
diff --git a/src/KLUWrapper/klu_cache.jl b/src/KLUWrapper/klu_cache.jl
index 89d519980..6318e3050 100644
--- a/src/KLUWrapper/klu_cache.jl
+++ b/src/KLUWrapper/klu_cache.jl
@@ -5,37 +5,55 @@ A cached KLU linear solver designed for repeated solves against the same
 sparse matrix structure. `numeric_refactor!` and `solve!` allocate nothing
 once the cache is built.
 
-The type parameter `Tv ∈ {Float64, ComplexF64}` selects the real/complex KLU
-path. Indices are always `Int64` (SuiteSparse_long).
+Type parameters:
+- `Tv ∈ {Float64, ComplexF64}` selects the real/complex KLU path
+  (`klu_*_factor`/`klu_z*_factor`).
+- `Ti ∈ {Int32, Int64}` selects the index-type entry-point family
+  (`klu_*` for `int`/Int32, `klu_l_*` for `SuiteSparse_long`/Int64).
+  The cache's `colptr`/`rowval`/`col_map` are stored in this type.
 
 `reuse_symbolic` controls whether `symbolic_refactor!` keeps the analysis;
 `check_pattern` adds a structural-equality check on refactor calls and is
 only consulted when reusing.
 """
-mutable struct KLULinSolveCache{Tv <: Union{Float64, ComplexF64}}
-    colptr::Vector{Int64}
-    rowval::Vector{Int64}
+mutable struct KLULinSolveCache{
+    Tv <: Union{Float64, ComplexF64},
+    Ti <: Union{Int32, Int64},
+}
+    colptr::Vector{Ti}
+    rowval::Vector{Ti}
     # Copy of the matrix values used in the most recent numeric factorization.
     # Lets `_recover_factorization!` rebuild a corrupted numeric handle without
     # the caller having to re-supply `A`. Empty before the first factor call.
     nzval::Vector{Tv}
-    common::Base.RefValue{KluLCommon}
-    symbolic::SymbolicPtr
-    numeric::NumericPtr
+    # `Base.RefValue{KluLCommon}` for Int64 or `Base.RefValue{KluCommon}` for
+    # Int32 — we keep the field untyped here because Julia's untyped
+    # `Base.RefValue` lookup is fast enough (the dispatch helpers recover the
+    # concrete type at each callsite via `_common_type(Ti)`) and avoiding a
+    # third type parameter keeps the surface tidy.
+    common::Base.RefValue
+    # Opaque Symbolic/Numeric pointers. `Ptr{Cvoid}` is safe: each callsite
+    # threads through the typed `SymbolicPtr` / `SymbolicPtr32` (resp.
+    # Numeric) at the ccall boundary; on the Julia side the values are
+    # treated as black boxes by every consumer.
+    symbolic::Ptr{Cvoid}
+    numeric::Ptr{Cvoid}
     reuse_symbolic::Bool
     check_pattern::Bool
     # Bounded reusable scratch for `solve_sparse!`. Lazy-grown on first call so
     # the wrapper's working set stays O(n*block) instead of O(n*nrhs); see
     # `solve_sparse_rhs.jl`.
     scratch::Matrix{Tv}
-    col_map::Vector{Int64}
+    col_map::Vector{Ti}
 end
 
-@inline _dim(cache::KLULinSolveCache) = Int64(length(cache.colptr) - 1)
+@inline _dim(cache::KLULinSolveCache{Tv, Ti}) where {Tv, Ti} =
+    Ti(length(cache.colptr) - 1)
 
-Base.size(cache::KLULinSolveCache) = (n = _dim(cache); (n, n))
-Base.size(cache::KLULinSolveCache, d::Integer) = d <= 2 ? _dim(cache) : 1
-Base.eltype(::Type{KLULinSolveCache{Tv}}) where {Tv} = Tv
+Base.size(cache::KLULinSolveCache) = (n = Int(_dim(cache)); (n, n))
+Base.size(cache::KLULinSolveCache, d::Integer) =
+    d <= 2 ? Int(_dim(cache)) : 1
+Base.eltype(::Type{KLULinSolveCache{Tv, Ti}}) where {Tv, Ti} = Tv
 get_reuse_symbolic(cache::KLULinSolveCache) = cache.reuse_symbolic
 
 """
@@ -53,37 +71,363 @@ is_factored(cache::KLULinSolveCache) =
 # truthy form. Kept around for tests that want a uniform numeric reading.
 n_valid(cache::KLULinSolveCache) = is_factored(cache) ? 1 : 0
 
-@inline _factor_call(::Type{Float64}, ap, ai, ax, sym, common) =
-    klu_l_factor(ap, ai, ax, sym, common)
-@inline _factor_call(::Type{ComplexF64}, ap, ai, ax, sym, common) =
-    klu_zl_factor(ap, ai, ax, sym, common)
+# ---------------------------------------------------------------------------
+# Type-paired dispatch helpers — (Tv, Ti) → libklu entry point
+# ---------------------------------------------------------------------------
+
+# Map Ti to its concrete `klu_common` struct type.
+@inline _common_type(::Type{Int32}) = KluCommon
+@inline _common_type(::Type{Int64}) = KluLCommon
+
+# `klu_defaults` initializer.
+@inline _defaults!(::Type{Int32}, common::Ref) = klu_defaults!(common)
+@inline _defaults!(::Type{Int64}, common::Ref) = klu_l_defaults!(common)
+
+# `klu_analyze` returns the symbolic handle. n is widened to `Ti` so the
+# C argument width matches.
+@inline _analyze_call(::Type{Int32}, n, ap, ai, common) =
+    klu_analyze(Cint(n), ap, ai, common)
+@inline _analyze_call(::Type{Int64}, n, ap, ai, common) =
+    klu_l_analyze(Int64(n), ap, ai, common)
+
+# `klu_free_symbolic` — takes the opaque pointer ref and the common ref.
+# `sym_ref` is a `Ref{Ptr{Cvoid}}` on the Julia side; we reinterpret it
+# to the typed `SymbolicPtr` / `SymbolicPtr32` at the ccall site so libklu
+# sees the right pointer width.
+@inline function _free_symbolic!(::Type{Int32}, sym_ref::Ref{Ptr{Cvoid}}, common::Ref)
+    typed = Ref(reinterpret(SymbolicPtr32, sym_ref[]))
+    klu_free_symbolic!(typed, common)
+    sym_ref[] = reinterpret(Ptr{Cvoid}, typed[])
+    return nothing
+end
+@inline function _free_symbolic!(::Type{Int64}, sym_ref::Ref{Ptr{Cvoid}}, common::Ref)
+    typed = Ref(reinterpret(SymbolicPtr, sym_ref[]))
+    klu_l_free_symbolic!(typed, common)
+    sym_ref[] = reinterpret(Ptr{Cvoid}, typed[])
+    return nothing
+end
 
-@inline _refactor_call(::Type{Float64}, ap, ai, ax, sym, num, common) =
-    klu_l_refactor(ap, ai, ax, sym, num, common)
-@inline _refactor_call(::Type{ComplexF64}, ap, ai, ax, sym, num, common) =
-    klu_zl_refactor(ap, ai, ax, sym, num, common)
+# `klu_factor` — returns the numeric handle as a typed pointer; we
+# reinterpret to `Ptr{Cvoid}` for storage. Dispatch on both Tv and Ti.
+@inline function _factor_call(::Type{Float64}, ::Type{Int32}, ap, ai, ax, sym, common)
+    return reinterpret(
+        Ptr{Cvoid},
+        klu_factor(ap, ai, ax, reinterpret(SymbolicPtr32, sym), common),
+    )
+end
+@inline function _factor_call(::Type{Float64}, ::Type{Int64}, ap, ai, ax, sym, common)
+    return reinterpret(
+        Ptr{Cvoid},
+        klu_l_factor(ap, ai, ax, reinterpret(SymbolicPtr, sym), common),
+    )
+end
+@inline function _factor_call(
+    ::Type{ComplexF64},
+    ::Type{Int32},
+    ap,
+    ai,
+    ax,
+    sym,
+    common,
+)
+    return reinterpret(
+        Ptr{Cvoid},
+        klu_z_factor(ap, ai, ax, reinterpret(SymbolicPtr32, sym), common),
+    )
+end
+@inline function _factor_call(
+    ::Type{ComplexF64},
+    ::Type{Int64},
+    ap,
+    ai,
+    ax,
+    sym,
+    common,
+)
+    return reinterpret(
+        Ptr{Cvoid},
+        klu_zl_factor(ap, ai, ax, reinterpret(SymbolicPtr, sym), common),
+    )
+end
 
-@inline _solve_call(::Type{Float64}, sym, num, n, nrhs, b, common) =
-    klu_l_solve(sym, num, n, nrhs, b, common)
-@inline _solve_call(::Type{ComplexF64}, sym, num, n, nrhs, b, common) =
-    klu_zl_solve(sym, num, n, nrhs, b, common)
+@inline function _refactor_call(
+    ::Type{Float64},
+    ::Type{Int32},
+    ap,
+    ai,
+    ax,
+    sym,
+    num,
+    common,
+)
+    return klu_refactor(
+        ap, ai, ax,
+        reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num),
+        common,
+    )
+end
+@inline function _refactor_call(
+    ::Type{Float64},
+    ::Type{Int64},
+    ap,
+    ai,
+    ax,
+    sym,
+    num,
+    common,
+)
+    return klu_l_refactor(
+        ap, ai, ax,
+        reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num),
+        common,
+    )
+end
+@inline function _refactor_call(
+    ::Type{ComplexF64},
+    ::Type{Int32},
+    ap,
+    ai,
+    ax,
+    sym,
+    num,
+    common,
+)
+    return klu_z_refactor(
+        ap, ai, ax,
+        reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num),
+        common,
+    )
+end
+@inline function _refactor_call(
+    ::Type{ComplexF64},
+    ::Type{Int64},
+    ap,
+    ai,
+    ax,
+    sym,
+    num,
+    common,
+)
+    return klu_zl_refactor(
+        ap, ai, ax,
+        reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num),
+        common,
+    )
+end
 
-@inline _tsolve_call(::Type{Float64}, sym, num, n, nrhs, b, common; conjugate = false) =
-    klu_l_tsolve(sym, num, n, nrhs, b, common)
-@inline _tsolve_call(::Type{ComplexF64}, sym, num, n, nrhs, b, common; conjugate = false) =
-    klu_zl_tsolve(sym, num, n, nrhs, b, Cint(conjugate), common)
+@inline function _solve_call(
+    ::Type{Float64},
+    ::Type{Int32},
+    sym,
+    num,
+    n,
+    nrhs,
+    b,
+    common,
+)
+    return klu_solve(
+        reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num),
+        Cint(n), Cint(nrhs), b, common,
+    )
+end
+@inline function _solve_call(
+    ::Type{Float64},
+    ::Type{Int64},
+    sym,
+    num,
+    n,
+    nrhs,
+    b,
+    common,
+)
+    return klu_l_solve(
+        reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num),
+        Int64(n), Int64(nrhs), b, common,
+    )
+end
+@inline function _solve_call(
+    ::Type{ComplexF64},
+    ::Type{Int32},
+    sym,
+    num,
+    n,
+    nrhs,
+    b,
+    common,
+)
+    return klu_z_solve(
+        reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num),
+        Cint(n), Cint(nrhs), b, common,
+    )
+end
+@inline function _solve_call(
+    ::Type{ComplexF64},
+    ::Type{Int64},
+    sym,
+    num,
+    n,
+    nrhs,
+    b,
+    common,
+)
+    return klu_zl_solve(
+        reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num),
+        Int64(n), Int64(nrhs), b, common,
+    )
+end
 
-@inline _free_numeric!(::Type{Float64}, num_ref, common) =
-    klu_l_free_numeric!(num_ref, common)
-@inline _free_numeric!(::Type{ComplexF64}, num_ref, common) =
-    klu_zl_free_numeric!(num_ref, common)
+@inline function _tsolve_call(
+    ::Type{Float64},
+    ::Type{Int32},
+    sym,
+    num,
+    n,
+    nrhs,
+    b,
+    common;
+    conjugate::Bool = false,
+)
+    return klu_tsolve(
+        reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num),
+        Cint(n), Cint(nrhs), b, common,
+    )
+end
+@inline function _tsolve_call(
+    ::Type{Float64},
+    ::Type{Int64},
+    sym,
+    num,
+    n,
+    nrhs,
+    b,
+    common;
+    conjugate::Bool = false,
+)
+    return klu_l_tsolve(
+        reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num),
+        Int64(n), Int64(nrhs), b, common,
+    )
+end
+@inline function _tsolve_call(
+    ::Type{ComplexF64},
+    ::Type{Int32},
+    sym,
+    num,
+    n,
+    nrhs,
+    b,
+    common;
+    conjugate::Bool = false,
+)
+    return klu_z_tsolve(
+        reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num),
+        Cint(n), Cint(nrhs), b, Cint(conjugate), common,
+    )
+end
+@inline function _tsolve_call(
+    ::Type{ComplexF64},
+    ::Type{Int64},
+    sym,
+    num,
+    n,
+    nrhs,
+    b,
+    common;
+    conjugate::Bool = false,
+)
+    return klu_zl_tsolve(
+        reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num),
+        Int64(n), Int64(nrhs), b, Cint(conjugate), common,
+    )
+end
+
+@inline function _free_numeric!(
+    ::Type{Float64},
+    ::Type{Int32},
+    num_ref::Ref{Ptr{Cvoid}},
+    common::Ref,
+)
+    typed = Ref(reinterpret(NumericPtr32, num_ref[]))
+    klu_free_numeric!(typed, common)
+    num_ref[] = reinterpret(Ptr{Cvoid}, typed[])
+    return nothing
+end
+@inline function _free_numeric!(
+    ::Type{Float64},
+    ::Type{Int64},
+    num_ref::Ref{Ptr{Cvoid}},
+    common::Ref,
+)
+    typed = Ref(reinterpret(NumericPtr, num_ref[]))
+    klu_l_free_numeric!(typed, common)
+    num_ref[] = reinterpret(Ptr{Cvoid}, typed[])
+    return nothing
+end
+@inline function _free_numeric!(
+    ::Type{ComplexF64},
+    ::Type{Int32},
+    num_ref::Ref{Ptr{Cvoid}},
+    common::Ref,
+)
+    typed = Ref(reinterpret(NumericPtr32, num_ref[]))
+    klu_z_free_numeric!(typed, common)
+    num_ref[] = reinterpret(Ptr{Cvoid}, typed[])
+    return nothing
+end
+@inline function _free_numeric!(
+    ::Type{ComplexF64},
+    ::Type{Int64},
+    num_ref::Ref{Ptr{Cvoid}},
+    common::Ref,
+)
+    typed = Ref(reinterpret(NumericPtr, num_ref[]))
+    klu_zl_free_numeric!(typed, common)
+    num_ref[] = reinterpret(Ptr{Cvoid}, typed[])
+    return nothing
+end
+
+# Performance-knob dispatchers — Float64 only. libklu exposes these in the
+# ComplexF64 build too; we'll add bindings if a consumer asks.
+@inline _sort_call(::Type{Int32}, sym, num, common) =
+    klu_sort(reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), common)
+@inline _sort_call(::Type{Int64}, sym, num, common) =
+    klu_l_sort(reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), common)
+
+@inline _condest_call(::Type{Int32}, ap, ax, sym, num, common) =
+    klu_condest(
+        ap,
+        ax,
+        reinterpret(SymbolicPtr32, sym),
+        reinterpret(NumericPtr32, num),
+        common,
+    )
+@inline _condest_call(::Type{Int64}, ap, ax, sym, num, common) =
+    klu_l_condest(
+        ap,
+        ax,
+        reinterpret(SymbolicPtr, sym),
+        reinterpret(NumericPtr, num),
+        common,
+    )
+
+@inline _rcond_call(::Type{Int32}, sym, num, common) =
+    klu_rcond(reinterpret(SymbolicPtr32, sym), reinterpret(NumericPtr32, num), common)
+@inline _rcond_call(::Type{Int64}, sym, num, common) =
+    klu_l_rcond(reinterpret(SymbolicPtr, sym), reinterpret(NumericPtr, num), common)
+
+# ---------------------------------------------------------------------------
+# Constructor
+# ---------------------------------------------------------------------------
 
 """
     KLULinSolveCache(A; reuse_symbolic=true, check_pattern=true)
 
-Build a cache for the sparse matrix `A`. Allocates structural arrays and
-runs `klu_l_defaults`, but does **not** factorize. Call `full_factor!`
-(or `symbolic_factor!` followed by `numeric_refactor!`) before `solve!`.
+Build a cache for the sparse matrix `A`. The cache's index type is taken
+from `A`: `SparseMatrixCSC{Tv, Int32}` ⇒ `KLULinSolveCache{Tv, Int32}`,
+`SparseMatrixCSC{Tv, Int64}` ⇒ `KLULinSolveCache{Tv, Int64}`. Allocates
+structural arrays and runs the corresponding `klu_defaults`/`klu_l_defaults`
+initializer, but does **not** factorize. Call `full_factor!` (or
+`symbolic_factor!` followed by `numeric_refactor!`) before `solve!`.
 
 A finalizer frees libklu handles on GC; call `Base.finalize(cache)` to
 release them eagerly. Releasing the handles leaves Julia-side state intact,
@@ -91,34 +435,30 @@ so the cache can be re-factorized via `symbolic_factor!`/`numeric_refactor!`
 or `full_factor!`.
 """
 function KLULinSolveCache(
-    A::SparseMatrixCSC{Tv, Int};
+    A::SparseMatrixCSC{Tv, Ti};
     reuse_symbolic::Bool = true,
     check_pattern::Bool = true,
-) where {Tv <: Union{Float64, ComplexF64}}
-    Int === Int64 || error(
-        "KLULinSolveCache requires a 64-bit Julia build (SuiteSparse_long " *
-        "bindings need Int64 indices). Got Int = $(Int).",
-    )
+) where {Tv <: Union{Float64, ComplexF64}, Ti <: Union{Int32, Int64}}
     n = size(A, 1)
-    n == size(A, 2) || throw(DimensionMismatch("matrix must be square; got $(size(A))"))
+    n == size(A, 2) ||
+        throw(DimensionMismatch("matrix must be square; got $(size(A))"))
 
-    common = Ref(KluLCommon())
-    klu_l_defaults!(common)
+    common = Ref(_common_type(Ti)())
+    _defaults!(Ti, common)
 
-    colptr = Vector{Int64}(undef, length(getcolptr(A)))
+    colptr = Vector{Ti}(undef, length(getcolptr(A)))
     copyto!(colptr, getcolptr(A))
-    colptr .-= 1
-    rowval = Vector{Int64}(undef, length(rowvals(A)))
+    colptr .-= one(Ti)
+    rowval = Vector{Ti}(undef, length(rowvals(A)))
     copyto!(rowval, rowvals(A))
-    rowval .-= 1
+    rowval .-= one(Ti)
 
-    cache = KLULinSolveCache{Tv}(
+    cache = KLULinSolveCache{Tv, Ti}(
         colptr, rowval, Tv[], common,
-        convert(SymbolicPtr, C_NULL),
-        convert(NumericPtr, C_NULL),
+        Ptr{Cvoid}(C_NULL), Ptr{Cvoid}(C_NULL),
         reuse_symbolic, check_pattern,
         Matrix{Tv}(undef, 0, 0),
-        Int64[],
+        Ti[],
     )
     finalizer(_free_klu_handles!, cache)
     return cache
@@ -131,10 +471,10 @@ Ensure `cache.scratch` is at least `n × block` and `cache.col_map` length
 `block`. Grows in place; reuses across `solve_sparse!` calls.
 """
 @inline function _ensure_scratch!(
-    cache::KLULinSolveCache{Tv},
+    cache::KLULinSolveCache{Tv, Ti},
     block::Int,
-) where {Tv <: Union{Float64, ComplexF64}}
-    n = _dim(cache)
+) where {Tv, Ti}
+    n = Int(_dim(cache))
     s = cache.scratch
     if size(s, 1) != n || size(s, 2) < block
         cache.scratch = Matrix{Tv}(undef, n, block)
@@ -153,16 +493,16 @@ second call hits the `C_NULL` guards. Used both by `symbolic_factor!`
 mid-life (drop old handles before re-analyzing) and by the GC finalizer.
 """
 function _free_klu_handles!(
-    cache::KLULinSolveCache{Tv},
-) where {Tv <: Union{Float64, ComplexF64}}
+    cache::KLULinSolveCache{Tv, Ti},
+) where {Tv, Ti}
     if cache.numeric != C_NULL
         num_ref = Ref(cache.numeric)
-        _free_numeric!(Tv, num_ref, cache.common)
+        _free_numeric!(Tv, Ti, num_ref, cache.common)
         cache.numeric = num_ref[]
     end
     if cache.symbolic != C_NULL
         sym_ref = Ref(cache.symbolic)
-        klu_l_free_symbolic!(sym_ref, cache.common)
+        _free_symbolic!(Ti, sym_ref, cache.common)
         cache.symbolic = sym_ref[]
     end
     return nothing
@@ -180,8 +520,8 @@ scope. Requires that a numeric factor has been built before (so `cache.nzval`
 is populated) and the symbolic factor is still valid.
 """
 function _recover_factorization!(
-    cache::KLULinSolveCache{Tv},
-) where {Tv <: Union{Float64, ComplexF64}}
+    cache::KLULinSolveCache{Tv, Ti},
+) where {Tv, Ti}
     cache.symbolic == C_NULL && error(
         "KLULinSolveCache: cannot recover without a symbolic factor.",
     )
@@ -190,11 +530,12 @@ function _recover_factorization!(
     )
     if cache.numeric != C_NULL
         num_ref = Ref(cache.numeric)
-        _free_numeric!(Tv, num_ref, cache.common)
+        _free_numeric!(Tv, Ti, num_ref, cache.common)
         cache.numeric = num_ref[]
     end
     num = _factor_call(
-        Tv, pointer(cache.colptr), pointer(cache.rowval),
+        Tv, Ti,
+        pointer(cache.colptr), pointer(cache.rowval),
         pointer(cache.nzval), cache.symbolic, cache.common,
     )
     num == C_NULL && klu_throw(cache.common[], "klu_factor (recovery)")
@@ -202,8 +543,11 @@ function _recover_factorization!(
     return cache
 end
 
-@inline function _check_pattern_match(cache::KLULinSolveCache,
-    A::SparseMatrixCSC, op::AbstractString)
+@inline function _check_pattern_match(
+    cache::KLULinSolveCache{Tv, Ti},
+    A::SparseMatrixCSC,
+    op::AbstractString,
+) where {Tv, Ti}
     Acolptr = getcolptr(A)
     Arowval = rowvals(A)
     if length(Acolptr) != length(cache.colptr) ||
@@ -217,13 +561,13 @@ end
     # Increment-compare-decrement: avoids allocating a 1-indexed copy. The
     # `try/finally` restores `colptr`/`rowval` even on InterruptException so
     # the cache is never left with off-by-one structural arrays.
-    cache.colptr .+= 1
-    cache.rowval .+= 1
+    cache.colptr .+= one(Ti)
+    cache.rowval .+= one(Ti)
     bad = try
         (cache.colptr != Acolptr) || (cache.rowval != Arowval)
     finally
-        cache.colptr .-= 1
-        cache.rowval .-= 1
+        cache.colptr .-= one(Ti)
+        cache.rowval .-= one(Ti)
     end
     if bad
         throw(ArgumentError(
@@ -237,15 +581,19 @@ end
     symbolic_factor!(cache, A)
 
 Free any cached symbolic/numeric factor, replace the structural arrays with
-`A`'s pattern, and run `klu_l_analyze`.
+`A`'s pattern, and run `klu_analyze` / `klu_l_analyze`.
 """
-function symbolic_factor!(cache::KLULinSolveCache{Tv},
-    A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}}
+function symbolic_factor!(
+    cache::KLULinSolveCache{Tv, Ti},
+    A::SparseMatrixCSC{Tv, Ti},
+) where {Tv, Ti}
     n = _dim(cache)
-    if size(A, 1) != n || size(A, 2) != n
-        throw(DimensionMismatch(
-            "Cannot factor: cache is $(n)×$(n) but A is $(size(A)).",
-        ))
+    if size(A, 1) != Int(n) || size(A, 2) != Int(n)
+        throw(
+            DimensionMismatch(
+                "Cannot factor: cache is $(Int(n))×$(Int(n)) but A is $(size(A)).",
+            ),
+        )
     end
     _free_klu_handles!(cache)
 
@@ -253,16 +601,14 @@ function symbolic_factor!(cache::KLULinSolveCache{Tv},
     Arowval = rowvals(A)
     resize!(cache.colptr, length(Acolptr))
     copyto!(cache.colptr, Acolptr)
-    cache.colptr .-= 1
+    cache.colptr .-= one(Ti)
     resize!(cache.rowval, length(Arowval))
     copyto!(cache.rowval, Arowval)
-    cache.rowval .-= 1
+    cache.rowval .-= one(Ti)
 
-    sym = klu_l_analyze(
-        Int64(n), pointer(cache.colptr), pointer(cache.rowval), cache.common,
-    )
-    sym == C_NULL && klu_throw(cache.common[], "klu_l_analyze")
-    cache.symbolic = sym
+    sym = _analyze_call(Ti, n, pointer(cache.colptr), pointer(cache.rowval), cache.common)
+    sym == C_NULL && klu_throw(cache.common[], "klu_analyze")
+    cache.symbolic = reinterpret(Ptr{Cvoid}, sym)
     return cache
 end
 
@@ -272,17 +618,19 @@ end
 If `cache.reuse_symbolic`, optionally verify the structure matches and reuse
 the existing analysis. Otherwise, rerun `symbolic_factor!`.
 """
-function symbolic_refactor!(cache::KLULinSolveCache{Tv},
-    A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}}
+function symbolic_refactor!(
+    cache::KLULinSolveCache{Tv, Ti},
+    A::SparseMatrixCSC{Tv, Ti},
+) where {Tv, Ti}
     if !cache.reuse_symbolic
         return symbolic_factor!(cache, A)
     end
     if cache.check_pattern
         n = _dim(cache)
-        if size(A, 1) != n || size(A, 2) != n
+        if size(A, 1) != Int(n) || size(A, 2) != Int(n)
             throw(
                 DimensionMismatch(
-                    "Cannot refactor: cache is $(n)×$(n) but A is $(size(A)).",
+                    "Cannot refactor: cache is $(Int(n))×$(Int(n)) but A is $(size(A)).",
                 ),
             )
         end
@@ -298,15 +646,18 @@ Compute (or refresh) the numeric factorization. The first call after
 `symbolic_factor!` invokes `klu_*_factor`; subsequent calls invoke
 `klu_*_refactor` and reuse the existing numeric struct.
 """
-function numeric_refactor!(cache::KLULinSolveCache{Tv},
-    A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}}
+function numeric_refactor!(
+    cache::KLULinSolveCache{Tv, Ti},
+    A::SparseMatrixCSC{Tv, Ti},
+) where {Tv, Ti}
     cache.symbolic == C_NULL && error(
         "KLULinSolveCache: call symbolic_factor! before numeric_refactor!.",
     )
     Anz = nonzeros(A)
     if cache.numeric == C_NULL
         num = _factor_call(
-            Tv, pointer(cache.colptr), pointer(cache.rowval),
+            Tv, Ti,
+            pointer(cache.colptr), pointer(cache.rowval),
             pointer(Anz), cache.symbolic, cache.common,
         )
         num == C_NULL && klu_throw(cache.common[], "klu_factor")
@@ -314,7 +665,8 @@ function numeric_refactor!(cache::KLULinSolveCache{Tv},
     else
         cache.check_pattern && _check_pattern_match(cache, A, "numeric_refactor")
         ok = _refactor_call(
-            Tv, pointer(cache.colptr), pointer(cache.rowval),
+            Tv, Ti,
+            pointer(cache.colptr), pointer(cache.rowval),
             pointer(Anz), cache.symbolic, cache.numeric, cache.common,
         )
         ok != 1 && klu_throw(cache.common[], "klu_refactor")
@@ -334,8 +686,10 @@ Equivalent to `symbolic_factor!(cache, A); numeric_refactor!(cache, A)`. Use
 this on a freshly constructed cache, or after `_free_klu_handles!` has cleared
 the handles, to bring the cache to a factored state.
 """
-function full_factor!(cache::KLULinSolveCache{Tv},
-    A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}}
+function full_factor!(
+    cache::KLULinSolveCache{Tv, Ti},
+    A::SparseMatrixCSC{Tv, Ti},
+) where {Tv, Ti}
     symbolic_factor!(cache, A)
     numeric_refactor!(cache, A)
     return cache
@@ -351,8 +705,10 @@ the matrix values have changed; if the structure has also changed and the
 cache was built with `reuse_symbolic = false`, the symbolic analysis is rerun
 as well.
 """
-function full_refactor!(cache::KLULinSolveCache{Tv},
-    A::SparseMatrixCSC{Tv, Int}) where {Tv <: Union{Float64, ComplexF64}}
+function full_refactor!(
+    cache::KLULinSolveCache{Tv, Ti},
+    A::SparseMatrixCSC{Tv, Ti},
+) where {Tv, Ti}
     symbolic_refactor!(cache, A)
     numeric_refactor!(cache, A)
     return cache
@@ -363,11 +719,94 @@ end
 
 Build a cache for `A` and immediately compute the full factorization.
 """
-function klu_factorize(A::SparseMatrixCSC{Tv, Int};
+function klu_factorize(
+    A::SparseMatrixCSC{Tv, Ti};
     reuse_symbolic::Bool = true,
     check_pattern::Bool = true,
-) where {Tv <: Union{Float64, ComplexF64}}
-    cache = KLULinSolveCache(A;
-        reuse_symbolic = reuse_symbolic, check_pattern = check_pattern)
+) where {Tv <: Union{Float64, ComplexF64}, Ti <: Union{Int32, Int64}}
+    cache = KLULinSolveCache(
+        A;
+        reuse_symbolic = reuse_symbolic,
+        check_pattern = check_pattern,
+    )
     return full_factor!(cache, A)
 end
+
+# ---------------------------------------------------------------------------
+# Performance / diagnostic surface
+# ---------------------------------------------------------------------------
+
+"""
+    sort_factors!(cache) -> cache
+
+Sort the columns of the cached L and U factors in place via libklu's
+`klu_sort` / `klu_l_sort`. KLU's numeric phase stores factor columns in
+arbitrary order; sorting once after the first factor improves cache locality
+on every subsequent `solve!` / `tsolve!`. The cost is `O(nnz_factor)` and
+is amortized over many repeated solves — a win whenever the cache is used
+for ≥ a few solves on the same factorization.
+
+Idempotent and `refactor`-stable: sorting after the initial factor persists
+through `numeric_refactor!` because refactor preserves the column layout.
+Only call this if the cache will be reused for multiple solves; for a
+one-shot solve it is pure overhead.
+
+Float64 only.
+"""
+function sort_factors!(cache::KLULinSolveCache{Float64, Ti}) where {Ti}
+    is_factored(cache) ||
+        error("sort_factors!: cache must be factored before sorting.")
+    ok = _sort_call(Ti, cache.symbolic, cache.numeric, cache.common)
+    ok != 1 && klu_throw(cache.common[], "klu_sort")
+    return cache
+end
+
+"""
+    condest!(cache) -> Float64
+
+Compute the 1-norm condition-number estimate of the cached factorization
+via libklu's `klu_condest`. The result lands in `cache.common[].condest`
+and is also returned. Cost is roughly two extra solves; use sparingly.
+
+Useful when deciding whether iterative refinement is worth running, or for
+flagging near-singular Jacobians in Newton-Raphson loops.
+
+Float64 only.
+"""
+function condest!(
+    cache::KLULinSolveCache{Float64, Ti},
+) where {Ti}
+    is_factored(cache) ||
+        error("condest!: cache must be factored before condest.")
+    isempty(cache.nzval) && error(
+        "condest!: requires a previous numeric_refactor! to have populated nzval.",
+    )
+    ok = _condest_call(
+        Ti,
+        pointer(cache.colptr),
+        pointer(cache.nzval),
+        cache.symbolic,
+        cache.numeric,
+        cache.common,
+    )
+    ok != 1 && klu_throw(cache.common[], "klu_condest")
+    return Float64(cache.common[].condest)
+end
+
+"""
+    rcond!(cache) -> Float64
+
+Compute the cheap reciprocal-condition estimate
+`min(|diag(U)|)/max(|diag(U)|)` via libklu's `klu_rcond`. The result lands
+in `cache.common[].rcond` and is also returned. Faster than `condest!` but
+less reliable as a conditioning indicator.
+
+Float64 only.
+"""
+function rcond!(cache::KLULinSolveCache{Float64, Ti}) where {Ti}
+    is_factored(cache) ||
+        error("rcond!: cache must be factored before rcond.")
+    ok = _rcond_call(Ti, cache.symbolic, cache.numeric, cache.common)
+    ok != 1 && klu_throw(cache.common[], "klu_rcond")
+    return Float64(cache.common[].rcond)
+end
diff --git a/src/KLUWrapper/klu_jll_bindings.jl b/src/KLUWrapper/klu_jll_bindings.jl
index ff0ed0a92..b525a6572 100644
--- a/src/KLUWrapper/klu_jll_bindings.jl
+++ b/src/KLUWrapper/klu_jll_bindings.jl
@@ -1,9 +1,32 @@
-# Bindings into libklu (SuiteSparse_jll), restricted to the SuiteSparse_long
-# (`klu_l_*` and `klu_zl_*`) entry points used by KLULinSolveCache.
+# Bindings into libklu (SuiteSparse_jll), covering both index-type
+# entry-point families:
+#   - `klu_l_*` / `klu_zl_*` for `SuiteSparse_long` (Int64) indices
+#   - `klu_*`   / `klu_z_*`  for `int`              (Int32) indices
+#
+# Each ccall is wrapped in `@klu_lock` so that all libklu activity in the
+# process serializes through `_LIBKLU_LOCK`. This includes finalizer paths
+# (`klu_*_free_*`), which can fire on any thread at any safepoint and would
+# otherwise race against in-flight `solve!` calls on a different cache. See
+# `_LIBKLU_LOCK` in `KLUWrapper.jl` for the empirical evidence (intermittent
+# `KLU_INVALID` return; SEGV at `klu_solve.c:118`).
 
 import LinearAlgebra
 import SuiteSparse_jll: libklu
 
+# ---------------------------------------------------------------------------
+# Status codes and shared error helper
+# ---------------------------------------------------------------------------
+
+const KLU_OK = 0
+const KLU_SINGULAR = 1
+const KLU_OUT_OF_MEMORY = -2
+const KLU_INVALID = -3
+const KLU_TOO_LARGE = -4
+
+# ---------------------------------------------------------------------------
+# Int64 (SuiteSparse_long) family
+# ---------------------------------------------------------------------------
+
 # Layout matches `klu_l_common` in upstream `klu.h`. Must stay in sync.
 mutable struct KluLCommon
     tol::Cdouble
@@ -40,19 +63,15 @@ mutable struct KluLNumeric end
 const SymbolicPtr = Ptr{KluLSymbolic}
 const NumericPtr = Ptr{KluLNumeric}
 
-# Each ccall is wrapped in `@klu_lock` so that all libklu activity in the
-# process serializes through `_LIBKLU_LOCK`. This includes finalizer paths
-# (`klu_l_free_*`, `klu_zl_free_*`), which can fire on any thread at any
-# safepoint and would otherwise race against in-flight `solve!` calls on a
-# different cache. See `_LIBKLU_LOCK` in `KLUWrapper.jl` for the
-# empirical evidence (intermittent `KLU_INVALID` return; SEGV at
-# `klu_solve.c:118`).
-
 klu_l_defaults!(common::Ref{KluLCommon}) =
     @klu_lock ccall((:klu_l_defaults, libklu), Cint, (Ptr{KluLCommon},), common)
 
-function klu_l_analyze(n::Int64, ap::Ptr{Int64}, ai::Ptr{Int64},
-    common::Ref{KluLCommon})
+function klu_l_analyze(
+    n::Int64,
+    ap::Ptr{Int64},
+    ai::Ptr{Int64},
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_l_analyze, libklu),
         SymbolicPtr,
@@ -61,8 +80,10 @@ function klu_l_analyze(n::Int64, ap::Ptr{Int64}, ai::Ptr{Int64},
     )
 end
 
-function klu_l_free_symbolic!(symbolic_ref::Ref{SymbolicPtr},
-    common::Ref{KluLCommon})
+function klu_l_free_symbolic!(
+    symbolic_ref::Ref{SymbolicPtr},
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_l_free_symbolic, libklu),
         Cint,
@@ -71,8 +92,13 @@ function klu_l_free_symbolic!(symbolic_ref::Ref{SymbolicPtr},
     )
 end
 
-function klu_l_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{Cdouble},
-    symbolic::SymbolicPtr, common::Ref{KluLCommon})
+function klu_l_factor(
+    ap::Ptr{Int64},
+    ai::Ptr{Int64},
+    ax::Ptr{Cdouble},
+    symbolic::SymbolicPtr,
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_l_factor, libklu),
         NumericPtr,
@@ -81,19 +107,37 @@ function klu_l_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{Cdouble},
     )
 end
 
-function klu_l_refactor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{Cdouble},
-    symbolic::SymbolicPtr, numeric::NumericPtr, common::Ref{KluLCommon})
+function klu_l_refactor(
+    ap::Ptr{Int64},
+    ai::Ptr{Int64},
+    ax::Ptr{Cdouble},
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_l_refactor, libklu),
         Cint,
-        (Ptr{Int64}, Ptr{Int64}, Ptr{Cdouble}, SymbolicPtr, NumericPtr,
-            Ptr{KluLCommon}),
+        (
+            Ptr{Int64},
+            Ptr{Int64},
+            Ptr{Cdouble},
+            SymbolicPtr,
+            NumericPtr,
+            Ptr{KluLCommon},
+        ),
         ap, ai, ax, symbolic, numeric, common,
     )
 end
 
-function klu_l_solve(symbolic::SymbolicPtr, numeric::NumericPtr,
-    ldim::Int64, nrhs::Int64, b::Ptr{Cdouble}, common::Ref{KluLCommon})
+function klu_l_solve(
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    ldim::Int64,
+    nrhs::Int64,
+    b::Ptr{Cdouble},
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_l_solve, libklu),
         Cint,
@@ -102,8 +146,14 @@ function klu_l_solve(symbolic::SymbolicPtr, numeric::NumericPtr,
     )
 end
 
-function klu_l_tsolve(symbolic::SymbolicPtr, numeric::NumericPtr,
-    ldim::Int64, nrhs::Int64, b::Ptr{Cdouble}, common::Ref{KluLCommon})
+function klu_l_tsolve(
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    ldim::Int64,
+    nrhs::Int64,
+    b::Ptr{Cdouble},
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_l_tsolve, libklu),
         Cint,
@@ -112,8 +162,10 @@ function klu_l_tsolve(symbolic::SymbolicPtr, numeric::NumericPtr,
     )
 end
 
-function klu_l_free_numeric!(numeric_ref::Ref{NumericPtr},
-    common::Ref{KluLCommon})
+function klu_l_free_numeric!(
+    numeric_ref::Ref{NumericPtr},
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_l_free_numeric, libklu),
         Cint,
@@ -122,8 +174,69 @@ function klu_l_free_numeric!(numeric_ref::Ref{NumericPtr},
     )
 end
 
-function klu_zl_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{ComplexF64},
-    symbolic::SymbolicPtr, common::Ref{KluLCommon})
+# Performance knobs / diagnostics — Int64 family.
+
+# klu_l_sort sorts the columns of the L and U factors in place. KLU's
+# numeric phase stores L/U columns in arbitrary order; sorting once after
+# the factor improves cache locality on every subsequent solve. Cheap
+# (O(nnz_factor)) and idempotent. Call once after the first numeric
+# factor; refactor preserves the sort.
+function klu_l_sort(
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    common::Ref{KluLCommon},
+)
+    return @klu_lock ccall(
+        (:klu_l_sort, libklu),
+        Cint,
+        (SymbolicPtr, NumericPtr, Ptr{KluLCommon}),
+        symbolic, numeric, common,
+    )
+end
+
+# klu_l_condest computes a 1-norm condition number estimate, populating
+# `common.condest`. Costs roughly two extra solves. Useful for iterative
+# refinement (informs tolerance choice) and as a diagnostic for near-singular
+# matrices. Caller reads result from `cache.common[].condest`.
+function klu_l_condest(
+    ap::Ptr{Int64},
+    ax::Ptr{Cdouble},
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    common::Ref{KluLCommon},
+)
+    return @klu_lock ccall(
+        (:klu_l_condest, libklu),
+        Cint,
+        (Ptr{Int64}, Ptr{Cdouble}, SymbolicPtr, NumericPtr, Ptr{KluLCommon}),
+        ap, ax, symbolic, numeric, common,
+    )
+end
+
+# klu_l_rcond fills `common.rcond` with the cheap diagonal-ratio
+# reciprocal condition estimate (min(|diag(U)|)/max(|diag(U)|)). Faster
+# than condest but less reliable.
+function klu_l_rcond(
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    common::Ref{KluLCommon},
+)
+    return @klu_lock ccall(
+        (:klu_l_rcond, libklu),
+        Cint,
+        (SymbolicPtr, NumericPtr, Ptr{KluLCommon}),
+        symbolic, numeric, common,
+    )
+end
+
+# Complex / Int64
+function klu_zl_factor(
+    ap::Ptr{Int64},
+    ai::Ptr{Int64},
+    ax::Ptr{ComplexF64},
+    symbolic::SymbolicPtr,
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_zl_factor, libklu),
         NumericPtr,
@@ -132,42 +245,74 @@ function klu_zl_factor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{ComplexF64},
     )
 end
 
-function klu_zl_refactor(ap::Ptr{Int64}, ai::Ptr{Int64}, ax::Ptr{ComplexF64},
-    symbolic::SymbolicPtr, numeric::NumericPtr, common::Ref{KluLCommon})
+function klu_zl_refactor(
+    ap::Ptr{Int64},
+    ai::Ptr{Int64},
+    ax::Ptr{ComplexF64},
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_zl_refactor, libklu),
         Cint,
-        (Ptr{Int64}, Ptr{Int64}, Ptr{ComplexF64}, SymbolicPtr, NumericPtr,
-            Ptr{KluLCommon}),
+        (
+            Ptr{Int64},
+            Ptr{Int64},
+            Ptr{ComplexF64},
+            SymbolicPtr,
+            NumericPtr,
+            Ptr{KluLCommon},
+        ),
         ap, ai, ax, symbolic, numeric, common,
     )
 end
 
-function klu_zl_solve(symbolic::SymbolicPtr, numeric::NumericPtr,
-    ldim::Int64, nrhs::Int64, b::Ptr{ComplexF64}, common::Ref{KluLCommon})
+function klu_zl_solve(
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    ldim::Int64,
+    nrhs::Int64,
+    b::Ptr{ComplexF64},
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_zl_solve, libklu),
         Cint,
-        (SymbolicPtr, NumericPtr, Int64, Int64, Ptr{ComplexF64},
-            Ptr{KluLCommon}),
+        (SymbolicPtr, NumericPtr, Int64, Int64, Ptr{ComplexF64}, Ptr{KluLCommon}),
         symbolic, numeric, ldim, nrhs, b, common,
     )
 end
 
-function klu_zl_tsolve(symbolic::SymbolicPtr, numeric::NumericPtr,
-    ldim::Int64, nrhs::Int64, b::Ptr{ComplexF64}, conj_solve::Cint,
-    common::Ref{KluLCommon})
+function klu_zl_tsolve(
+    symbolic::SymbolicPtr,
+    numeric::NumericPtr,
+    ldim::Int64,
+    nrhs::Int64,
+    b::Ptr{ComplexF64},
+    conj_solve::Cint,
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_zl_tsolve, libklu),
         Cint,
-        (SymbolicPtr, NumericPtr, Int64, Int64, Ptr{ComplexF64}, Cint,
-            Ptr{KluLCommon}),
+        (
+            SymbolicPtr,
+            NumericPtr,
+            Int64,
+            Int64,
+            Ptr{ComplexF64},
+            Cint,
+            Ptr{KluLCommon},
+        ),
         symbolic, numeric, ldim, nrhs, b, conj_solve, common,
     )
 end
 
-function klu_zl_free_numeric!(numeric_ref::Ref{NumericPtr},
-    common::Ref{KluLCommon})
+function klu_zl_free_numeric!(
+    numeric_ref::Ref{NumericPtr},
+    common::Ref{KluLCommon},
+)
     return @klu_lock ccall(
         (:klu_zl_free_numeric, libklu),
         Cint,
@@ -176,14 +321,305 @@ function klu_zl_free_numeric!(numeric_ref::Ref{NumericPtr},
     )
 end
 
-# Status codes from klu.h.
-const KLU_OK = 0
-const KLU_SINGULAR = 1
-const KLU_OUT_OF_MEMORY = -2
-const KLU_INVALID = -3
-const KLU_TOO_LARGE = -4
+# ---------------------------------------------------------------------------
+# Int32 (int) family — mirror of the Int64 set above
+# ---------------------------------------------------------------------------
+
+# Layout matches `klu_common` (Int32 path) in upstream `klu.h`. Differs from
+# `KluLCommon` only in the four rank/diag fields, which are `int` instead of
+# `SuiteSparse_long`.
+mutable struct KluCommon
+    tol::Cdouble
+    memgrow::Cdouble
+    initmem_amd::Cdouble
+    initmem::Cdouble
+    maxwork::Cdouble
+    btf::Cint
+    ordering::Cint
+    scale::Cint
+    user_order::Ptr{Cvoid}
+    user_data::Ptr{Cvoid}
+    halt_if_singular::Cint
+    status::Cint
+    nrealloc::Cint
+    structural_rank::Cint
+    numerical_rank::Cint
+    singular_col::Cint
+    noffdiag::Cint
+    flops::Cdouble
+    rcond::Cdouble
+    condest::Cdouble
+    rgrowth::Cdouble
+    work::Cdouble
+    memusage::Csize_t
+    mempeak::Csize_t
+    KluCommon() = new()
+end
+
+mutable struct KluSymbolic end
+mutable struct KluNumeric end
+
+const SymbolicPtr32 = Ptr{KluSymbolic}
+const NumericPtr32 = Ptr{KluNumeric}
+
+klu_defaults!(common::Ref{KluCommon}) =
+    @klu_lock ccall((:klu_defaults, libklu), Cint, (Ptr{KluCommon},), common)
+
+function klu_analyze(
+    n::Cint,
+    ap::Ptr{Cint},
+    ai::Ptr{Cint},
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_analyze, libklu),
+        SymbolicPtr32,
+        (Cint, Ptr{Cint}, Ptr{Cint}, Ptr{KluCommon}),
+        n, ap, ai, common,
+    )
+end
+
+function klu_free_symbolic!(
+    symbolic_ref::Ref{SymbolicPtr32},
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_free_symbolic, libklu),
+        Cint,
+        (Ptr{SymbolicPtr32}, Ptr{KluCommon}),
+        symbolic_ref, common,
+    )
+end
+
+function klu_factor(
+    ap::Ptr{Cint},
+    ai::Ptr{Cint},
+    ax::Ptr{Cdouble},
+    symbolic::SymbolicPtr32,
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_factor, libklu),
+        NumericPtr32,
+        (Ptr{Cint}, Ptr{Cint}, Ptr{Cdouble}, SymbolicPtr32, Ptr{KluCommon}),
+        ap, ai, ax, symbolic, common,
+    )
+end
+
+function klu_refactor(
+    ap::Ptr{Cint},
+    ai::Ptr{Cint},
+    ax::Ptr{Cdouble},
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_refactor, libklu),
+        Cint,
+        (
+            Ptr{Cint},
+            Ptr{Cint},
+            Ptr{Cdouble},
+            SymbolicPtr32,
+            NumericPtr32,
+            Ptr{KluCommon},
+        ),
+        ap, ai, ax, symbolic, numeric, common,
+    )
+end
+
+function klu_solve(
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    ldim::Cint,
+    nrhs::Cint,
+    b::Ptr{Cdouble},
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_solve, libklu),
+        Cint,
+        (SymbolicPtr32, NumericPtr32, Cint, Cint, Ptr{Cdouble}, Ptr{KluCommon}),
+        symbolic, numeric, ldim, nrhs, b, common,
+    )
+end
+
+function klu_tsolve(
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    ldim::Cint,
+    nrhs::Cint,
+    b::Ptr{Cdouble},
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_tsolve, libklu),
+        Cint,
+        (SymbolicPtr32, NumericPtr32, Cint, Cint, Ptr{Cdouble}, Ptr{KluCommon}),
+        symbolic, numeric, ldim, nrhs, b, common,
+    )
+end
+
+function klu_free_numeric!(
+    numeric_ref::Ref{NumericPtr32},
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_free_numeric, libklu),
+        Cint,
+        (Ptr{NumericPtr32}, Ptr{KluCommon}),
+        numeric_ref, common,
+    )
+end
+
+# Performance knobs / diagnostics — Int32 family.
+
+function klu_sort(
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_sort, libklu),
+        Cint,
+        (SymbolicPtr32, NumericPtr32, Ptr{KluCommon}),
+        symbolic, numeric, common,
+    )
+end
+
+function klu_condest(
+    ap::Ptr{Cint},
+    ax::Ptr{Cdouble},
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_condest, libklu),
+        Cint,
+        (Ptr{Cint}, Ptr{Cdouble}, SymbolicPtr32, NumericPtr32, Ptr{KluCommon}),
+        ap, ax, symbolic, numeric, common,
+    )
+end
+
+function klu_rcond(
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_rcond, libklu),
+        Cint,
+        (SymbolicPtr32, NumericPtr32, Ptr{KluCommon}),
+        symbolic, numeric, common,
+    )
+end
+
+# Complex / Int32
+function klu_z_factor(
+    ap::Ptr{Cint},
+    ai::Ptr{Cint},
+    ax::Ptr{ComplexF64},
+    symbolic::SymbolicPtr32,
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_z_factor, libklu),
+        NumericPtr32,
+        (Ptr{Cint}, Ptr{Cint}, Ptr{ComplexF64}, SymbolicPtr32, Ptr{KluCommon}),
+        ap, ai, ax, symbolic, common,
+    )
+end
+
+function klu_z_refactor(
+    ap::Ptr{Cint},
+    ai::Ptr{Cint},
+    ax::Ptr{ComplexF64},
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_z_refactor, libklu),
+        Cint,
+        (
+            Ptr{Cint},
+            Ptr{Cint},
+            Ptr{ComplexF64},
+            SymbolicPtr32,
+            NumericPtr32,
+            Ptr{KluCommon},
+        ),
+        ap, ai, ax, symbolic, numeric, common,
+    )
+end
+
+function klu_z_solve(
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    ldim::Cint,
+    nrhs::Cint,
+    b::Ptr{ComplexF64},
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_z_solve, libklu),
+        Cint,
+        (
+            SymbolicPtr32,
+            NumericPtr32,
+            Cint,
+            Cint,
+            Ptr{ComplexF64},
+            Ptr{KluCommon},
+        ),
+        symbolic, numeric, ldim, nrhs, b, common,
+    )
+end
+
+function klu_z_tsolve(
+    symbolic::SymbolicPtr32,
+    numeric::NumericPtr32,
+    ldim::Cint,
+    nrhs::Cint,
+    b::Ptr{ComplexF64},
+    conj_solve::Cint,
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_z_tsolve, libklu),
+        Cint,
+        (
+            SymbolicPtr32,
+            NumericPtr32,
+            Cint,
+            Cint,
+            Ptr{ComplexF64},
+            Cint,
+            Ptr{KluCommon},
+        ),
+        symbolic, numeric, ldim, nrhs, b, conj_solve, common,
+    )
+end
+
+function klu_z_free_numeric!(
+    numeric_ref::Ref{NumericPtr32},
+    common::Ref{KluCommon},
+)
+    return @klu_lock ccall(
+        (:klu_z_free_numeric, libklu),
+        Cint,
+        (Ptr{NumericPtr32}, Ptr{KluCommon}),
+        numeric_ref, common,
+    )
+end
+
+# ---------------------------------------------------------------------------
+# Shared error throw — dispatches on the common-struct width
+# ---------------------------------------------------------------------------
 
-function klu_throw(common::KluLCommon, op::AbstractString)
+function klu_throw(common::Union{KluLCommon, KluCommon}, op::AbstractString)
     s = common.status
     s == KLU_SINGULAR &&
         throw(LinearAlgebra.SingularException(Int(common.singular_col + 1)))
diff --git a/src/KLUWrapper/solve_dense.jl b/src/KLUWrapper/solve_dense.jl
index 7ce29b78c..85d758170 100644
--- a/src/KLUWrapper/solve_dense.jl
+++ b/src/KLUWrapper/solve_dense.jl
@@ -5,17 +5,19 @@ Solve `A · X = B` in place. `B::StridedVecOrMat{Tv}` must have first-dimension
 size equal to `cache.n` and unit stride in the first dimension. Multiple
 columns of `B` are handled in a single libklu call.
 """
-function solve!(cache::KLULinSolveCache{Tv},
-    B::StridedVecOrMat{Tv}) where {Tv <: Union{Float64, ComplexF64}}
+function solve!(
+    cache::KLULinSolveCache{Tv, Ti},
+    B::StridedVecOrMat{Tv},
+) where {Tv, Ti}
     is_factored(cache) || error("KLULinSolveCache: not factored yet.")
     n = _dim(cache)
-    size(B, 1) == n || throw(DimensionMismatch(
-        "size(B, 1) = $(size(B, 1)), cache n = $(n)",
+    size(B, 1) == Int(n) || throw(DimensionMismatch(
+        "size(B, 1) = $(size(B, 1)), cache n = $(Int(n))",
     ))
     stride(B, 1) == 1 || throw(ArgumentError(
         "B must have unit stride in the first dimension.",
     ))
-    nrhs = Int64(size(B, 2))
+    nrhs = size(B, 2)
     nrhs == 0 && return B
     # Snapshot KLU's preconditions plus identity info — gated by
     # `KLU_POOL_DEBUG`. `klu_l_solve` returns FALSE with `KLU_INVALID` when
@@ -31,7 +33,7 @@ function solve!(cache::KLULinSolveCache{Tv},
         pre_b_ptr = pointer(B)
     end
     ok = _solve_call(
-        Tv, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common,
+        Tv, Ti, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common,
     )
     if ok == 0
         @static if KLU_POOL_DEBUG
@@ -57,21 +59,23 @@ In-place solve `Aᵀ · X = B` (or `Aᴴ · X = B` when `conjugate=true` on the
 complex path). Same shape requirements as `solve!`. The `conjugate` keyword
 is ignored on the real path.
 """
-function tsolve!(cache::KLULinSolveCache{Tv},
-    B::StridedVecOrMat{Tv}; conjugate::Bool = false,
-) where {Tv <: Union{Float64, ComplexF64}}
+function tsolve!(
+    cache::KLULinSolveCache{Tv, Ti},
+    B::StridedVecOrMat{Tv};
+    conjugate::Bool = false,
+) where {Tv, Ti}
     is_factored(cache) || error("KLULinSolveCache: not factored yet.")
     n = _dim(cache)
-    size(B, 1) == n || throw(DimensionMismatch(
-        "size(B, 1) = $(size(B, 1)), cache n = $(n)",
+    size(B, 1) == Int(n) || throw(DimensionMismatch(
+        "size(B, 1) = $(size(B, 1)), cache n = $(Int(n))",
     ))
     stride(B, 1) == 1 || throw(ArgumentError(
         "B must have unit stride in the first dimension.",
     ))
-    nrhs = Int64(size(B, 2))
+    nrhs = size(B, 2)
     nrhs == 0 && return B
     ok = _tsolve_call(
-        Tv, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common;
+        Tv, Ti, cache.symbolic, cache.numeric, n, nrhs, pointer(B), cache.common;
         conjugate = conjugate,
     )
     ok == 0 && klu_throw(cache.common[], "klu_tsolve")
@@ -83,7 +87,9 @@ end
 
 Allocating solve, mirroring `LinearAlgebra.Factorization`'s API.
 """
-function Base.:\(cache::KLULinSolveCache{Tv},
-    B::StridedVecOrMat{Tv}) where {Tv <: Union{Float64, ComplexF64}}
+function Base.:\(
+    cache::KLULinSolveCache{Tv, Ti},
+    B::StridedVecOrMat{Tv},
+) where {Tv, Ti}
     return solve!(cache, copy(B))
 end
diff --git a/src/KLUWrapper/solve_sparse_rhs.jl b/src/KLUWrapper/solve_sparse_rhs.jl
index a20679f24..089fa06a0 100644
--- a/src/KLUWrapper/solve_sparse_rhs.jl
+++ b/src/KLUWrapper/solve_sparse_rhs.jl
@@ -20,20 +20,23 @@ Not thread-safe (mutates per-cache scratch). Callers should serialize
 access through a per-cache lock if invoked from multiple threads.
 """
 function solve_sparse!(
-    cache::KLULinSolveCache{Tv},
-    B::SparseMatrixCSC{Tb, Int},
+    cache::KLULinSolveCache{Tv, Ti},
+    B::SparseMatrixCSC{Tb, <:Integer},
     out::AbstractMatrix{Tv};
     block::Int = SPARSE_RHS_DEFAULT_BLOCK,
-) where {Tv <: Union{Float64, ComplexF64}, Tb <: Number}
+) where {Tv, Ti, Tb <: Number}
     is_factored(cache) || error("KLULinSolveCache: not factored yet.")
     block >= 1 || throw(ArgumentError("block must be >= 1; got $(block)"))
     n = _dim(cache)
-    size(B, 1) == n || throw(DimensionMismatch(
-        "size(B, 1) = $(size(B, 1)), cache n = $(n)",
-    ))
-    size(out, 1) == n && size(out, 2) == size(B, 2) || throw(DimensionMismatch(
-        "out has size $(size(out)); expected $((n, size(B, 2))).",
+    size(B, 1) == Int(n) || throw(DimensionMismatch(
+        "size(B, 1) = $(size(B, 1)), cache n = $(Int(n))",
     ))
+    size(out, 1) == Int(n) && size(out, 2) == size(B, 2) ||
+        throw(
+            DimensionMismatch(
+                "out has size $(size(out)); expected $((Int(n), size(B, 2))).",
+            ),
+        )
 
     nb = size(B, 2)
     nb == 0 && return out
@@ -71,7 +74,7 @@ function solve_sparse!(
             # helper) to keep the inner-loop closure from capturing the per-chunk
             # `npack`, which would force Julia to box it.
             ok = _solve_call(
-                Tv, cache.symbolic, cache.numeric, n, Int64(npack),
+                Tv, Ti, cache.symbolic, cache.numeric, n, npack,
                 pointer(scratch), cache.common,
             )
             if ok == 0 && cache.common[].status == KLU_INVALID
@@ -79,7 +82,7 @@ function solve_sparse!(
                     objectid(cache)
                 _recover_factorization!(cache)
                 ok = _solve_call(
-                    Tv, cache.symbolic, cache.numeric, n, Int64(npack),
+                    Tv, Ti, cache.symbolic, cache.numeric, n, npack,
                     pointer(scratch), cache.common,
                 )
             end
@@ -96,12 +99,13 @@ function solve_sparse!(
 end
 
 """Allocating wrapper around `solve_sparse!`."""
-function solve_sparse(cache::KLULinSolveCache{Tv},
-    B::SparseMatrixCSC{<:Number, Int};
+function solve_sparse(
+    cache::KLULinSolveCache{Tv, Ti},
+    B::SparseMatrixCSC{<:Number, <:Integer};
     block::Int = SPARSE_RHS_DEFAULT_BLOCK,
-) where {Tv <: Union{Float64, ComplexF64}}
+) where {Tv, Ti}
     return solve_sparse!(
-        cache, B, Matrix{Tv}(undef, _dim(cache), size(B, 2));
+        cache, B, Matrix{Tv}(undef, Int(_dim(cache)), size(B, 2));
         block = block,
     )
 end
diff --git a/src/PowerNetworkMatrices.jl b/src/PowerNetworkMatrices.jl
index 5239f5859..23ce6e4c5 100644
--- a/src/PowerNetworkMatrices.jl
+++ b/src/PowerNetworkMatrices.jl
@@ -89,8 +89,12 @@ import .KLUWrapper:
     n_valid,
     is_factored
 
+include("AccelerateWrapper/AccelerateWrapper.jl")
+import .AccelerateWrapper: AAFactorCache, aa_factorize, aa_spmm!, aa_spmv!
+
 include("linalg_settings.jl")
 include("solver_dispatch.jl")
+include("iterative_refinement.jl")
 
 function __init__()
     something(get_linalg_backend_check(), false) && check_linalg_backend()
@@ -137,14 +141,12 @@ include("virtual_modf_calculations.jl")
 include("system_utils.jl")
 include("serialization.jl")
 
-# Declare functions that will be defined by extensions
-# These need to be declared so extensions can extend them
+# Forward declarations for symbols still defined inside package extensions.
+# AppleAccelerate-related functions live in `src/` now (no extension needed)
+# and are not redeclared here.
 function _calculate_PTDF_matrix_MKLPardiso end
-function _calculate_PTDF_matrix_AppleAccelerate end
 function _calculate_LODF_matrix_MKLPardiso end
-function _calculate_LODF_matrix_AppleAccelerate end
 function _pardiso_sequential_LODF! end
 function _pardiso_single_LODF! end
-function _create_apple_accelerate_factorization end
 
 end
diff --git a/src/PowerflowMatrixTypes.jl b/src/PowerflowMatrixTypes.jl
index 98d103289..cd1aa2ab9 100644
--- a/src/PowerflowMatrixTypes.jl
+++ b/src/PowerflowMatrixTypes.jl
@@ -1,7 +1,7 @@
 const DC_ABA_Matrix_Factorized = ABA_Matrix{
     Tuple{Vector{Int64}, Vector{Int64}},
     Tuple{Dict{Int64, Int64}, Dict{Int64, Int64}},
-    KLULinSolveCache{Float64},
+    KLULinSolveCache{Float64, Int64},
 }
 const DC_ABA_Matrix_Unfactorized = ABA_Matrix{
     Tuple{Vector{Int64}, Vector{Int64}},
diff --git a/src/common.jl b/src/common.jl
index 70a1b50ff..c506170ad 100644
--- a/src/common.jl
+++ b/src/common.jl
@@ -6,6 +6,23 @@ function _add_to_collection!(
     return
 end
 
+"""
+    _build_bus_to_valid_idx(n_buses, valid_ix) -> Vector{Int}
+
+Build the inverse of `valid_ix`: a length-`n_buses` vector where entry `b`
+is the position of bus `b` inside `valid_ix`, or 0 if `b` is a reference
+bus. Used by the Virtual\\* row-computation hot path so it can iterate the
+nonzeros of a sparse `BA` column directly (O(nnz_col)) instead of scanning
+the full bus axis (O(n_buses)) and bisecting the CSC for each entry.
+"""
+function _build_bus_to_valid_idx(n_buses::Int, valid_ix::Vector{Int})
+    bus_to_valid_idx = zeros(Int, n_buses)
+    @inbounds for (i, b) in enumerate(valid_ix)
+        bus_to_valid_idx[b] = i
+    end
+    return bus_to_valid_idx
+end
+
 function _add_to_collection!(
     collection_tr3w::Vector{PSY.ThreeWindingTransformer},
     transformer_tr3w::PSY.ThreeWindingTransformer,
diff --git a/src/iterative_refinement.jl b/src/iterative_refinement.jl
new file mode 100644
index 000000000..50e01f86e
--- /dev/null
+++ b/src/iterative_refinement.jl
@@ -0,0 +1,153 @@
+import LinearAlgebra: norm, mul!
+
+"""
+    DEFAULT_REFINEMENT_MAX_ITER :: Int
+
+Default iteration cap for `solve_w_refinement!`.
+"""
+const DEFAULT_REFINEMENT_MAX_ITER = 25
+
+# --- backend dispatch bridge ---------------------------------------------------
+#
+# The refinement body uses three primitives — in-place `solve!`, `is_factored`,
+# and `_dim` — each of which is defined inside `KLUWrapper` and
+# `AccelerateWrapper` as a *separate* function. The bridge below lets the
+# shared body reach the right backend by multiple dispatch instead of an `isa`
+# branch, and keeps the body itself backend-agnostic.
+
+_refine_solve!(K::KLULinSolveCache, r::StridedVecOrMat) = solve!(K, r)
+_refine_solve!(K::AAFactorCache, r::StridedVecOrMat) = AccelerateWrapper.solve!(K, r)
+
+_refine_is_factored(K::KLULinSolveCache) = is_factored(K)
+_refine_is_factored(K::AAFactorCache) = AccelerateWrapper.is_factored(K)
+
+_refine_dim(K::KLULinSolveCache) = Int(KLUWrapper._dim(K))
+_refine_dim(K::AAFactorCache) = AccelerateWrapper._dim(K)
+
+"""
+    solve_w_refinement!(cache, A, X, B; tol=…, max_iters=…) -> X
+
+Solve `A · X = B` in place using `cache` and apply iterative refinement
+until `norm(B − A·X, 1) < norm(B, 1) * tol`, or until refinement stops
+improving. `X` must be pre-allocated by the caller with the same shape as
+`B`; it is overwritten with the refined solution.
+
+This is the non-allocating variant. For a one-shot allocating variant that
+returns a fresh `X`, see `solve_w_refinement`.
+
+Supports `cache::KLULinSolveCache` (KLU backend, any `{Tv, Ti}`) and
+`cache::AAFactorCache` (Apple Accelerate backend, `Cdouble` only). The
+cache must already be factored (`is_factored(cache) == true`). The function
+does not mutate `cache`'s factor; it only triggers in-place `solve!` calls
+on a residual buffer.
+
+`max_iters` caps the refinement loop (default
+`DEFAULT_REFINEMENT_MAX_ITER`). The default `tol` is `sqrt(eps(real(Tv)))`,
+which is conservative for power-flow Newton-Raphson Jacobians.
+
+Useful when the cached factor is of an ill-conditioned matrix — e.g.,
+a Newton-Raphson Jacobian near a saddle point or a network reduction
+interface — where a single back-solve carries enough error to slow NR
+convergence. Cost per refinement iteration: one sparse matrix-vector
+product plus one `solve!` against the cached factor.
+"""
+function solve_w_refinement!(
+    cache::KLULinSolveCache{Tv, Ti},
+    A::SparseArrays.SparseMatrixCSC{Tv, Ti},
+    X::StridedVecOrMat{Tv},
+    B::StridedVecOrMat{Tv};
+    tol::Real = sqrt(eps(real(Tv))),
+    max_iters::Int = DEFAULT_REFINEMENT_MAX_ITER,
+) where {Tv, Ti}
+    return _solve_w_refinement_body!(cache, A, X, B, tol, max_iters)
+end
+
+function solve_w_refinement!(
+    cache::AAFactorCache,
+    A::SparseArrays.SparseMatrixCSC{Cdouble, <:Integer},
+    X::StridedVecOrMat{Cdouble},
+    B::StridedVecOrMat{Cdouble};
+    tol::Real = sqrt(eps(Cdouble)),
+    max_iters::Int = DEFAULT_REFINEMENT_MAX_ITER,
+)
+    return _solve_w_refinement_body!(cache, A, X, B, tol, max_iters)
+end
+
+function _solve_w_refinement_body!(
+    cache,
+    A::SparseArrays.SparseMatrixCSC,
+    X::StridedVecOrMat,
+    B::StridedVecOrMat,
+    tol::Real,
+    max_iters::Int,
+)
+    _refine_is_factored(cache) || error(
+        "solve_w_refinement!: cache must be factored. " *
+        "Call `full_factor!(cache, A)` first.",
+    )
+    size(X) == size(B) ||
+        throw(DimensionMismatch("X is $(size(X)); B is $(size(B))."))
+    size(B, 1) == _refine_dim(cache) || throw(
+        DimensionMismatch(
+            "size(B, 1) = $(size(B, 1)); cache n = $(_refine_dim(cache)).",
+        ),
+    )
+
+    Tv = eltype(X)
+    fill!(X, zero(Tv))
+    # `X = 0` ⇒ initial residual `r = B - A·0 = B`. Copy once; subsequent
+    # iterations reuse `r` in place via `mul!(r, A, X); @. r = B - r`,
+    # which avoids the temporary `A * X` allocation `B - A * X` would create
+    # each refinement step.
+    r = copy(B)
+    bNorm = norm(B, 1)
+    iters = 0
+    while iters < max_iters && norm(r, 1) >= bNorm * tol
+        prev_err = norm(r, 1)
+        _refine_solve!(cache, r)
+        X .+= r
+        mul!(r, A, X)
+        @. r = B - r
+        iters += 1
+        if norm(r, 1) > prev_err
+            @debug "Iterative refinement diverging; returning best-so-far." iters
+            return X
+        end
+    end
+    @debug "Iterative refinement converged." iters
+    return X
+end
+
+"""
+    solve_w_refinement(cache, A, B; tol=…, max_iters=…) -> X
+
+Allocating wrapper around `solve_w_refinement!`. Allocates `X` matching
+`B`'s shape, then refines.
+"""
+function solve_w_refinement(
+    cache::KLULinSolveCache{Tv, Ti},
+    A::SparseArrays.SparseMatrixCSC{Tv, Ti},
+    B::StridedVecOrMat{Tv};
+    tol::Real = sqrt(eps(real(Tv))),
+    max_iters::Int = DEFAULT_REFINEMENT_MAX_ITER,
+) where {Tv, Ti}
+    X = similar(B)
+    return solve_w_refinement!(
+        cache, A, X, B;
+        tol = tol, max_iters = max_iters,
+    )
+end
+
+function solve_w_refinement(
+    cache::AAFactorCache,
+    A::SparseArrays.SparseMatrixCSC{Cdouble, <:Integer},
+    B::StridedVecOrMat{Cdouble};
+    tol::Real = sqrt(eps(Cdouble)),
+    max_iters::Int = DEFAULT_REFINEMENT_MAX_ITER,
+)
+    X = similar(B)
+    return solve_w_refinement!(
+        cache, A, X, B;
+        tol = tol, max_iters = max_iters,
+    )
+end
diff --git a/src/linalg_settings.jl b/src/linalg_settings.jl
index c79962189..58428f8a5 100644
--- a/src/linalg_settings.jl
+++ b/src/linalg_settings.jl
@@ -1,7 +1,9 @@
 
-# Extensions are loaded when trigger packages (Pardiso, AppleAccelerate) are loaded
+# The MKL/Pardiso path still uses the package-extension mechanism (Pardiso.jl
+# is the only consumer-facing way to access the MKL Pardiso solver). The
+# Apple Accelerate path no longer does — `AccelerateWrapper` is built in via
+# a `@static if Sys.isapple()` gate.
 
-# Check if MKL/Pardiso extension is available at runtime
 function _has_mkl_pardiso_ext()
     ext = Base.get_extension(@__MODULE__, :MKLPardisoExt)
     return !isnothing(ext)
@@ -12,20 +14,21 @@ _mkl_pardiso_install_error() =
     Install the Pardiso package:
     julia> using Pkg; Pkg.add(\"Pardiso\")"""
 
-# Check if AppleAccelerate extension is available at runtime
-function _has_apple_accelerate_ext()
-    ext = Base.get_extension(@__MODULE__, :AppleAccelerateExt)
-    return !isnothing(ext)
-end
+_has_apple_accelerate_backend() = Sys.isapple()
 
-_apple_accelerate_install_error() =
-    """The AppleAccelerate extension is not available.
-    This solver is only available on macOS.
-    Install AppleAccelerate:
-    julia> using Pkg; Pkg.add(\"AppleAccelerate\")"""
+_apple_accelerate_unavailable_error() =
+    """The Apple Accelerate sparse backend is macOS-only (Sys.isapple() returned false).
+    Use the KLU solver (the default) on non-Apple platforms."""
 
-# _create_apple_accelerate_factorization is defined in ext/AppleAccelerateExt.jl
-# when AppleAccelerate package is loaded
+"""
+    _default_linear_solver() -> String
+
+Default sparse linear solver name. Returns "AppleAccelerate" on macOS (Apple's
+built-in libSparse via `AccelerateWrapper`) and "KLU" elsewhere. Used as the
+default for the `linear_solver` keyword on PTDF / LODF / VirtualPTDF /
+VirtualLODF / VirtualMODF constructors.
+"""
+_default_linear_solver() = Sys.isapple() ? "AppleAccelerate" : "KLU"
 
 "Set a preference of the backend library for sparse linear algebra operations."
 function set_linalg_backend_preference(linalglib::Union{String, Nothing})
@@ -125,14 +128,12 @@ function check_linalg_backend()
             end
         end
 
-        if _has_apple_accelerate_ext()
+        if _has_apple_accelerate_backend()
             @info go_msg("AppleAccelerate")
         else
             if user_linalg_backend == "AppleAccelerate"
-                @warn yo_msg("AppleAccelerate")
+                @warn "AppleAccelerate is only available on Apple platforms."
             end
-            @info no_msg("AppleAccelerate")
-            @info "See https://github.com/JuliaLinearAlgebra/AppleAccelerate.jl"
         end
     end
 end
diff --git a/src/lodf_calculations.jl b/src/lodf_calculations.jl
index 78dae430c..95754181f 100644
--- a/src/lodf_calculations.jl
+++ b/src/lodf_calculations.jl
@@ -64,6 +64,40 @@ get_network_reduction_data(M::LODF) = M.network_reduction_data
 get_arc_lookup(M::LODF) = M.lookup[1]
 stores_transpose(::LODF) = true
 
+# --- Demand-matrix short-circuit ---------------------------------------------
+#
+# The LODF computation builds a *diagonal* "demand" matrix `D = diag(m_V)`
+# where `m_V[i] = 1 - PTDF·A[i, i]` (clamped to 1.0 at `LODF_ENTRY_TOLERANCE`
+# to avoid divide-by-zero when an outage islands the line). The original
+# code factored `D` and ran a triangular solve `D · X = ptdf_denominator`;
+# that's a `factor + back-solve` over a diagonal, which collapses to
+# element-wise row scaling. KLU's BTF short-circuits this internally so the
+# overhead was modest; AA's libSparse and LAPACK's `getrf!`/`getrs!` do
+# not, so the previous code was 3–5× slower on AA and order-of-magnitude
+# slower on DENSE than necessary. Replace both with a direct row scaling.
+
+function _build_lodf_demand(ptdf_denominator::AbstractMatrix{Float64}, linecount::Int)
+    m_V = Vector{Float64}(undef, linecount)
+    @inbounds for i in 1:linecount
+        d = 1.0 - ptdf_denominator[i, i]
+        m_V[i] = d < LODF_ENTRY_TOLERANCE ? 1.0 : d
+    end
+    return m_V
+end
+
+function _apply_lodf_demand!(M::AbstractMatrix{Float64}, m_V::Vector{Float64})
+    IS.@assert_op size(M, 1) == length(m_V)
+    IS.@assert_op size(M, 1) == size(M, 2)
+    # `inv_dem .* M` mirrors what the triangular solve did internally —
+    # one reciprocal per row, then a row-wise multiply. The broadcast
+    # `M .*= inv_dem` scales each row `i` by `inv_dem[i]` because the
+    # length-n vector broadcasts down the first dimension.
+    inv_dem = 1.0 ./ m_V
+    M .*= inv_dem
+    M[SparseArrays.diagind(M)] .= -1.0
+    return M
+end
+
 function _buildlodf(
     a::SparseArrays.SparseMatrixCSC{Int8, Int},
     ptdf::Matrix{Float64},
@@ -94,7 +128,7 @@ function _buildlodf(
     ptdf::Matrix{Float64},
     ::AppleAccelerateSolver,
 )
-    _has_apple_accelerate_ext() || error(_apple_accelerate_install_error())
+    _has_apple_accelerate_backend() || error(_apple_accelerate_unavailable_error())
     return _calculate_LODF_matrix_AppleAccelerate(a, ptdf)
 end
 
@@ -131,20 +165,8 @@ function _calculate_LODF_matrix_KLU(
     solve_sparse!(k, a_t_valid, view(first_, valid_ix, :))
     ptdf_denominator = first_' * ba
 
-    m_I = Int[]
-    m_V = Float64[]
-    for iline in 1:linecount
-        if (1.0 - ptdf_denominator[iline, iline]) < LODF_ENTRY_TOLERANCE
-            push!(m_I, iline)
-            push!(m_V, 1.0)
-        else
-            push!(m_I, iline)
-            push!(m_V, 1 - ptdf_denominator[iline, iline])
-        end
-    end
-    Dem_cache = klu_factorize(SparseArrays.sparse(m_I, m_I, m_V))
-    solve!(Dem_cache, ptdf_denominator)
-    ptdf_denominator[SparseArrays.diagind(ptdf_denominator)] .= -1.0
+    m_V = _build_lodf_demand(ptdf_denominator, linecount)
+    _apply_lodf_demand!(ptdf_denominator, m_V)
     return ptdf_denominator
 end
 
@@ -154,22 +176,9 @@ function _calculate_LODF_matrix_KLU(
 )
     linecount = size(ptdf, 2)
     ptdf_denominator_t = a * ptdf
-    m_I = Int[]
-    m_V = Float64[]
-    for iline in 1:linecount
-        if (1.0 - ptdf_denominator_t[iline, iline]) < LODF_ENTRY_TOLERANCE
-            push!(m_I, iline)
-            push!(m_V, 1.0)
-        else
-            push!(m_I, iline)
-            push!(m_V, 1 - ptdf_denominator_t[iline, iline])
-        end
-    end
-    Dem_cache = klu_factorize(SparseArrays.sparse(m_I, m_I, m_V))
+    m_V = _build_lodf_demand(ptdf_denominator_t, linecount)
     lodf_t = copy(ptdf_denominator_t)
-    solve!(Dem_cache, lodf_t)
-    lodf_t[SparseArrays.diagind(lodf_t)] .= -1.0
-
+    _apply_lodf_demand!(lodf_t, m_V)
     return lodf_t
 end
 
@@ -179,29 +188,41 @@ function _calculate_LODF_matrix_DENSE(
 )
     linecount = size(ptdf, 2)
     ptdf_denominator_t = a * ptdf
-    m_V = Float64[]
-    for iline in 1:linecount
-        if (1.0 - ptdf_denominator_t[iline, iline]) < LODF_ENTRY_TOLERANCE
-            push!(m_V, 1.0)
-        else
-            push!(m_V, 1.0 - ptdf_denominator_t[iline, iline])
-        end
-    end
-    (mV, bipiv, binfo) = getrf!(Matrix(LinearAlgebra.diagm(m_V)))
-    _binfo_check(binfo)
-    getrs!('N', mV, bipiv, ptdf_denominator_t)
-    ptdf_denominator_t[LinearAlgebra.diagind(ptdf_denominator_t)] .= -1.0
+    m_V = _build_lodf_demand(ptdf_denominator_t, linecount)
+    _apply_lodf_demand!(ptdf_denominator_t, m_V)
     return ptdf_denominator_t
 end
 
 # _pardiso_sequential_LODF!, _pardiso_single_LODF!, _calculate_LODF_matrix_MKLPardiso
 # are defined in ext/MKLPardisoExt.jl when the Pardiso package is loaded
 
-# _calculate_LODF_matrix_AppleAccelerate is defined in ext/AppleAccelerateExt.jl
-# when the AppleAccelerate package is loaded
+@static if Sys.isapple()
+    """
+    Function for internal use only.
+
+    Computes the LODF matrix using the internal Apple Accelerate backend
+    (`AccelerateWrapper`). Available only on macOS. Shape mirrors
+    `_calculate_LODF_matrix_KLU(a, ptdf)` exactly: factor the diagonal "demand"
+    matrix `diag(1 - PTDF·A)` and solve in place against `a · ptdf`.
+
+    # Arguments
+    - `a::SparseArrays.SparseMatrixCSC{Int8, Int}`: Incidence Matrix
+    - `ptdf::Matrix{Float64}`: PTDF matrix
+    """
+    function _calculate_LODF_matrix_AppleAccelerate(
+        a::SparseArrays.SparseMatrixCSC{Int8, Int},
+        ptdf::Matrix{Float64},
+    )
+        linecount = size(ptdf, 2)
+        ptdf_denominator_t = a * ptdf
+        m_V = _build_lodf_demand(ptdf_denominator_t, linecount)
+        _apply_lodf_demand!(ptdf_denominator_t, m_V)
+        return ptdf_denominator_t
+    end
+end
 
 """
-    LODF(sys::PSY.System; linear_solver::String = "KLU", tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...)
+    LODF(sys::PSY.System; linear_solver::String = _default_linear_solver(), tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...)
 
 Construct a Line Outage Distribution Factor (LODF) matrix from a PowerSystems.System by computing
 the sensitivity of line flows to single line outages. This is the primary constructor for LODF
@@ -211,7 +232,7 @@ analysis starting from system data.
 - `sys::PSY.System`: The power system from which to construct the LODF matrix
 
 # Keyword Arguments
-- `linear_solver::String = "KLU"`:
+- `linear_solver::String = _default_linear_solver()`:
         Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso"
 - `tol::Float64 = eps()`:
         Sparsification tolerance for dropping small matrix elements to reduce memory usage
@@ -258,16 +279,12 @@ where A is the incidence matrix and PTDF is the power transfer distribution fact
 """
 function LODF(
     sys::PSY.System;
-    linear_solver::String = "KLU",
+    linear_solver::String = _default_linear_solver(),
     tol::Float64 = eps(),
     network_reductions::Vector{NetworkReduction} = NetworkReduction[],
     kwargs...,
 )
-    Ymatrix = Ybus(
-        sys;
-        network_reductions = network_reductions,
-        kwargs...,
-    )
+    Ymatrix = Ybus(sys; network_reductions = network_reductions, kwargs...)
     A = IncidenceMatrix(Ymatrix)
     BA = BA_Matrix(Ymatrix)
     ptdf = PTDF(A, BA)
@@ -275,7 +292,7 @@ function LODF(
 end
 
 """
-    LODF(A::IncidenceMatrix, PTDFm::PTDF; linear_solver::String = "KLU", tol::Float64 = eps())
+    LODF(A::IncidenceMatrix, PTDFm::PTDF; linear_solver::String = _default_linear_solver(), tol::Float64 = eps())
 
 Construct a Line Outage Distribution Factor (LODF) matrix from existing incidence and PTDF matrices.
 This constructor is more efficient when the prerequisite matrices are already available.
@@ -285,7 +302,7 @@ This constructor is more efficient when the prerequisite matrices are already av
 - `PTDFm::PTDF`: The power transfer distribution factor matrix (should be non-sparsified for accuracy)
 
 # Keyword Arguments
-- `linear_solver::String = "KLU"`:
+- `linear_solver::String = _default_linear_solver()`:
         Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso"
 - `tol::Float64 = eps()`:
         Sparsification tolerance for the LODF matrix (not applied to input PTDF)
@@ -327,7 +344,7 @@ where:
 function LODF(
     A::IncidenceMatrix,
     PTDFm::PTDF;
-    linear_solver::String = "KLU",
+    linear_solver::String = _default_linear_solver(),
     tol::Float64 = eps(),
 )
     solver = resolve_linear_solver(linear_solver)
@@ -384,7 +401,9 @@ efficient when the prerequisite matrices with factorization are already availabl
 
 # Keyword Arguments
 - `linear_solver::String = "KLU"`:
-        Linear solver algorithm for matrix computations. Currently only "KLU" is supported
+        This constructor is intentionally KLU-only because `ABA.K` is always a
+        KLU factorization. The keyword is kept for API consistency; passing any
+        other value will error.
 - `tol::Float64 = eps()`:
         Sparsification tolerance for dropping small matrix elements
 
@@ -431,6 +450,10 @@ function LODF(
     linear_solver::String = "KLU",
     tol::Float64 = eps(),
 )
+    # NOTE: ABA.K is always a KLU factorization, so this constructor is
+    # KLU-only regardless of the `linear_solver` argument. The kwarg is kept
+    # for API consistency; passing anything other than "KLU" will error in
+    # `_buildlodf`.
     if !(
         isequal(A.network_reduction_data, BA.network_reduction_data) &&
         isequal(BA.network_reduction_data, ABA.network_reduction_data)
@@ -443,8 +466,7 @@ function LODF(
     subnetwork_axes = make_arc_arc_subnetwork_axes(A)
     ax_ref = make_ax_ref(get_arc_axis(A))
     if tol > eps()
-        lodf_t =
-            _buildlodf(A.data, ABA.K, BA.data, Set(get_ref_bus_position(A)), solver)
+        lodf_t = _buildlodf(A.data, ABA.K, BA.data, Set(get_ref_bus_position(A)), solver)
         return LODF(
             sparsify(lodf_t, tol),
             (get_arc_axis(A), get_arc_axis(A)),
diff --git a/src/ptdf_calculations.jl b/src/ptdf_calculations.jl
index 132fe16b4..d6ccec700 100644
--- a/src/ptdf_calculations.jl
+++ b/src/ptdf_calculations.jl
@@ -111,7 +111,7 @@ function _buildptdf_from_matrices(
     ref_bus_positions::Set{Int},
     dist_slack::Vector{Float64},
     ::AppleAccelerateSolver)
-    _has_apple_accelerate_ext() || error(_apple_accelerate_install_error())
+    _has_apple_accelerate_backend() || error(_apple_accelerate_unavailable_error())
     return _calculate_PTDF_matrix_AppleAccelerate(A, BA, ref_bus_positions, dist_slack)
 end
 
@@ -230,11 +230,63 @@ end
 # _calculate_PTDF_matrix_MKLPardiso is defined in ext/MKLPardisoExt.jl
 # when Pardiso package is loaded
 
-# _calculate_PTDF_matrix_AppleAccelerate is defined in ext/AppleAccelerateExt.jl
-# when AppleAccelerate package is loaded
+@static if Sys.isapple()
+    """
+    Function for internal use only.
+
+    Computes the PTDF matrix using the internal Apple Accelerate backend
+    (`AccelerateWrapper`). Available only on macOS — non-Apple callers are
+    rejected by `_create_factorization` before reaching this entry. Shape
+    mirrors `_calculate_PTDF_matrix_KLU`: factor ABA via LDLT, then solve
+    `ABA · X = BA[valid_ix, :]` via the block-packed `solve_sparse!`.
+
+    # Arguments
+    - `A::SparseArrays.SparseMatrixCSC{Int8, Int}`: Incidence Matrix
+    - `BA::SparseArrays.SparseMatrixCSC{Float64, Int}`: BA matrix
+    - `ref_bus_positions::Set{Int}`: indexes of reference slack buses
+    - `dist_slack::Vector{Float64}`: distributed-slack weights
+    """
+    function _calculate_PTDF_matrix_AppleAccelerate(
+        A::SparseArrays.SparseMatrixCSC{Int8, Int},
+        BA::SparseArrays.SparseMatrixCSC{Float64, Int},
+        ref_bus_positions::Set{Int},
+        dist_slack::Vector{Float64},
+    )
+        linecount = size(BA, 2)
+        buscount = size(BA, 1)
+        if !isempty(dist_slack) && length(ref_bus_positions) != 1
+            error(
+                "Distributed slack is not supported for systems with multiple reference buses.",
+            )
+        end
+        if !isempty(dist_slack) && length(dist_slack) != buscount
+            error("Distributed bus specification doesn't match the number of buses.")
+        end
+        length(ref_bus_positions) < buscount || error(
+            "All buses are reference buses; PTDF is not defined.",
+        )
+
+        ABA = calculate_ABA_matrix(A, BA, ref_bus_positions)
+        # ABA = Aᵀ B A is symmetric by construction; skip the structural check.
+        cache = AccelerateWrapper.aa_factorize(ABA; check_symmetry = false)
+        valid_ix = setdiff(1:buscount, ref_bus_positions)
+        PTDFm_t = zeros(buscount, linecount)
+        AccelerateWrapper.solve_sparse!(
+            cache,
+            BA[valid_ix, :],
+            view(PTDFm_t, valid_ix, :),
+        )
+
+        isempty(dist_slack) && return PTDFm_t
+
+        @info "Distributed bus"
+        slack_array = reshape(dist_slack ./ sum(dist_slack), 1, buscount)
+        return PTDFm_t .- (slack_array * PTDFm_t)
+    end
+end
 
 """
-    PTDF(sys::PSY.System; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = "KLU", tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...)
+    PTDF(sys::PSY.System; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = _default_linear_solver(), tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...)
 
 Construct a Power Transfer Distribution Factor (PTDF) matrix from a PowerSystems.System by computing
 the sensitivity of transmission line flows to bus power injections. This is the primary constructor
@@ -247,7 +299,7 @@ for PTDF analysis starting from system data.
 - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`:
         Dictionary mapping bus numbers to distributed slack weights for realistic slack modeling.
         Empty dictionary uses single slack bus (default behavior)
-- `linear_solver::String = "KLU"`:
+- `linear_solver::String = _default_linear_solver()`:
         Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso"
 - `tol::Float64 = eps()`:
         Sparsification tolerance for dropping small matrix elements to reduce memory usage
@@ -300,7 +352,7 @@ where A is the incidence matrix and B is the susceptance matrix.
 """
 function PTDF(sys::PSY.System;
     dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(),
-    linear_solver = "KLU",
+    linear_solver = _default_linear_solver(),
     tol::Float64 = eps(),
     kwargs...,
 )
@@ -312,7 +364,7 @@ function PTDF(sys::PSY.System;
 end
 
 """
-    PTDF(ybus::Ybus; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = "KLU", tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...)
+    PTDF(ybus::Ybus; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = _default_linear_solver(), tol::Float64 = eps(), network_reductions::Vector{NetworkReduction} = NetworkReduction[], kwargs...)
 
 Construct a Power Transfer Distribution Factor (PTDF) matrix from existing Ybus matrix.
 This constructor is more efficient when the prerequisite matrices are already available and provides
@@ -325,7 +377,7 @@ direct control over the underlying matrix computations.
 - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`:
         Dictionary mapping bus numbers to distributed slack weights for realistic slack modeling.
         Empty dictionary uses single slack bus (default behavior)
-- `linear_solver::String = "KLU"`:
+- `linear_solver::String = _default_linear_solver()`:
         Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso"
 - `tol::Float64 = eps()`:
         Sparsification tolerance for dropping small matrix elements to reduce memory usage
@@ -370,7 +422,7 @@ where A is the incidence matrix and B is the susceptance matrix.
 """
 function PTDF(ybus::Ybus;
     dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(),
-    linear_solver = "KLU",
+    linear_solver = _default_linear_solver(),
     tol::Float64 = eps(),
 )
     A = IncidenceMatrix(ybus)
@@ -385,7 +437,7 @@ function PTDF(ybus::Ybus;
 end
 
 """
-    PTDF(A::IncidenceMatrix, BA::BA_Matrix; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = "KLU", tol::Float64 = eps())
+    PTDF(A::IncidenceMatrix, BA::BA_Matrix; dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(), linear_solver = _default_linear_solver(), tol::Float64 = eps())
 
 Construct a Power Transfer Distribution Factor (PTDF) matrix from existing incidence and BA matrices.
 This constructor is more efficient when the prerequisite matrices are already available and provides
@@ -399,7 +451,7 @@ direct control over the underlying matrix computations.
 - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`:
         Dictionary mapping bus numbers to distributed slack participation factors.
         Empty dictionary uses single slack bus (reference bus from matrices)
-- `linear_solver::String = "KLU"`:
+- `linear_solver::String = _default_linear_solver()`:
         Linear solver algorithm for matrix computations. Options: "KLU", "Dense", "MKLPardiso"
 - `tol::Float64 = eps()`:
         Sparsification tolerance for dropping small matrix elements to reduce memory usage
@@ -451,7 +503,7 @@ function PTDF(
     A::IncidenceMatrix,
     BA::BA_Matrix;
     dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(),
-    linear_solver = "KLU",
+    linear_solver = _default_linear_solver(),
     tol::Float64 = eps(),
 )
     dist_slack_vector = if !(isempty(dist_slack))
diff --git a/src/solver_dispatch.jl b/src/solver_dispatch.jl
index 7a31ba86c..4998d6c97 100644
--- a/src/solver_dispatch.jl
+++ b/src/solver_dispatch.jl
@@ -10,17 +10,20 @@
 # the callback. One factor + one cache per Virtual matrix is what
 # remains — same throughput as the pool variant once the global lock
 # is in place.
+#
+# Apple's libSparse has no documented cross-handle corruption issue
+# analogous to `_LIBKLU_LOCK`; the per-cache `solver_lock` we acquire
+# here is sufficient for the `AAFactorCache` backend.
 
 """
     with_solver(f, K, work_ba_col, temp_data, solver_lock) -> result
 
 Acquire `solver_lock`, then invoke `f(K, work_ba_col[1], temp_data[1])`.
-Two overloads: one specialized on `KLULinSolveCache{Float64}` (the KLU
-backend), one generic for any other factorization (`AppleAccelerate.AAFactorization`,
-etc.). Both serialize through `solver_lock`; the per-cache scratch
-slot at index 1 is the only slot — `work_ba_col` and `temp_data` are
-single-element vectors, kept as `Vector{Vector{Float64}}` because
-`_solve_factorization` and the AppleAccelerate extension are typed on
+Three overloads: `KLULinSolveCache{Float64}` (KLU backend), `AAFactorCache`
+(Apple Accelerate backend), and a generic fallback. All serialize through
+`solver_lock`; the per-cache scratch slot at index 1 is the only slot —
+`work_ba_col` and `temp_data` are single-element vectors, kept as
+`Vector{Vector{Float64}}` because `_solve_factorization` is typed on
 `Vector{Float64}` and the two buffers have different lengths
 (`n_buses` vs. `n_buses - n_ref_buses`).
 """
@@ -34,6 +37,16 @@ function with_solver(
     return @lock solver_lock f(K, work_ba_col[1], temp_data[1])
 end
 
+function with_solver(
+    f::F,
+    K::AAFactorCache,
+    work_ba_col::Vector{Vector{Float64}},
+    temp_data::Vector{Vector{Float64}},
+    solver_lock::ReentrantLock,
+) where {F}
+    return @lock solver_lock f(K, work_ba_col[1], temp_data[1])
+end
+
 function with_solver(
     f::F,
     K::KT,
diff --git a/src/virtual_lodf_calculations.jl b/src/virtual_lodf_calculations.jl
index d6f4be258..922272ed4 100644
--- a/src/virtual_lodf_calculations.jl
+++ b/src/virtual_lodf_calculations.jl
@@ -52,6 +52,10 @@ callers can issue requests concurrently; the libklu work runs one at a time.
 - `valid_ix::Vector{Int}`:
         Vector containing the row/columns indices of matrices related the buses
         which are not slack ones.
+- `bus_to_valid_idx::Vector{Int}`:
+        Inverse of `valid_ix`: `bus_to_valid_idx[b]` is the position of bus
+        `b` inside `valid_ix`, or 0 if `b` is a reference bus. Lets the hot
+        path iterate the nonzeros of a `BA` column directly.
 - `temp_data::Vector{Vector{Float64}}`:
         Single-element scratch vector kept as a `Vector{Vector{Float64}}` for
         uniform `with_solver` callback signatures.
@@ -81,6 +85,7 @@ struct VirtualLODF{Ax <: NTuple{2, Vector}, L <: NTuple{2, Dict}, K} <:
     axes::Ax
     lookup::L
     valid_ix::Vector{Int}
+    bus_to_valid_idx::Vector{Int}
     temp_data::Vector{Vector{Float64}}
     cache::RowCache
     cache_lock::ReentrantLock
@@ -110,7 +115,7 @@ function Base.show(io::IO, ::MIME{Symbol("text/plain")}, array::VirtualLODF)
 end
 
 function _get_PTDF_A_diag(
-    K::KLULinSolveCache{Float64},
+    K,
     BA::SparseArrays.SparseMatrixCSC{Float64, Int},
     A::SparseArrays.SparseMatrixCSC{Int8, Int},
     ref_bus_positions::Set{Int},
@@ -119,33 +124,39 @@ function _get_PTDF_A_diag(
     n_buses = size(BA, 1)
     diag_ = zeros(n_branches)
 
-    # Pre-compute valid indices (non-reference buses)
+    # Pre-compute valid indices (non-reference buses) and their inverse map.
     valid_ix = setdiff(1:n_buses, ref_bus_positions)
     n_valid = length(valid_ix)
+    bus_to_valid_idx = _build_bus_to_valid_idx(n_buses, valid_ix)
 
     # Pre-allocate work arrays for efficiency
     ba_col = zeros(n_valid)
     ptdf_row = zeros(n_buses)
+    ba_rv = SparseArrays.rowvals(BA)
+    ba_nz = SparseArrays.nonzeros(BA)
 
     # For each branch, compute PTDF row and dot with incidence column
     for i in 1:n_branches
-        # Extract BA column for valid indices
+        # Extract BA[:, i] non-zeros into ba_col at non-ref positions.
+        # Sparse-only iteration — typically 2 nonzeros per arc — avoids
+        # the O(n_valid) scan of the full bus axis.
         fill!(ba_col, 0.0)
-        for idx in 1:n_valid
-            bus_idx = valid_ix[idx]
-            ba_col[idx] = BA[bus_idx, i]
+        @inbounds for k in SparseArrays.nzrange(BA, i)
+            valid_i = bus_to_valid_idx[ba_rv[k]]
+            valid_i > 0 || continue
+            ba_col[valid_i] = ba_nz[k]
         end
 
-        solve!(K, ba_col)
+        _solve_factorization(K, ba_col)
 
         # Map back to full bus indices
         fill!(ptdf_row, 0.0)
-        for idx in 1:n_valid
+        @inbounds for idx in 1:n_valid
             ptdf_row[valid_ix[idx]] = ba_col[idx]
         end
 
         # Compute diagonal element: sum of PTDF[i,j] * A[i,j] for all buses j
-        for j in 1:n_buses
+        @inbounds for j in 1:n_buses
             diag_[i] += ptdf_row[j] * A[i, j]
         end
     end
@@ -222,6 +233,9 @@ struct with an empty cache.
         PSY system for which the matrix is constructed
 
 # Keyword Arguments
+- `linear_solver::String = _default_linear_solver()`: Linear solver for the
+        ABA factorization. Options: "KLU", "AppleAccelerate". Defaults to
+        "AppleAccelerate" on macOS and "KLU" elsewhere.
 - `network_reduction::NetworkReduction`:
         Structure containing the details of the network reduction applied when computing the matrix
 - `kwargs...`:
@@ -230,6 +244,7 @@ struct with an empty cache.
 function VirtualLODF(
     sys::PSY.System;
     dist_slack::Vector{Float64} = Float64[],
+    linear_solver::String = _default_linear_solver(),
     tol::Float64 = eps(),
     max_cache_size::Int = MAX_CACHE_SIZE_MiB,
     persistent_arcs::Vector{Tuple{Int, Int}} = Vector{Tuple{Int, Int}}(),
@@ -239,6 +254,7 @@ function VirtualLODF(
     if length(dist_slack) != 0
         @info "Distributed bus"
     end
+    solver = resolve_linear_solver(linear_solver)
     Ymatrix = Ybus(
         sys;
         network_reductions = network_reductions,
@@ -253,7 +269,7 @@ function VirtualLODF(
     subnetwork_axes = make_arc_arc_subnetwork_axes(A)
     BA = BA_Matrix(Ymatrix)
     ABA = calculate_ABA_matrix(A.data, BA.data, Set(ref_bus_positions))
-    K = klu_factorize(ABA)
+    K = _create_factorization(solver, ABA)
     bus_ax = get_bus_axis(A)
 
     valid_ix = setdiff(1:length(bus_ax), ref_bus_positions)
@@ -281,6 +297,7 @@ function VirtualLODF(
     # Single scratch slot — solves serialize via `solver_lock` + `_LIBKLU_LOCK`.
     temp_data = [zeros(length(bus_ax))]
     work_ba_col = [zeros(length(valid_ix))]
+    bus_to_valid_idx = _build_bus_to_valid_idx(length(bus_ax), valid_ix)
 
     return VirtualLODF(
         K,
@@ -294,6 +311,7 @@ function VirtualLODF(
         axes,
         look_up,
         valid_ix,
+        bus_to_valid_idx,
         temp_data,
         empty_cache,
         ReentrantLock(),
@@ -341,8 +359,17 @@ function _compute_lodf_row(vlodf::VirtualLODF, row::Int)::Vector{Float64}
     return with_solver(
         vlodf.K, vlodf.work_ba_col, vlodf.temp_data, vlodf.solver_lock,
     ) do K_solver, work_ba_col, temp_data
-        @inbounds for i in eachindex(vlodf.valid_ix)
-            work_ba_col[i] = vlodf.BA[vlodf.valid_ix[i], row]
+        # Sparse-only extraction: iterate BA[:, row] non-zeros (typically
+        # 2 per arc) instead of scanning the full bus axis.
+        fill!(work_ba_col, 0.0)
+        BA = vlodf.BA
+        bus_to_valid_idx = vlodf.bus_to_valid_idx
+        ba_rv = SparseArrays.rowvals(BA)
+        ba_nz = SparseArrays.nonzeros(BA)
+        @inbounds for k in SparseArrays.nzrange(BA, row)
+            valid_i = bus_to_valid_idx[ba_rv[k]]
+            valid_i > 0 || continue
+            work_ba_col[valid_i] = ba_nz[k]
         end
         lin_solve = _solve_factorization(K_solver, work_ba_col)
 
@@ -472,9 +499,17 @@ function _getindex_partial(
     return with_solver(
         vlodf.K, vlodf.work_ba_col, vlodf.temp_data, vlodf.solver_lock,
     ) do K_solver, work_ba_col, temp_data
-        # Steps 1-2: Compute B⁻¹(b_e · ν_e) via KLU solve.
-        @inbounds for i in eachindex(vlodf.valid_ix)
-            work_ba_col[i] = vlodf.BA[vlodf.valid_ix[i], arc_idx]
+        # Steps 1-2: Compute B⁻¹(b_e · ν_e) via sparse-only BA-column
+        # extraction + solve.
+        fill!(work_ba_col, 0.0)
+        BA = vlodf.BA
+        bus_to_valid_idx = vlodf.bus_to_valid_idx
+        ba_rv = SparseArrays.rowvals(BA)
+        ba_nz = SparseArrays.nonzeros(BA)
+        @inbounds for k in SparseArrays.nzrange(BA, arc_idx)
+            valid_i = bus_to_valid_idx[ba_rv[k]]
+            valid_i > 0 || continue
+            work_ba_col[valid_i] = ba_nz[k]
         end
         lin_solve = _solve_factorization(K_solver, work_ba_col)
 
diff --git a/src/virtual_modf_calculations.jl b/src/virtual_modf_calculations.jl
index cbe596087..13c18cd9e 100644
--- a/src/virtual_modf_calculations.jl
+++ b/src/virtual_modf_calculations.jl
@@ -45,6 +45,10 @@ cache and skips the recomputation.
         Tuple of lookup dictionaries for indexing.
 - `valid_ix::Vector{Int}`:
         Indices of non-reference buses.
+- `bus_to_valid_idx::Vector{Int}`:
+        Inverse of `valid_ix`: `bus_to_valid_idx[b]` is the position of bus
+        `b` inside `valid_ix`, or 0 if `b` is a reference bus. Lets the
+        Woodbury kernel iterate the nonzeros of a `BA` column directly.
 - `contingency_cache::Dict{Base.UUID, ContingencySpec}`:
         Resolved contingencies keyed by outage UUID.
 - `woodbury_cache::Dict{NetworkModification, WoodburyFactors}`:
@@ -85,6 +89,7 @@ struct VirtualMODF{Ax <: NTuple{2, Vector}, L <: NTuple{2, Dict}, K} <:
     axes::Ax
     lookup::L
     valid_ix::Vector{Int}
+    bus_to_valid_idx::Vector{Int}
     contingency_cache::Dict{Base.UUID, ContingencySpec}
     woodbury_cache::Dict{NetworkModification, WoodburyFactors}
     row_caches::Dict{NetworkModification, RowCache}
@@ -134,6 +139,7 @@ function _compute_woodbury_factors(
             mat.BA,
             mat.arc_susceptances,
             mat.valid_ix,
+            mat.bus_to_valid_idx,
             modifications,
         )
     end
@@ -157,6 +163,7 @@ function _apply_woodbury_correction(
             mat.BA,
             mat.arc_susceptances,
             mat.valid_ix,
+            mat.bus_to_valid_idx,
             monitored_idx,
             wf,
         )
@@ -206,6 +213,9 @@ Outage supplemental attributes found in the system.
 
 # Keyword Arguments
 - `dist_slack::Vector{Float64}`: Distributed slack weights (default: empty)
+- `linear_solver::String = _default_linear_solver()`: Linear solver for the
+        ABA factorization. Options: "KLU", "AppleAccelerate". Defaults to
+        "AppleAccelerate" on macOS and "KLU" elsewhere.
 - `tol::Float64`: Tolerance for row sparsification (default: eps())
 - `max_cache_size::Int`: Max cache size in MiB per contingency (default: MAX_CACHE_SIZE_MiB)
 - `network_reductions::Vector{NetworkReduction}`: Network reductions to apply
@@ -213,6 +223,7 @@ Outage supplemental attributes found in the system.
 function VirtualMODF(
     sys::PSY.System;
     dist_slack::Vector{Float64} = Float64[],
+    linear_solver::String = _default_linear_solver(),
     tol::Float64 = eps(),
     max_cache_size::Int = MAX_CACHE_SIZE_MiB,
     network_reductions::Vector{NetworkReduction} = NetworkReduction[],
@@ -222,6 +233,7 @@ function VirtualMODF(
     if length(dist_slack) != 0
         @info "Distributed bus"
     end
+    solver = resolve_linear_solver(linear_solver)
 
     # Build network matrices (same path as VirtualLODF)
     Ymatrix = Ybus(sys; network_reductions = network_reductions, kwargs...)
@@ -238,9 +250,10 @@ function VirtualMODF(
 
     BA = BA_Matrix(Ymatrix)
     ABA = calculate_ABA_matrix(A.data, BA.data, Set(ref_bus_positions))
-    K = klu_factorize(ABA)
+    K = _create_factorization(solver, ABA)
 
     valid_ix = setdiff(1:length(bus_ax), ref_bus_positions)
+    bus_to_valid_idx = _build_bus_to_valid_idx(length(bus_ax), valid_ix)
 
     # PTDF diagonal precompute runs serially on the dispatcher thread.
     PTDF_A_diag = _get_PTDF_A_diag(K, BA.data, A.data, Set(ref_bus_positions))
@@ -264,6 +277,7 @@ function VirtualMODF(
         axes,
         look_up,
         valid_ix,
+        bus_to_valid_idx,
         Dict{Base.UUID, ContingencySpec}(),
         Dict{NetworkModification, WoodburyFactors}(),
         Dict{NetworkModification, RowCache}(),
diff --git a/src/virtual_ptdf_calculations.jl b/src/virtual_ptdf_calculations.jl
index 89b15df62..0396009d9 100644
--- a/src/virtual_ptdf_calculations.jl
+++ b/src/virtual_ptdf_calculations.jl
@@ -21,8 +21,8 @@ JuMP-side work (in callers) parallelizes freely.
 # Arguments
 - `K`:
         LU factorization of the ABA matrix. A `KLULinSolveCache{Float64}` for
-        the default KLU solver, or an `AppleAccelerate.AAFactorization{Float64}`
-        when the AppleAccelerate extension is loaded.
+        the default KLU solver, or an `AccelerateWrapper.AAFactorCache` when the
+        AppleAccelerate backend is selected on macOS.
 - `BA::SparseArrays.SparseMatrixCSC{Float64, Int}`:
         BA matrix
 - `ref_bus_positions::Set{Int}`:
@@ -48,6 +48,11 @@ JuMP-side work (in callers) parallelizes freely.
 - `valid_ix::Vector{Int}`:
         Vector containing the row/columns indices of matrices related the buses
         which are not slack ones.
+- `bus_to_valid_idx::Vector{Int}`:
+        Inverse of `valid_ix`: `bus_to_valid_idx[b]` is the position of bus
+        `b` inside `valid_ix`, or 0 if `b` is a reference bus. Lets the hot
+        path iterate the nonzeros of a `BA` column instead of scanning the
+        full bus axis.
 - `cache::RowCache`:
         Cache where PTDF rows are stored.
 - `cache_lock::ReentrantLock`:
@@ -80,6 +85,7 @@ struct VirtualPTDF{Ax, L <: NTuple{2, Dict}, K} <:
     lookup::L
     temp_data::Vector{Vector{Float64}}
     valid_ix::Vector{Int}
+    bus_to_valid_idx::Vector{Int}
     cache::RowCache
     cache_lock::ReentrantLock
     subnetwork_axes::Dict{Int, Ax}
@@ -120,8 +126,9 @@ struct with an empty cache.
 - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`:
         Dictionary of weights to be used as distributed slack bus.
         The distributed slack dictionary must have the same number of entries as the number of buses.
-- `linear_solver::String = "KLU"`:
-        Linear solver to use for factorization. Options: "KLU", "AppleAccelerate"
+- `linear_solver::String = _default_linear_solver()`:
+        Linear solver to use for factorization. Options: "KLU", "AppleAccelerate".
+        Defaults to "AppleAccelerate" on macOS and "KLU" elsewhere.
 - `tol::Float64 = eps()`:
         Tolerance related to sparsification and values to drop.
 - `max_cache_size::Int`:
@@ -136,7 +143,7 @@ struct with an empty cache.
 function VirtualPTDF(
     sys::PSY.System;
     dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(),
-    linear_solver::String = "KLU",
+    linear_solver::String = _default_linear_solver(),
     tol::Float64 = eps(),
     max_cache_size::Int = MAX_CACHE_SIZE_MiB,
     persistent_arcs::Vector{Tuple{Int, Int}} = Vector{Tuple{Int, Int}}(),
@@ -172,8 +179,9 @@ function _create_factorization(
     ::AppleAccelerateSolver,
     ABA::SparseArrays.SparseMatrixCSC{Float64, Int},
 )
-    _has_apple_accelerate_ext() || error(_apple_accelerate_install_error())
-    return _create_apple_accelerate_factorization(ABA)
+    _has_apple_accelerate_backend() || error(_apple_accelerate_unavailable_error())
+    # ABA = Aᵀ B A is symmetric by construction; skip the structural check.
+    return AccelerateWrapper.aa_factorize(ABA; check_symmetry = false)
 end
 
 function _create_factorization(
@@ -197,8 +205,9 @@ The return is a VirtualPTDF struct with an empty cache.
 - `dist_slack::Dict{Int, Float64} = Dict{Int, Float64}()`:
         Dictionary of weights to be used as distributed slack bus.
         The distributed slack dictionary must have the same number of entries as the number of buses.
-- `linear_solver::String = "KLU"`:
-        Linear solver to use for factorization. Options: "KLU", "AppleAccelerate"
+- `linear_solver::String = _default_linear_solver()`:
+        Linear solver to use for factorization. Options: "KLU", "AppleAccelerate".
+        Defaults to "AppleAccelerate" on macOS and "KLU" elsewhere.
 - `tol::Float64 = eps()`:
         Tolerance related to sparsification and values to drop.
 - `max_cache_size::Int`:
@@ -209,7 +218,7 @@ The return is a VirtualPTDF struct with an empty cache.
 function VirtualPTDF(
     ybus::Ybus;
     dist_slack::Dict{Int, Float64} = Dict{Int, Float64}(),
-    linear_solver::String = "KLU",
+    linear_solver::String = _default_linear_solver(),
     tol::Float64 = eps(),
     max_cache_size::Int = MAX_CACHE_SIZE_MiB,
     persistent_arcs::Vector{Tuple{Int, Int}} = Vector{Tuple{Int, Int}}(),
@@ -261,6 +270,7 @@ function VirtualPTDF(
     # `Vector{Vector{Float64}}` so `with_solver`'s callback signature
     # stays uniform across solver backends.
     valid_ix = setdiff(1:length(bus_ax), ref_bus_positions)
+    bus_to_valid_idx = _build_bus_to_valid_idx(length(bus_ax), valid_ix)
     temp_data = [zeros(length(bus_ax))]
     work_ba_col = [zeros(length(valid_ix))]
 
@@ -277,6 +287,7 @@ function VirtualPTDF(
         look_up,
         temp_data,
         valid_ix,
+        bus_to_valid_idx,
         empty_cache,
         ReentrantLock(),
         subnetwork_axes,
@@ -321,17 +332,18 @@ if isdefined(Base, :print_array) # 0.7 and later
     Base.print_array(io::IO, X::VirtualPTDF) = "VirtualPTDF"
 end
 
-# Helper function to solve with different factorization types. The
-# `KLULinSolveCache` overload solves in place (zero-allocation hot path);
-# the generic fallback delegates to `\` and is extended by the
-# AppleAccelerate extension for `AAFactorization`.
+# Helper function to solve with different factorization types. Both
+# overloads solve in place (zero-allocation hot path). The KLU and Apple
+# Accelerate backends are the only solvers supported here; adding a new
+# backend requires extending this method.
 function _solve_factorization(K::KLULinSolveCache{Float64}, b::Vector{Float64})
     solve!(K, b)
     return b
 end
 
-function _solve_factorization(K, b::Vector{Float64})
-    return K \ b
+function _solve_factorization(K::AAFactorCache, b::Vector{Float64})
+    AccelerateWrapper.solve!(K, b)
+    return b
 end
 
 function _compute_ptdf_row(vptdf::VirtualPTDF, row::Int)::Vector{Float64}
@@ -350,12 +362,23 @@ function _compute_ptdf_row(vptdf::VirtualPTDF, row::Int)::Vector{Float64}
     return with_solver(
         vptdf.K, vptdf.work_ba_col, vptdf.temp_data, vptdf.solver_lock,
     ) do K_solver, work_ba_col, temp_data
-        valid_ix = vptdf.valid_ix
-        @inbounds for i in eachindex(valid_ix)
-            work_ba_col[i] = vptdf.BA[valid_ix[i], row]
+        # Extract BA[:, row] non-zeros into work_ba_col at non-ref-bus
+        # positions. Iterates only the nonzeros of the BA column (typically
+        # 2 per arc) instead of scanning the full bus axis and bisecting
+        # the CSC for each entry.
+        fill!(work_ba_col, 0.0)
+        BA = vptdf.BA
+        bus_to_valid_idx = vptdf.bus_to_valid_idx
+        ba_rv = SparseArrays.rowvals(BA)
+        ba_nz = SparseArrays.nonzeros(BA)
+        @inbounds for k in SparseArrays.nzrange(BA, row)
+            valid_i = bus_to_valid_idx[ba_rv[k]]
+            valid_i > 0 || continue
+            work_ba_col[valid_i] = ba_nz[k]
         end
         lin_solve = _solve_factorization(K_solver, work_ba_col)
         fill!(temp_data, 0.0)
+        valid_ix = vptdf.valid_ix
         @inbounds for i in eachindex(valid_ix)
             temp_data[valid_ix[i]] = lin_solve[i]
         end
diff --git a/src/woodbury_kernel.jl b/src/woodbury_kernel.jl
index ad29ba3b4..7f183e125 100644
--- a/src/woodbury_kernel.jl
+++ b/src/woodbury_kernel.jl
@@ -83,6 +83,7 @@ function _compute_woodbury_factors_impl(
     BA::SparseArrays.SparseMatrixCSC{Float64, Int},
     arc_sus::Vector{Float64},
     valid_ix::Vector{Int},
+    bus_to_valid_idx::Vector{Int},
     modifications::Tuple{Vararg{ArcModification}},
 )::WoodburyFactors
     M = length(modifications)
@@ -97,13 +98,19 @@ function _compute_woodbury_factors_impl(
 
     # Compute Z[:,j] = B⁻¹ν_j for each modified arc
     Z = Matrix{Float64}(undef, n_bus, M)
+    ba_rv_outer = SparseArrays.rowvals(BA)
+    ba_nz_outer = SparseArrays.nonzeros(BA)
 
     for (j, mod) in enumerate(modifications)
         e = mod.arc_index
         b_e = arc_sus[e]
 
-        @inbounds for i in eachindex(valid_ix)
-            work_ba_col[i] = BA[valid_ix[i], e]
+        # Sparse-only extraction of BA[:, e] into work_ba_col.
+        fill!(work_ba_col, 0.0)
+        @inbounds for k in SparseArrays.nzrange(BA, e)
+            valid_i = bus_to_valid_idx[ba_rv_outer[k]]
+            valid_i > 0 || continue
+            work_ba_col[valid_i] = ba_nz_outer[k]
         end
         lin_solve = _solve_factorization(K, work_ba_col)
 
@@ -159,6 +166,7 @@ function _apply_woodbury_correction_impl(
     BA::SparseArrays.SparseMatrixCSC{Float64, Int},
     arc_sus::Vector{Float64},
     valid_ix::Vector{Int},
+    bus_to_valid_idx::Vector{Int},
     monitored_idx::Int,
     wf::WoodburyFactors,
 )::Vector{Float64}
@@ -177,10 +185,15 @@ function _apply_woodbury_correction_impl(
         return zeros(n_bus)
     end
 
-    # z_m = B⁻¹ν_m / b_mon_pre via KLU solve on BA column
+    # z_m = B⁻¹ν_m / b_mon_pre via sparse-only BA-column extraction + solve.
     b_mon_pre = arc_sus[monitored_idx]
-    @inbounds for i in eachindex(valid_ix)
-        work_ba_col[i] = BA[valid_ix[i], monitored_idx]
+    fill!(work_ba_col, 0.0)
+    ba_rv_mon = SparseArrays.rowvals(BA)
+    ba_nz_mon = SparseArrays.nonzeros(BA)
+    @inbounds for k in SparseArrays.nzrange(BA, monitored_idx)
+        valid_i = bus_to_valid_idx[ba_rv_mon[k]]
+        valid_i > 0 || continue
+        work_ba_col[valid_i] = ba_nz_mon[k]
     end
     lin_solve = _solve_factorization(K, work_ba_col)
 
@@ -226,7 +239,8 @@ function _compute_woodbury_factors(
     ) do K_solver, work_ba_col, temp_data
         _compute_woodbury_factors_impl(
             K_solver, work_ba_col, temp_data,
-            mat.BA, mat.arc_susceptances, mat.valid_ix, modifications,
+            mat.BA, mat.arc_susceptances, mat.valid_ix, mat.bus_to_valid_idx,
+            modifications,
         )
     end
 end
@@ -241,7 +255,8 @@ function _apply_woodbury_correction(
     ) do K_solver, work_ba_col, temp_data
         _apply_woodbury_correction_impl(
             K_solver, work_ba_col, temp_data,
-            mat.BA, mat.arc_susceptances, mat.valid_ix, monitored_idx, wf,
+            mat.BA, mat.arc_susceptances, mat.valid_ix, mat.bus_to_valid_idx,
+            monitored_idx, wf,
         )
     end
 end
diff --git a/test/PowerNetworkMatricesTests.jl b/test/PowerNetworkMatricesTests.jl
index b3857fe30..d3036822c 100644
--- a/test/PowerNetworkMatricesTests.jl
+++ b/test/PowerNetworkMatricesTests.jl
@@ -21,7 +21,7 @@ import Aqua
 Aqua.test_unbound_args(PowerNetworkMatrices)
 Aqua.test_undefined_exports(PowerNetworkMatrices)
 Aqua.test_ambiguities(PowerNetworkMatrices)
-Aqua.test_stale_deps(PowerNetworkMatrices; ignore = [:AppleAccelerate, :Pardiso])
+Aqua.test_stale_deps(PowerNetworkMatrices; ignore = [:Pardiso])
 Aqua.test_deps_compat(PowerNetworkMatrices)
 Aqua.find_persistent_tasks_deps(PowerNetworkMatrices)
 Aqua.test_persistent_tasks(PowerNetworkMatrices)
diff --git a/test/runtests.jl b/test/runtests.jl
index 1afa42b7c..ae21ff04f 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,12 +1,11 @@
-# load system-specific packages before PNM: ensures relevant PNM extensions will be loaded.
+# Load system-specific packages before PNM so relevant PNM extensions
+# precompile. The Apple Accelerate backend is now built in to PNM and does
+# not need a trigger package; only the MKL/Pardiso extension is gated on a
+# `using` call.
 @static if (Sys.ARCH === :x86_64 || Sys.ARCH === :i686) && !Sys.isapple()
     import Pkg
     Pkg.add("Pardiso")
     using Pardiso
-else
-    import Pkg
-    Pkg.add("AppleAccelerate")
-    using AppleAccelerate
 end
 
 using PowerNetworkMatrices
diff --git a/test/test_accelerate_wrapper.jl b/test/test_accelerate_wrapper.jl
new file mode 100644
index 000000000..ad6d8f5a2
--- /dev/null
+++ b/test/test_accelerate_wrapper.jl
@@ -0,0 +1,188 @@
+import LinearAlgebra
+import SparseArrays
+import Random
+
+const _AA_TEST_SEED = 0xABCDEF
+
+function _random_spd(n::Int; density::Float64 = 0.05, scale::Real = 4)
+    M = SparseArrays.sprandn(n, n, density)
+    return SparseArrays.sparse(M + M' + scale * n * LinearAlgebra.I)
+end
+
+@testset "AccelerateWrapper: smoke" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED)
+    n = 200
+    ABA = _random_spd(n)
+
+    cache = PNM.AccelerateWrapper.aa_factorize(ABA)
+    @test PNM.AccelerateWrapper.is_factored(cache)
+
+    b = randn(n)
+    x = copy(b)
+    PNM.AccelerateWrapper.solve!(cache, x)
+    @test isapprox(ABA * x, b; atol = 1e-9)
+
+    B = SparseArrays.sprandn(n, 120, 0.03)
+    for j in 1:5:120
+        B[:, j] .= 0
+    end
+    SparseArrays.dropzeros!(B)
+    out = zeros(n, 120)
+    PNM.AccelerateWrapper.solve_sparse!(cache, B, out)
+    @test isapprox(out, ABA \ Matrix(B); atol = 1e-9)
+end
+
+@testset "AccelerateWrapper: _create_factorization dispatch" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED)
+    n = 200
+    ABA = _random_spd(n)
+
+    K_klu = PNM._create_factorization(PNM.KLUSolver(), ABA)
+    K_aa = PNM._create_factorization(PNM.AppleAccelerateSolver(), ABA)
+    @test K_klu isa PNM.KLULinSolveCache{Float64}
+    @test K_aa isa PNM.AAFactorCache
+
+    b_klu = randn(n)
+    b_aa = copy(b_klu)
+    PNM._solve_factorization(K_klu, b_klu)
+    PNM._solve_factorization(K_aa, b_aa)
+    @test isapprox(b_klu, b_aa; atol = 1e-9)
+end
+
+@testset "AccelerateWrapper: with_solver resolves to concrete method" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED)
+    n = 200
+    ABA = _random_spd(n)
+    K_aa = PNM._create_factorization(PNM.AppleAccelerateSolver(), ABA)
+
+    work = [zeros(n)]
+    tmp = [zeros(n)]
+    lk = ReentrantLock()
+    b = randn(n)
+    copyto!(work[1], b)
+
+    result = PNM.with_solver(K_aa, work, tmp, lk) do K, wba, _td
+        PNM._solve_factorization(K, wba)
+        return copy(wba)
+    end
+    @test isapprox(ABA * result, b; atol = 1e-9)
+
+    m = which(
+        PNM.with_solver,
+        (
+            typeof(identity),
+            PNM.AAFactorCache,
+            Vector{Vector{Float64}},
+            Vector{Vector{Float64}},
+            ReentrantLock,
+        ),
+    )
+    @test Base.unwrap_unionall(m.sig).parameters[3] === PNM.AAFactorCache
+end
+
+@testset "AccelerateWrapper: structural-symmetry check" begin
+    PNM._has_apple_accelerate_backend() || return
+    n = 8
+    # Build a clearly non-symmetric matrix with a missing mirror entry.
+    A = SparseArrays.sparse(Float64.(LinearAlgebra.I(n)))
+    A[2, 5] = 1.0
+    SparseArrays.dropzeros!(A)
+
+    @test_throws ArgumentError PNM.AAFactorCache(A)
+    @test_throws ArgumentError PNM.AccelerateWrapper.aa_factorize(A)
+
+    # Bypass: caller asserts symmetry; AAFactorCache returns without throwing.
+    cache = PNM.AAFactorCache(A; check_symmetry = false)
+    @test cache isa PNM.AAFactorCache
+
+    # symbolic_factor! must also honor the cache's check_symmetry flag.
+    Random.seed!(_AA_TEST_SEED)
+    ABA = _random_spd(n; density = 0.4)
+    cache_strict = PNM.AAFactorCache(ABA)
+    @test_throws ArgumentError PNM.AccelerateWrapper.symbolic_factor!(cache_strict, A)
+end
+
+@testset "AccelerateWrapper: KLU vs AA per-column solve parity" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED)
+    n = 200
+    nbranches = 40
+    ABA = _random_spd(n)
+    BA = SparseArrays.sprandn(n, nbranches, 0.03)
+
+    K_klu = PNM._create_factorization(PNM.KLUSolver(), ABA)
+    K_aa = PNM._create_factorization(PNM.AppleAccelerateSolver(), ABA)
+
+    work_klu = zeros(n)
+    work_aa = zeros(n)
+    for col in 1:nbranches
+        work_klu .= Vector(BA[:, col])
+        copyto!(work_aa, work_klu)
+        PNM._solve_factorization(K_klu, work_klu)
+        PNM._solve_factorization(K_aa, work_aa)
+        @test isapprox(work_klu, work_aa; atol = 1e-9)
+    end
+end
+
+@testset "AccelerateWrapper: solve_w_refinement matches direct solve on AA cache" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED)
+    n = 60
+    ABA = _random_spd(n)
+    cache = PNM.AccelerateWrapper.aa_factorize(ABA; check_symmetry = false)
+
+    x_true = randn(n)
+    b = ABA * x_true
+    X = PNM.solve_w_refinement(cache, ABA, b)
+    @test isapprox(X, x_true; atol = 1e-10)
+end
+
+@testset "AccelerateWrapper: solve_w_refinement! recovers ill-conditioned AA solve" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED + 1)
+    n = 80
+    # Symmetric ill-conditioned tridiagonal — back-solve leaves residual that
+    # refinement closes. Less pathological than the KLU equivalent because
+    # AA's pivoted-LDLT has a higher pivot floor than KLU's BTF + LU on a
+    # nearly-defective tridiagonal.
+    A = SparseArrays.spdiagm(
+        0 => fill(1.0, n),
+        1 => fill(1.0 - 1e-6, n - 1),
+        -1 => fill(1.0 - 1e-6, n - 1),
+    )
+    cache = PNM.AccelerateWrapper.aa_factorize(A; check_symmetry = false)
+
+    x_true = randn(n)
+    b = A * x_true
+    X = zeros(n)
+    PNM.solve_w_refinement!(cache, A, X, b; tol = 1e-12)
+    @test isapprox(X, x_true; atol = 1e-8)
+end
+
+@testset "AccelerateWrapper: solve_w_refinement requires a factored AA cache" begin
+    PNM._has_apple_accelerate_backend() || return
+    n = 12
+    A = SparseArrays.sparse(Float64.(LinearAlgebra.I(n)))
+    cache = PNM.AAFactorCache(A; check_symmetry = false)  # not factored
+    b = randn(n)
+    @test_throws ErrorException PNM.solve_w_refinement(cache, A, b)
+end
+
+@testset "AccelerateWrapper: solve_w_refinement KLU vs AA parity" begin
+    PNM._has_apple_accelerate_backend() || return
+    Random.seed!(_AA_TEST_SEED + 2)
+    n = 50
+    A = _random_spd(n)
+    x_true = randn(n)
+    b = A * x_true
+
+    K_klu = PNM.klu_factorize(A)
+    K_aa = PNM.AccelerateWrapper.aa_factorize(A; check_symmetry = false)
+
+    X_klu = PNM.solve_w_refinement(K_klu, A, b)
+    X_aa = PNM.solve_w_refinement(K_aa, A, b)
+    @test isapprox(X_klu, X_aa; atol = 1e-10)
+end
diff --git a/test/test_klu_wrapper.jl b/test/test_klu_wrapper.jl
index a4be6d48c..902c4a088 100644
--- a/test/test_klu_wrapper.jl
+++ b/test/test_klu_wrapper.jl
@@ -1,4 +1,6 @@
 import SparseArrays
+import LinearAlgebra
+import Random
 
 @testset "KLU wrapper: real round-trip and refactor" begin
     n = 50
@@ -232,3 +234,168 @@ end
     alloc_warm = @allocated PNM.solve_sparse!(cache, B, out; block = 32)
     @test alloc_warm < n * size(B, 2) * sizeof(Float64) ÷ 4
 end
+
+# ---------------------------------------------------------------------------
+# Int32 index-type path
+# ---------------------------------------------------------------------------
+
+@testset "KLU wrapper: Int32 real round-trip and refactor" begin
+    n = 50
+    rng_vals = collect(1.0:n)
+    A64 = SparseArrays.spdiagm(0 => rng_vals .+ 1.0,
+        1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1))
+    A = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64)
+    x = collect(1.0:n)
+    b = A64 * x
+
+    cache = PNM.klu_factorize(A)
+    @test PNM.is_factored(cache)
+    @test cache isa PNM.KLUWrapper.KLULinSolveCache{Float64, Int32}
+    @test size(cache) == (n, n)
+
+    y = copy(b)
+    PNM.solve!(cache, y)
+    @test isapprox(y, x, atol = 1e-10)
+
+    # Refactor with new values, same pattern.
+    A2_64 = SparseArrays.spdiagm(0 => rng_vals .+ 2.0,
+        1 => fill(0.2, n - 1), -1 => fill(0.2, n - 1))
+    A2 = SparseArrays.SparseMatrixCSC{Float64, Int32}(A2_64)
+    x2 = randn(n)
+    b2 = A2_64 * x2
+    PNM.numeric_refactor!(cache, A2)
+    y2 = copy(b2)
+    PNM.solve!(cache, y2)
+    @test isapprox(y2, x2, atol = 1e-9)
+end
+
+@testset "KLU wrapper: Int32 vs Int64 solves agree bit-for-bit" begin
+    # KLU is deterministic; identical entries and identical permutation give
+    # identical solves regardless of the C entry point's index width.
+    Random.seed!(0xC0FFEE)
+    n = 200
+    A64 = SparseArrays.spdiagm(0 => 4.0 .+ rand(n), 1 => 0.1 .* rand(n - 1),
+        -1 => 0.1 .* rand(n - 1))
+    A32 = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64)
+
+    c64 = PNM.klu_factorize(A64)
+    c32 = PNM.klu_factorize(A32)
+    @test c64 isa PNM.KLUWrapper.KLULinSolveCache{Float64, Int64}
+    @test c32 isa PNM.KLUWrapper.KLULinSolveCache{Float64, Int32}
+
+    b = randn(n)
+    y64 = copy(b)
+    y32 = copy(b)
+    PNM.solve!(c64, y64)
+    PNM.solve!(c32, y32)
+    @test y64 == y32   # bit-equal
+end
+
+@testset "KLU wrapper: Int32 solve_sparse! agrees with dense reference" begin
+    Random.seed!(0xBEEF)
+    n = 80
+    A_dense = Matrix(SparseArrays.sprandn(n, n, 0.1)) + 4 * LinearAlgebra.I
+    A = SparseArrays.SparseMatrixCSC{Float64, Int32}(SparseArrays.sparse(A_dense))
+    cache = PNM.klu_factorize(A)
+
+    B_dense = SparseArrays.sprandn(n, 20, 0.1)
+    B = SparseArrays.SparseMatrixCSC{Float64, Int32}(B_dense)
+
+    out = PNM.solve_sparse(cache, B)
+    @test isapprox(out, A_dense \ Matrix(B_dense); atol = 1e-9)
+end
+
+@testset "KLU wrapper: Int32 numeric_refactor! is allocation-free" begin
+    n = 100
+    A64 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0,
+        1 => fill(0.05, n - 1), -1 => fill(0.05, n - 1))
+    A = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64)
+    cache = PNM.klu_factorize(A)
+
+    A2_64 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.5,
+        1 => fill(0.07, n - 1), -1 => fill(0.07, n - 1))
+    A2 = SparseArrays.SparseMatrixCSC{Float64, Int32}(A2_64)
+
+    # Warm up specialization on the Int32 path.
+    PNM.numeric_refactor!(cache, A2)
+    alloc = @allocated PNM.numeric_refactor!(cache, A2)
+    @test alloc == 0
+end
+
+# ---------------------------------------------------------------------------
+# Performance-knob surface
+# ---------------------------------------------------------------------------
+
+@testset "KLU wrapper: sort_factors!/condest!/rcond! work on both index types" begin
+    n = 60
+    A64 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0,
+        1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1))
+    A32 = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64)
+
+    for cache in (PNM.klu_factorize(A64), PNM.klu_factorize(A32))
+        PNM.KLUWrapper.sort_factors!(cache)
+        # Subsequent solve still returns the right answer.
+        b = randn(n)
+        y = copy(b)
+        PNM.solve!(cache, y)
+        @test isapprox(y, A64 \ b, atol = 1e-10)
+
+        c = PNM.KLUWrapper.condest!(cache)
+        r = PNM.KLUWrapper.rcond!(cache)
+        @test c > 0
+        @test 0 < r <= 1.0
+    end
+end
+
+# ---------------------------------------------------------------------------
+# Iterative refinement
+# ---------------------------------------------------------------------------
+
+@testset "KLU wrapper: solve_w_refinement matches direct solve on well-conditioned A" begin
+    Random.seed!(2)
+    n = 40
+    A = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0,
+        1 => fill(0.1, n - 1), -1 => fill(0.1, n - 1))
+    cache = PNM.klu_factorize(A)
+
+    x_true = randn(n)
+    b = A * x_true
+    X = PNM.solve_w_refinement(cache, A, b)
+    @test isapprox(X, x_true, atol = 1e-12)
+end
+
+@testset "KLU wrapper: solve_w_refinement! recovers ill-conditioned solve" begin
+    # Build a noticeably ill-conditioned tridiagonal so a single back-solve
+    # leaves residual headroom that refinement actually closes.
+    Random.seed!(3)
+    n = 50
+    A = SparseArrays.spdiagm(0 => fill(1.0, n), 1 => fill(1.0 - 1e-10, n - 1))
+    cache = PNM.klu_factorize(A)
+
+    x_true = randn(n)
+    b = A * x_true
+    X = zeros(n)
+    PNM.solve_w_refinement!(cache, A, X, b; tol = 1e-12)
+    @test isapprox(X, x_true, atol = 1e-8)
+end
+
+@testset "KLU wrapper: solve_w_refinement requires a factored cache" begin
+    n = 10
+    A = SparseArrays.spdiagm(0 => 1.0:n)
+    cache = PNM.KLUWrapper.KLULinSolveCache(A)  # not factored
+    b = randn(n)
+    @test_throws ErrorException PNM.solve_w_refinement(cache, A, b)
+end
+
+@testset "KLU wrapper: solve_w_refinement works on Int32 cache" begin
+    Random.seed!(4)
+    n = 30
+    A64 = SparseArrays.spdiagm(0 => collect(1.0:n) .+ 1.0,
+        1 => fill(0.05, n - 1), -1 => fill(0.05, n - 1))
+    A = SparseArrays.SparseMatrixCSC{Float64, Int32}(A64)
+    cache = PNM.klu_factorize(A)
+    x_true = randn(n)
+    b = A64 * x_true
+    X = PNM.solve_w_refinement(cache, A, b)
+    @test isapprox(X, x_true, atol = 1e-10)
+end
diff --git a/test/test_lodf.jl b/test/test_lodf.jl
index a5a52d614..bc7d4a6a0 100644
--- a/test/test_lodf.jl
+++ b/test/test_lodf.jl
@@ -56,7 +56,7 @@
         @test isapprox(sum(total_error3), 0.0, atol = 1e-3)
     end
 
-    if PowerNetworkMatrices._has_apple_accelerate_ext()
+    if PowerNetworkMatrices._has_apple_accelerate_backend()
         L5NS_from_ptdf4 = LODF(A, P5; linear_solver = "AppleAccelerate")
         @test getindex(L5NS_from_ptdf4, "5", "6") - -0.3071 <= 1e-4
         total_error4 = 0.0
diff --git a/test/test_network_modification.jl b/test/test_network_modification.jl
index 7bdb278ad..2b158fd4a 100644
--- a/test/test_network_modification.jl
+++ b/test/test_network_modification.jl
@@ -258,8 +258,8 @@ end
     # serial baseline. The KLU path is exercised by the threaded testsets in
     # `test/test_virtual_modf.jl`; this complements that coverage on the
     # AppleAccelerate path.
-    if !PowerNetworkMatrices._has_apple_accelerate_ext()
-        @info "Skipping: AppleAccelerate extension not loaded."
+    if !PowerNetworkMatrices._has_apple_accelerate_backend()
+        @info "Skipping: AppleAccelerate backend not available on this platform."
         return
     end
     if Threads.nthreads() < 2
diff --git a/test/test_partial_lodf.jl b/test/test_partial_lodf.jl
index 53363593f..e34eb8d0a 100644
--- a/test/test_partial_lodf.jl
+++ b/test/test_partial_lodf.jl
@@ -89,7 +89,9 @@ end
 
 @testset "Partial LODF: half-susceptance matches rebuilt ground truth" begin
     sys5 = PSB.build_system(PSB.PSITestSystems, "c_sys5")
-    vlodf = VirtualLODF(sys5)
+    # Pin KLU here: the ground-truth path below calls `PNM.solve!` directly on
+    # `vlodf.K`, which is dispatched on `KLULinSolveCache` only.
+    vlodf = VirtualLODF(sys5; linear_solver = "KLU")
     e = 1  # test arc index
     b_e = vlodf.arc_susceptances[e]
 
diff --git a/test/test_powerflow_matrix_types.jl b/test/test_powerflow_matrix_types.jl
index 870f650c5..33e634208 100644
--- a/test/test_powerflow_matrix_types.jl
+++ b/test/test_powerflow_matrix_types.jl
@@ -2,7 +2,7 @@
     sys = PSB.build_system(PSB.PSITestSystems, "c_sys5")
 
     @testset "DC_ABA_Matrix_Factorized" begin
-        if PNM._has_apple_accelerate_ext()
+        if PNM._has_apple_accelerate_backend()
             M = ABA_Matrix(sys; factorize = true)
             @test M isa PNM.DC_ABA_Matrix_Factorized
         end
@@ -14,7 +14,7 @@
     end
 
     @testset "DC_PTDF_Matrix" begin
-        if PNM._has_apple_accelerate_ext()
+        if PNM._has_apple_accelerate_backend()
             @test PTDF(sys; linear_solver = "AppleAccelerate") isa PNM.DC_PTDF_Matrix
         end
         @test PTDF(sys; linear_solver = "KLU") isa PNM.DC_PTDF_Matrix
@@ -22,7 +22,7 @@
     end
 
     @testset "DC_vPTDF_Matrix" begin
-        if PNM._has_apple_accelerate_ext()
+        if PNM._has_apple_accelerate_backend()
             @test VirtualPTDF(sys; linear_solver = "AppleAccelerate") isa
                   PNM.DC_vPTDF_Matrix
         end
diff --git a/test/test_ptdf.jl b/test/test_ptdf.jl
index 749cbf7a0..fb427491f 100644
--- a/test/test_ptdf.jl
+++ b/test/test_ptdf.jl
@@ -8,8 +8,8 @@
         @info "Skipped MKLPardiso tests (extension not loaded)"
         continue
     end
-    if !PowerNetworkMatrices._has_apple_accelerate_ext() && solver == "AppleAccelerate"
-        @info "Skipped AppleAccelerate tests (extension not loaded)"
+    if !PowerNetworkMatrices._has_apple_accelerate_backend() && solver == "AppleAccelerate"
+        @info "Skipped AppleAccelerate tests (backend unavailable on this platform)"
         continue
     end
     sys5 = PSB.build_system(PSB.PSITestSystems, "c_sys5")
diff --git a/test/test_virtual_lodf.jl b/test/test_virtual_lodf.jl
index 1c70d05d6..ad4b46d58 100644
--- a/test/test_virtual_lodf.jl
+++ b/test/test_virtual_lodf.jl
@@ -147,3 +147,29 @@ end
     end
     @test test_value
 end
+
+@testset "Test Virtual LODF with Apple Accelerate" begin
+    if !PNM._has_apple_accelerate_backend()
+        @info "Skipped AppleAccelerate VirtualLODF tests (backend unavailable on this platform)"
+    else
+        sys = PSB.build_system(PSB.PSITestSystems, "c_sys14")
+
+        vlodf_aa = VirtualLODF(sys; linear_solver = "AppleAccelerate")
+        vlodf_klu = VirtualLODF(sys; linear_solver = "KLU")
+
+        @test contains(string(typeof(vlodf_aa.K)), "AAFactorCache")
+        @test vlodf_klu.K isa PNM.KLULinSolveCache{Float64}
+
+        arc_axis = PNM.get_arc_axis(vlodf_aa)
+        @test arc_axis == PNM.get_arc_axis(vlodf_klu)
+        for arc in arc_axis
+            row_aa = vlodf_aa[arc, :]
+            row_klu = vlodf_klu[arc, :]
+            @test isapprox(row_aa, row_klu, atol = 1e-9)
+        end
+
+        # macOS default should resolve to AppleAccelerate.
+        vlodf_default = VirtualLODF(sys)
+        @test contains(string(typeof(vlodf_default.K)), "AAFactorCache")
+    end
+end
diff --git a/test/test_virtual_modf.jl b/test/test_virtual_modf.jl
index 4dc9464c8..d97c72ae0 100644
--- a/test/test_virtual_modf.jl
+++ b/test/test_virtual_modf.jl
@@ -436,6 +436,39 @@ end
     end
 end
 
+@testset "VirtualMODF with Apple Accelerate backend matches KLU" begin
+    if !PNM._has_apple_accelerate_backend()
+        @info "Skipped AppleAccelerate VirtualMODF tests (backend unavailable on this platform)"
+    else
+        sys, _ = _build_c_sys14_with_outages()
+
+        vmodf_aa = VirtualMODF(sys; linear_solver = "AppleAccelerate")
+        vmodf_klu = VirtualMODF(sys; linear_solver = "KLU")
+
+        # Factorization should be the AA cache type.
+        @test contains(string(typeof(vmodf_aa.K)), "AAFactorCache")
+        @test vmodf_klu.K isa PNM.KLULinSolveCache{Float64}
+
+        registered_aa = get_registered_contingencies(vmodf_aa)
+        registered_klu = get_registered_contingencies(vmodf_klu)
+        @test !isempty(registered_aa)
+        @test keys(registered_aa) == keys(registered_klu)
+
+        # Compare post-contingency rows for every registered contingency
+        # against the KLU build, sweeping all monitored arcs.
+        arc_axis = PNM.get_arc_axis(vmodf_aa)
+        @test arc_axis == PNM.get_arc_axis(vmodf_klu)
+        for (uuid, ctg_aa) in registered_aa
+            ctg_klu = registered_klu[uuid]
+            for arc in arc_axis
+                row_aa = vmodf_aa[arc, ctg_aa]
+                row_klu = vmodf_klu[arc, ctg_klu]
+                @test isapprox(row_aa, row_klu, atol = 1e-9)
+            end
+        end
+    end
+end
+
 @testset "VirtualMODF concurrent getindex on the SAME (arc, ctg) is consistent" begin
     # Complements the previous testset: there, each (arc, ctg) pair appears
     # once in the work list, so only the `woodbury_cache` first-call race is
diff --git a/test/test_virtual_ptdf.jl b/test/test_virtual_ptdf.jl
index 5993d4d03..83d7ebb9e 100644
--- a/test/test_virtual_ptdf.jl
+++ b/test/test_virtual_ptdf.jl
@@ -1,8 +1,8 @@
 # if it fails, we don't want the terminal to be flooded with errors, therefore failfast=true
 @testset "Test Virtual PTDF matrices" for solver in
                                           ("KLU", "AppleAccelerate")
-    if !PowerNetworkMatrices._has_apple_accelerate_ext() && solver == "AppleAccelerate"
-        @info "Skipped AppleAccelerate tests (extension not loaded)"
+    if !PowerNetworkMatrices._has_apple_accelerate_backend() && solver == "AppleAccelerate"
+        @info "Skipped AppleAccelerate tests (backend unavailable on this platform)"
         continue
     end
     sys = PSB.build_system(
@@ -185,8 +185,8 @@ end
 end
 
 @testset "Test Virtual PTDF with Apple Accelerate" begin
-    if !PowerNetworkMatrices._has_apple_accelerate_ext()
-        @info "Skipped AppleAccelerate tests (extension not loaded)"
+    if !PowerNetworkMatrices._has_apple_accelerate_backend()
+        @info "Skipped AppleAccelerate tests (backend unavailable on this platform)"
     else
         # Test VirtualPTDF with Apple Accelerate solver
         sys = PSB.build_system(PSB.PSITestSystems, "c_sys14")
@@ -194,8 +194,8 @@ end
         # Create VirtualPTDF with AppleAccelerate
         vptdf_aa = VirtualPTDF(sys; linear_solver = "AppleAccelerate")
 
-        # Verify the factorization type is AAFactorization
-        @test contains(string(typeof(vptdf_aa.K)), "AAFactorization")
+        # Verify the factorization type is AAFactorCache
+        @test contains(string(typeof(vptdf_aa.K)), "AAFactorCache")
 
         # Create reference VirtualPTDF with KLU
         vptdf_klu = VirtualPTDF(sys; linear_solver = "KLU")
@@ -209,7 +209,7 @@ end
 
         # Test with tolerance
         vptdf_aa_tol = VirtualPTDF(sys; linear_solver = "AppleAccelerate", tol = 1e-2)
-        @test contains(string(typeof(vptdf_aa_tol.K)), "AAFactorization")
+        @test contains(string(typeof(vptdf_aa_tol.K)), "AAFactorCache")
 
         # Test with distributed slack
         buscount = length(PSY.get_available_components(PSY.ACBus, sys))
@@ -217,7 +217,7 @@ end
         dist_slack = Dict(i => dist_slack_factor for i in 1:buscount)
         vptdf_aa_slack =
             VirtualPTDF(sys; linear_solver = "AppleAccelerate", dist_slack = dist_slack)
-        @test contains(string(typeof(vptdf_aa_slack.K)), "AAFactorization")
+        @test contains(string(typeof(vptdf_aa_slack.K)), "AAFactorCache")
 
         # Compare with KLU for distributed slack case
         vptdf_klu_slack = VirtualPTDF(sys; linear_solver = "KLU", dist_slack = dist_slack)