diff --git a/Project.toml b/Project.toml index ebf03c388..556697539 100644 --- a/Project.toml +++ b/Project.toml @@ -3,6 +3,7 @@ uuid = "d58978e5-989f-55fb-8d15-ea34adc7bf54" version = "0.18.11" [deps] +ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" @@ -23,6 +24,19 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TaskLocalValues = "ed4db957-447d-4319-bfb6-7fa9ae7ecf34" TimespanLogging = "a526e669-04d3-4846-9525-c66122c55f63" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" +libblastrampoline_jll = "8e850b90-86db-534c-a0d3-1478176c7d93" +libcoreblas_jll = "339d4f0c-89b5-5ae2-b52c-218a0e582e15" + +[weakdeps] +Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" + +[extensions] +GraphVizExt = "GraphViz" +GraphVizSimpleExt = "Colors" +PlotsExt = ["DataFrames", "Plots"] [compat] Colors = "0.12" @@ -43,19 +57,8 @@ TaskLocalValues = "0.1" TimespanLogging = "0.1" julia = "1.8" -[extensions] -GraphVizExt = "GraphViz" -GraphVizSimpleExt = "Colors" -PlotsExt = ["DataFrames", "Plots"] - [extras] Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" - -[weakdeps] -Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" -DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0" -Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" diff --git a/src/Dagger.jl b/src/Dagger.jl index 97f7dd44d..2cdc61498 100644 --- a/src/Dagger.jl +++ b/src/Dagger.jl @@ -7,12 +7,12 @@ import SparseArrays: sprand, SparseMatrixCSC import MemPool import MemPool: DRef, FileRef, poolget, poolset -import Base: collect, reduce +import Base: collect, reduce, require_one_based_indexing import Distributed import Distributed: Future, RemoteChannel, myid, workers, nworkers, procs, remotecall, remotecall_wait, remotecall_fetch import LinearAlgebra -import LinearAlgebra: Adjoint, BLAS, Diagonal, Bidiagonal, Tridiagonal, LAPACK, LowerTriangular, PosDefException, Transpose, UpperTriangular, UnitLowerTriangular, UnitUpperTriangular, diagind, ishermitian, issymmetric +import LinearAlgebra: Adjoint, BLAS, Diagonal, Bidiagonal, Tridiagonal, LAPACK, LowerTriangular, PosDefException, Transpose, UpperTriangular, UnitLowerTriangular, UnitUpperTriangular, diagind, ishermitian, issymmetric, chkstride1 import UUIDs: UUID, uuid4 @@ -77,9 +77,14 @@ include("array/setindex.jl") include("array/matrix.jl") include("array/sparse_partition.jl") include("array/sort.jl") + +# Linear algebra include("array/linalg.jl") include("array/mul.jl") +include("array/trapezoidal.jl") +include("array/triangular.jl") include("array/cholesky.jl") +include("array/qr.jl") # Visualization include("visualization.jl") diff --git a/src/array/alloc.jl b/src/array/alloc.jl index 5b881bf0d..4155f0c6f 100644 --- a/src/array/alloc.jl +++ b/src/array/alloc.jl @@ -4,7 +4,8 @@ export partition mutable struct AllocateArray{T,N} <: ArrayOp{T,N} eltype::Type{T} - f::Function + f + want_index::Bool domain::ArrayDomain{N} domainchunks partitioning::AbstractBlocks @@ -23,17 +24,45 @@ function partition(p::AbstractBlocks, dom::ArrayDomain) map(_cumlength, map(length, indexes(dom)), p.blocksize)) end +function allocate_array(f, T, idx, sz) + new_f = allocate_array_func(thunk_processor(), f) + return new_f(idx, T, sz) +end +function allocate_array(f, T, sz) + new_f = allocate_array_func(thunk_processor(), f) + return new_f(T, sz) +end +allocate_array_func(::Processor, f) = f function stage(ctx, a::AllocateArray) - alloc(idx, sz) = a.f(idx, a.eltype, sz) - thunks = [Dagger.@spawn alloc(i, size(x)) for (i, x) in enumerate(a.domainchunks)] + if a.want_index + thunks = [Dagger.@spawn allocate_array(a.f, a.eltype, i, size(x)) for (i, x) in enumerate(a.domainchunks)] + else + thunks = [Dagger.@spawn allocate_array(a.f, a.eltype, size(x)) for (i, x) in enumerate(a.domainchunks)] + end return DArray(a.eltype, a.domain, a.domainchunks, thunks, a.partitioning) end const BlocksOrAuto = Union{Blocks{N} where N, AutoBlocks} +function DArray{T}(::UndefInitializer, p::Blocks, dims::Dims) where {T} + d = ArrayDomain(map(x->1:x, dims)) + part = partition(p, d) + f = function (T, sz) + Array{T, length(sz)}(undef, sz...) + end + a = AllocateArray(T, f, false, d, part, p) + return _to_darray(a) +end + +DArray(::UndefInitializer, p::BlocksOrAuto, dims::Integer...) = DArray{Float64}(undef, p, dims) +DArray(::UndefInitializer, p::BlocksOrAuto, dims::Dims) = DArray{Float64}(undef, p, dims) +DArray{T}(::UndefInitializer, p::BlocksOrAuto, dims::Integer...) where {T} = DArray{T}(undef, p, dims) +DArray{T}(::UndefInitializer, p::BlocksOrAuto, dims::Dims) where {T} = DArray{T}(undef, p, dims) +DArray{T}(::UndefInitializer, p::AutoBlocks, dims::Dims) where {T} = DArray{T}(undef, auto_blocks(dims), dims) + function Base.rand(p::Blocks, eltype::Type, dims::Dims) d = ArrayDomain(map(x->1:x, dims)) - a = AllocateArray(eltype, (_, x...) -> rand(x...), d, partition(p, d), p) + a = AllocateArray(eltype, rand, false, d, partition(p, d), p) return _to_darray(a) end Base.rand(p::BlocksOrAuto, T::Type, dims::Integer...) = rand(p, T, dims) @@ -45,7 +74,7 @@ Base.rand(::AutoBlocks, eltype::Type, dims::Dims) = function Base.randn(p::Blocks, eltype::Type, dims::Dims) d = ArrayDomain(map(x->1:x, dims)) - a = AllocateArray(eltype, (_, x...) -> randn(x...), d, partition(p, d), p) + a = AllocateArray(eltype, randn, false, d, partition(p, d), p) return _to_darray(a) end Base.randn(p::BlocksOrAuto, T::Type, dims::Integer...) = randn(p, T, dims) @@ -57,7 +86,7 @@ Base.randn(::AutoBlocks, eltype::Type, dims::Dims) = function sprand(p::Blocks, eltype::Type, dims::Dims, sparsity::AbstractFloat) d = ArrayDomain(map(x->1:x, dims)) - a = AllocateArray(eltype, (_, T, _dims) -> sprand(T, _dims..., sparsity), d, partition(p, d), p) + a = AllocateArray(eltype, (T, _dims) -> sprand(T, _dims..., sparsity), false, d, partition(p, d), p) return _to_darray(a) end sprand(p::BlocksOrAuto, T::Type, dims_and_sparsity::Real...) = @@ -73,7 +102,7 @@ sprand(::AutoBlocks, eltype::Type, dims::Dims, sparsity::AbstractFloat) = function Base.ones(p::Blocks, eltype::Type, dims::Dims) d = ArrayDomain(map(x->1:x, dims)) - a = AllocateArray(eltype, (_, x...) -> ones(x...), d, partition(p, d), p) + a = AllocateArray(eltype, ones, false, d, partition(p, d), p) return _to_darray(a) end Base.ones(p::BlocksOrAuto, T::Type, dims::Integer...) = ones(p, T, dims) @@ -85,7 +114,7 @@ Base.ones(::AutoBlocks, eltype::Type, dims::Dims) = function Base.zeros(p::Blocks, eltype::Type, dims::Dims) d = ArrayDomain(map(x->1:x, dims)) - a = AllocateArray(eltype, (_, x...) -> zeros(x...), d, partition(p, d), p) + a = AllocateArray(eltype, zeros, false, d, partition(p, d), p) return _to_darray(a) end Base.zeros(p::BlocksOrAuto, T::Type, dims::Integer...) = zeros(p, T, dims) diff --git a/src/array/coreblas/coreblas_gemm.jl b/src/array/coreblas/coreblas_gemm.jl new file mode 100644 index 000000000..ad77be40d --- /dev/null +++ b/src/array/coreblas/coreblas_gemm.jl @@ -0,0 +1,21 @@ +using libblastrampoline_jll +using LinearAlgebra +using libcoreblas_jll + +for (gemm, T) in + ((:coreblas_dgemm, Float64), + (:coreblas_sgemm, Float32), + (:coreblas_cgemm, ComplexF32), + (:coreblas_zgemm, ComplexF64)) + @eval begin + function coreblas_gemm!(transa::Int64, transb::Int64, + alpha::$T, A::AbstractMatrix{$T}, B::AbstractMatrix{$T}, beta::$T, C::AbstractMatrix{$T}) + m, k = size(A) + k, n = size(B) + ccall(($gemm, "libcoreblas.so"), Cvoid, + (Int64, Int64, Int64, Int64, Int64, $T, Ptr{$T}, Int64, Ptr{$T}, Int64, + $T, Ptr{$T}, Int64), + transa, transb, m, n, k, alpha, A, m, B, k, beta, C, m) + end + end +end diff --git a/src/array/coreblas/coreblas_geqrt.jl b/src/array/coreblas/coreblas_geqrt.jl new file mode 100644 index 000000000..6b3ac01f7 --- /dev/null +++ b/src/array/coreblas/coreblas_geqrt.jl @@ -0,0 +1,32 @@ +for (geqrt, T) in + ((:coreblas_dgeqrt, Float64), + (:coreblas_sgeqrt, Float32), + (:coreblas_cgeqrt, ComplexF32), + (:coreblas_zgeqrt, ComplexF64)) + @eval begin + function coreblas_geqrt!(A::AbstractMatrix{$T}, + Tau::AbstractMatrix{$T}) + require_one_based_indexing(A, Tau) + chkstride1(A) + m, n = size(A) + ib, nb = size(Tau) + lda = max(1, stride(A,2)) + ldt = max(1, stride(Tau,2)) + work = Vector{$T}(undef, (ib)*n) + ttau = Vector{$T}(undef, n) + + err = ccall(($(QuoteNode(geqrt)), :libcoreblas), Int64, + (Int64, Int64, Int64, + Ptr{$T}, Int64, Ptr{$T}, Int64, + Ptr{$T}, Ptr{$T}), + m, n, ib, + A, lda, + Tau, ldt, + ttau, work) + if err != 0 + throw(ArgumentError("coreblas_geqrt failed. Error number: $err")) + end + end + end +end + diff --git a/src/array/coreblas/coreblas_ormqr.jl b/src/array/coreblas/coreblas_ormqr.jl new file mode 100644 index 000000000..cba7237e0 --- /dev/null +++ b/src/array/coreblas/coreblas_ormqr.jl @@ -0,0 +1,45 @@ +for (geormqr, T) in + ((:coreblas_dormqr, Float64), + (:coreblas_sormqr, Float32), + (:coreblas_zunmqr, ComplexF64), + (:coreblas_cunmqr, ComplexF32)) + @eval begin + function coreblas_ormqr!(side::Char, trans::Char, A::AbstractMatrix{$T}, + Tau::AbstractMatrix{$T}, C::AbstractMatrix{$T}) + + m, n = size(C) + ib, nb = size(Tau) + k = nb + if $T <: Complex + transnum = trans == 'N' ? 111 : 113 + else + transnum = trans == 'N' ? 111 : 112 + end + sidenum = side == 'L' ? 141 : 142 + + lda = max(1, stride(A,2)) + ldt = max(1, stride(Tau,2)) + ldc = max(1, stride(C,2)) + ldwork = side == 'L' ? n : m + work = Vector{$T}(undef, ib*nb) + + + err = ccall(($(QuoteNode(geormqr)), :libcoreblas), Int64, + (Int64, Int64, Int64, Int64, + Int64, Int64, + Ptr{$T}, Int64, Ptr{$T}, Int64, + Ptr{$T}, Int64, Ptr{$T}, Int64), + sidenum, transnum, + m, n, + k, ib, + A, lda, + Tau, ldt, + C, ldc, + work, ldwork) + if err != 0 + throw(ArgumentError("coreblas_ormqr failed. Error number: $err")) + end + end + end +end + diff --git a/src/array/coreblas/coreblas_tsmqr.jl b/src/array/coreblas/coreblas_tsmqr.jl new file mode 100644 index 000000000..df92c555a --- /dev/null +++ b/src/array/coreblas/coreblas_tsmqr.jl @@ -0,0 +1,49 @@ +for (getsmqr, T) in + ((:coreblas_dtsmqr, Float64), + (:coreblas_ctsmqr, ComplexF32), + (:coreblas_ztsmqr, ComplexF64), + (:coreblas_stsmqr, Float32)) + @eval begin + function coreblas_tsmqr!(side::Char, trans::Char, A1::AbstractMatrix{$T}, + A2::AbstractMatrix{$T}, V::AbstractMatrix{$T}, Tau::AbstractMatrix{$T}) + m1, n1 = size(A1) + m2, n2 = size(A2) + ib, nb = size(Tau) + k = nb + + if $T <: Complex + transnum = trans == 'N' ? 111 : 113 + else + transnum = trans == 'N' ? 111 : 112 + end + + sidenum = side == 'L' ? 141 : 142 + + lda1 = max(1, stride(A1,2)) + lda2 = max(1, stride(A2,2)) + ldv = max(1, stride(V,2)) + ldt = max(1, stride(Tau,2)) + ldwork = side == 'L' ? ib : m1 + work = Vector{$T}(undef, ib*nb) + + + err = ccall(($(QuoteNode(getsmqr)), :libcoreblas), Int64, + (Int64, Int64, Int64, Int64, + Int64, Int64, Int64, Int64, + Ptr{$T}, Int64, Ptr{$T}, Int64, + Ptr{$T}, Int64, Ptr{$T}, Int64, Ptr{$T}, Int64), + sidenum, transnum, + m1, n1, + m2, n2, + k, ib, + A1, lda1, + A2, lda2, + V, ldv, + Tau, ldt, + work, ldwork) + if err != 0 + throw(ArgumentError("coreblas_tsmqr failed. Error number: $err")) + end + end + end +end diff --git a/src/array/coreblas/coreblas_tsqrt.jl b/src/array/coreblas/coreblas_tsqrt.jl new file mode 100644 index 000000000..5b3fb4ea7 --- /dev/null +++ b/src/array/coreblas/coreblas_tsqrt.jl @@ -0,0 +1,35 @@ + +for (getsqrt,T) in + ((:coreblas_dtsqrt, Float64), + (:coreblas_stsqrt, Float32), + (:coreblas_ctsqrt, ComplexF32), + (:coreblas_ztsqrt, ComplexF64)) + @eval begin + function coreblas_tsqrt!(A1::AbstractMatrix{$T}, A2::AbstractMatrix{$T}, + Tau::AbstractMatrix{$T}) + m = size(A2)[1] + n = size(A1)[2] + ib, nb = size(Tau) + lda1 = max(1, stride(A1,2)) + lda2 = max(1, stride(A2,2)) + ldt = max(1, stride(Tau,2)) + work = Vector{$T}(undef, (ib)*n) + ttau = Vector{$T}(undef, n) + + err = ccall(($(QuoteNode(getsqrt)), :libcoreblas), Int64, + (Int64, Int64, Int64, + Ptr{$T}, Int64, Ptr{$T}, Int64, + Ptr{$T}, Int64, Ptr{$T}, Ptr{$T}), + m, n, ib, + A1, lda1, + A2, lda2, + Tau, ldt, + ttau, work) + if err != 0 + throw(ArgumentError("coreblas_tsqrt failed. Error number: $err")) + end + end + end +end + + diff --git a/src/array/coreblas/coreblas_ttmqr.jl b/src/array/coreblas/coreblas_ttmqr.jl new file mode 100644 index 000000000..6cd9002b5 --- /dev/null +++ b/src/array/coreblas/coreblas_ttmqr.jl @@ -0,0 +1,50 @@ +for (gettmqr, T) in + ((:coreblas_dttmqr, Float64), + (:coreblas_sttmqr, Float32), + (:coreblas_cttmqr, ComplexF32), + (:coreblas_zttmqr, ComplexF64)) + @eval begin + function coreblas_ttmqr!(side::Char, trans::Char, A1::AbstractMatrix{$T}, + A2::AbstractMatrix{$T}, V::AbstractMatrix{$T}, Tau::AbstractMatrix{$T}) + m1, n1 = size(A1) + m2, n2 = size(A2) + ib, nb = size(Tau) + k=nb + if $T <: Complex + transnum = trans == 'N' ? 111 : 113 + else + transnum = trans == 'N' ? 111 : 112 + end + + sidenum = side == 'L' ? 141 : 142 + + ldv = max(1, stride(V,2)) + ldt = max(1, stride(Tau,2)) + lda1 = max(1, stride(A1,2)) + lda2 = max(1, stride(A2,2)) + ldwork = side == 'L' ? max(1,ib) : max(1,m1) + workdim = side == 'L' ? n1 : ib + work = Vector{$T}(undef, ldwork*workdim) + + err = ccall(($(QuoteNode(gettmqr)), :libcoreblas), Int64, + (Int64, Int64, Int64, Int64, + Int64, Int64, Int64, Int64, + Ptr{$T}, Int64, Ptr{$T}, Int64, + Ptr{$T}, Int64, Ptr{$T}, Int64, + Ptr{$T}, Int64), + sidenum, transnum, + m1, n1, + m2, n2, + k, ib, + A1, lda1, + A2, lda2, + V, ldv, + Tau, ldt, + work, ldwork) + if err != 0 + throw(ArgumentError("coreblas_ttmqr, failed. Error number: $err")) + end + end + end +end + diff --git a/src/array/coreblas/coreblas_ttqrt.jl b/src/array/coreblas/coreblas_ttqrt.jl new file mode 100644 index 000000000..19465efbb --- /dev/null +++ b/src/array/coreblas/coreblas_ttqrt.jl @@ -0,0 +1,32 @@ + +for (gettqrt, T) in + ((:coreblas_dttqrt, Float64), + (:coreblas_sttqrt, Float32), + (:coreblas_cttqrt, ComplexF32), + (:coreblas_zttqrt, ComplexF64)) + @eval begin + function coreblas_ttqrt!(A1::AbstractMatrix{$T}, + A2::AbstractMatrix{$T}, triT::AbstractMatrix{$T}) + m1, n1 = size(A1) + m2, n2 = size(A2) + ib, nb = size(triT) + + lwork = nb + ib*nb + tau = Vector{$T}(undef, nb) + work = Vector{$T}(undef, (ib+1)*nb) + lda1 = max(1, stride(A1, 2)) + lda2 = max(1, stride(A2, 2)) + ldt = max(1, stride(triT, 2)) + + + err = ccall(($(QuoteNode(gettqrt)), :libcoreblas), Int64, + (Int64, Int64, Int64, Ptr{$T}, Int64, Ptr{$T}, Int64, + Ptr{$T}, Int64, Ptr{$T}, Ptr{$T}), + m1, n1, ib, A1, lda1, A2, lda2, triT, ldt, tau, work) + + if err != 0 + throw(ArgumentError("coreblas_ttqrt failed. Error number: $err")) + end + end + end +end diff --git a/src/array/darray.jl b/src/array/darray.jl index d03065063..36a673c4b 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -65,6 +65,10 @@ ArrayDomain((1:15), (1:80)) alignfirst(a::ArrayDomain) = ArrayDomain(map(r->1:length(r), indexes(a))) +alignfirst(a::CartesianIndices{N}) where N = + ArrayDomain(map(r->1:length(r), a.indices)) + + function size(a::ArrayDomain, dim) idxs = indexes(a) length(idxs) < dim ? 1 : length(idxs[dim]) @@ -306,16 +310,12 @@ function Base.isequal(x::ArrayOp, y::ArrayOp) x === y end -function Base.similar(x::DArray{T,N}) where {T,N} - alloc(idx, sz) = Array{T,N}(undef, sz) - thunks = [Dagger.@spawn alloc(i, size(x)) for (i, x) in enumerate(x.subdomains)] - return DArray(T, x.domain, x.subdomains, thunks, x.partitioning, x.concat) -end - +struct AllocateUndef{S} end +(::AllocateUndef{S})(T, dims::Dims{N}) where {S,N} = Array{S,N}(undef, dims) function Base.similar(A::DArray{T,N} where T, ::Type{S}, dims::Dims{N}) where {S,N} d = ArrayDomain(map(x->1:x, dims)) p = A.partitioning - a = AllocateArray(S, (_, _, x...) -> Array{S,N}(undef, x...), d, partition(p, d), p) + a = AllocateArray(S, AllocateUndef{S}(), false, d, partition(p, d), p) return _to_darray(a) end @@ -331,11 +331,11 @@ Base.:(/)(x::DArray{T,N,B,F}, y::U) where {T<:Real,U<:Real,N,B,F} = A `view` of a `DArray` chunk returns a `DArray` of `Thunk`s. """ -function Base.view(c::DArray, d) +#=function Base.view(c::DArray, d) subchunks, subdomains = lookup_parts(c, chunks(c), domainchunks(c), d) d1 = alignfirst(d) DArray(eltype(c), d1, subdomains, subchunks, c.partitioning, c.concat) -end +end=# function group_indices(cumlength, idxs,at=1, acc=Any[]) at > length(idxs) && return acc @@ -370,7 +370,7 @@ function group_indices(cumlength, idxs::AbstractRange) end _cumsum(x::AbstractArray) = length(x) == 0 ? Int[] : cumsum(x) -function lookup_parts(A::DArray, ps::AbstractArray, subdmns::DomainBlocks{N}, d::ArrayDomain{N}) where N +function lookup_parts(A::DArray, ps::AbstractArray, subdmns::DomainBlocks{N}, d::ArrayDomain{N}; slice::Bool=false) where N groups = map(group_indices, subdmns.cumlength, indexes(d)) sz = map(length, groups) pieces = Array{Any}(undef, sz) @@ -378,21 +378,31 @@ function lookup_parts(A::DArray, ps::AbstractArray, subdmns::DomainBlocks{N}, d: idx_and_dmn = map(getindex, groups, i.I) idx = map(x->x[1], idx_and_dmn) dmn = ArrayDomain(map(x->x[2], idx_and_dmn)) - pieces[i] = Dagger.@spawn getindex(ps[idx...], project(subdmns[idx...], dmn)) + if slice + pieces[i] = Dagger.@spawn getindex(ps[idx...], project(subdmns[idx...], dmn)) + else + pieces[i] = Dagger.@spawn view(ps[idx...], project(subdmns[idx...], dmn).indexes...) + end end out_cumlength = map(g->_cumsum(map(x->length(x[2]), g)), groups) out_dmn = DomainBlocks(ntuple(x->1,Val(N)), out_cumlength) return pieces, out_dmn end -function lookup_parts(A::DArray, ps::AbstractArray, subdmns::DomainBlocks{N}, d::ArrayDomain{S}) where {N,S} +function lookup_parts(A::DArray, ps::AbstractArray, subdmns::DomainBlocks{N}, d::ArrayDomain{S}; slice::Bool=false) where {N,S} if S != 1 throw(BoundsError(A, d.indexes)) end inds = CartesianIndices(A)[d.indexes...] new_d = ntuple(i->first(inds).I[i]:last(inds).I[i], N) - return lookup_parts(A, ps, subdmns, ArrayDomain(new_d)) + return lookup_parts(A, ps, subdmns, ArrayDomain(new_d); slice) +end + +function lookup_parts(A::DArray, ps::AbstractArray, subdmns::DomainBlocks{N}, d::CartesianIndices; slice::Bool=false) where N + return lookup_parts(A, ps, subdmns, ArrayDomain(d.indices); slice) end + + """ Base.fetch(c::DArray) diff --git a/src/array/indexing.jl b/src/array/indexing.jl index 82f44fbff..505e04f87 100644 --- a/src/array/indexing.jl +++ b/src/array/indexing.jl @@ -10,6 +10,16 @@ end GetIndex(input::ArrayOp, idx::Tuple) = GetIndex{eltype(input), ndims(input)}(input, idx) +function flatten(subdomains, subchunks, partitioning) + valdim = findfirst(j -> j != 1:1, subdomains[1].indexes) + flatc = [] + flats = Array{ArrayDomain{1, Tuple{UnitRange{Int64}}}}(undef, 0) + map(x -> push!(flats, ArrayDomain(x.indexes[valdim])), subdomains) + map(x -> push!(flatc, x), subchunks) + newb = Blocks(partitioning.blocksize[valdim]) + return flats, flatc, newb +end + function stage(ctx::Context, gidx::GetIndex) inp = stage(ctx, gidx.input) @@ -21,7 +31,14 @@ function stage(ctx::Context, gidx::GetIndex) end for i in 1:length(gidx.idx)] # Figure out output dimension - view(inp, ArrayDomain(idxs)) + d = ArrayDomain(idxs) + subchunks, subdomains = Dagger.lookup_parts(inp, chunks(inp), domainchunks(inp), d; slice = true) + d1 = alignfirst(d) + newb = inp.partitioning + if ndims(d1) != ndims(subdomains) + subdomains, subchunks, newb = flatten(subdomains, subchunks, inp.partitioning) + end + DArray(eltype(inp), d1, subdomains, subchunks, newb) end function size(x::GetIndex) diff --git a/src/array/mul.jl b/src/array/mul.jl index 4cdc7a525..44e5ab948 100644 --- a/src/array/mul.jl +++ b/src/array/mul.jl @@ -109,8 +109,8 @@ function gemm_dagger!( Bmt, Bnt = size(Bc) Cmt, Cnt = size(Cc) - alpha = _add.alpha - beta = _add.beta + alpha = T(_add.alpha) + beta = T(_add.beta) if Ant != Bmt throw(DimensionMismatch(lazy"A has number of blocks ($Amt,$Ant) but B has number of blocks ($Bmt,$Bnt)")) @@ -212,8 +212,8 @@ function syrk_dagger!( Amt, Ant = size(Ac) Cmt, Cnt = size(Cc) - alpha = _add.alpha - beta = _add.beta + alpha = T(_add.alpha) + beta = T(_add.beta) uplo = 'U' if Ant != Cmt @@ -233,7 +233,7 @@ function syrk_dagger!( Dagger.@spawn BLAS.herk!( uplo, trans, - alpha, + real(alpha), In(Ac[n, k]), mzone, InOut(Cc[n, n]), diff --git a/src/array/qr.jl b/src/array/qr.jl new file mode 100644 index 000000000..249b9b560 --- /dev/null +++ b/src/array/qr.jl @@ -0,0 +1,236 @@ +export geqrf!, porgqr!, pormqr!, cageqrf! +import LinearAlgebra: QRCompactWY, AdjointQ, BlasFloat, QRCompactWYQ, AbstractQ, StridedVecOrMat, I +import Base.:* +include("coreblas/coreblas_ormqr.jl") +include("coreblas/coreblas_ttqrt.jl") +include("coreblas/coreblas_ttmqr.jl") +include("coreblas/coreblas_geqrt.jl") +include("coreblas/coreblas_tsqrt.jl") +include("coreblas/coreblas_tsmqr.jl") + +(*)(Q::QRCompactWYQ{T, M}, b::Number) where {T<:Number, M<:DMatrix{T}} = DMatrix(Q) * b +(*)(b::Number, Q::QRCompactWYQ{T, M}) where {T<:Number, M<:DMatrix{T}} = DMatrix(Q) * b + +(*)(Q::AdjointQ{T, QRCompactWYQ{T, M, C}}, b::Number) where {T<:Number, M<:DMatrix{T}, C<:LowerTrapezoidal{T, M}} = DMatrix(Q) * b +(*)(b::Number, Q::AdjointQ{T, QRCompactWYQ{T, M, C}}) where {T<:Number, M<:DMatrix{T}, C<:LowerTrapezoidal{T, M}} = DMatrix(Q) * b + +LinearAlgebra.lmul!(B::QRCompactWYQ{T, M}, A::M) where {T, M<:DMatrix{T}} = pormqr!('L', 'N', B.factors, B.T, A) +function LinearAlgebra.lmul!(B::AdjointQ{T, <:QRCompactWYQ{T, M}}, A::M) where {T, M<:Dagger.DMatrix{T}} + trans = T <: Complex ? 'C' : 'T' + pormqr!('L', trans, B.Q.factors, B.Q.T, A) +end + +LinearAlgebra.rmul!(A::Dagger.DMatrix{T}, B::QRCompactWYQ{T, M}) where {T, M<:Dagger.DMatrix{T}} = pormqr!('R', 'N', B.factors, B.T, A) +function LinearAlgebra.rmul!(A::Dagger.DArray{T,2}, B::AdjointQ{T, <:QRCompactWYQ{T, M}}) where {T, M<:Dagger.DMatrix{T}} + trans = T <: Complex ? 'C' : 'T' + pormqr!('R', trans, B.Q.factors, B.Q.T, A) +end + +function Dagger.DMatrix(Q::QRCompactWYQ{T, <:Dagger.DArray{T, 2}}) where {T} + DQ = distribute(Matrix(I*one(T), size(Q.factors)[1], size(Q.factors)[1]), Q.factors.partitioning) + porgqr!('N', Q.factors, Q.T, DQ) + return DQ +end + +function Dagger.DMatrix(AQ::AdjointQ{T, <:QRCompactWYQ{T, <:Dagger.DArray{T, 2}}}) where {T} + DQ = distribute(Matrix(I*one(T), size(AQ.Q.factors)[1], size(AQ.Q.factors)[1]), AQ.Q.factors.partitioning) + trans = T <: Complex ? 'C' : 'T' + porgqr!(trans, AQ.Q.factors, AQ.Q.T, DQ) + return DQ +end + +Base.collect(Q::QRCompactWYQ{T, <:Dagger.DArray{T, 2}}) where {T} = collect(Dagger.DMatrix(Q)) +Base.collect(AQ::AdjointQ{T, <:QRCompactWYQ{T, <:Dagger.DArray{T, 2}}}) where {T} = collect(Dagger.DMatrix(AQ)) + +function pormqr!(side::Char, trans::Char, A::Dagger.DArray{T, 2}, Tm::LowerTrapezoidal{T, <:Dagger.DMatrix{T}}, C::Dagger.DArray{T, 2}) where {T<:Number} + m, n = size(C) + Ac = A.chunks + Tc = Tm.data.chunks + Cc = C.chunks + + Amt, Ant = size(Ac) + Tmt, Tnt = size(Tc) + Cmt, Cnt = size(Cc) + minMT = min(Amt, Ant) + + Dagger.spawn_datadeps() do + if side == 'L' + if (trans == 'T' || trans == 'C') + for k in 1:minMT + for n in 1:Cnt + Dagger.@spawn coreblas_ormqr!(side, trans, In(Ac[k, k]), In(Tc[k, k]), InOut(Cc[k,n])) + end + for m in k+1:Cmt, n in 1:Cnt + Dagger.@spawn coreblas_tsmqr!(side, trans, InOut(Cc[k, n]), InOut(Cc[m, n]), In(Ac[m, k]), In(Tc[m, k])) + end + end + end + if trans == 'N' + for k in minMT:-1:1 + for m in Cmt:-1:k+1, n in 1:Cnt + Dagger.@spawn coreblas_tsmqr!(side, trans, InOut(Cc[k, n]), InOut(Cc[m, n]), In(Ac[m, k]), In(Tc[m, k])) + end + for n in 1:Cnt + Dagger.@spawn coreblas_ormqr!(side, trans, In(Ac[k, k]), In(Tc[k, k]), InOut(Cc[k, n])) + end + end + end + else + if side == 'R' + if trans == 'T' || trans == 'C' + for k in minMT:-1:1 + for n in Cmt:-1:k+1, m in 1:Cmt + Dagger.@spawn coreblas_tsmqr!(side, trans, InOut(Cc[m, k]), InOut(Cc[m, n]), In(Ac[n, k]), In(Tc[n, k])) + end + for m in 1:Cmt + Dagger.@spawn coreblas_ormqr!(side, trans, In(Ac[k, k]), In(Tc[k, k]), InOut(Cc[m, k])) + end + end + end + if trans == 'N' + for k in 1:minMT + for m in 1:Cmt + Dagger.@spawn coreblas_ormqr!(side, trans, In(Ac[k, k]), In(Tc[k, k]), InOut(Cc[m, k])) + end + for n in k+1:Cmt, m in 1:Cmt + Dagger.@spawn coreblas_tsmqr!(side, trans, InOut(Cc[m, k]), InOut(Cc[m, n]), In(Ac[n, k]), In(Tc[n, k])) + end + end + end + end + end + end + return C +end + +function cageqrf!(A::Dagger.DArray{T, 2}, Tm::LowerTrapezoidal{T, <:Dagger.DMatrix{T}}; static::Bool=true, traversal::Symbol=:inorder, p::Int64=1) where {T<: Number} + if p == 1 + return geqrf!(A, Tm; static, traversal) + end + Ac = A.chunks + mt, nt = size(Ac) + @assert mt % p == 0 "Number of tiles must be divisible by the number of domains" + mtd = Int64(mt/p) + Tc = Tm.data.chunks + proot = 1 + nxtmt = mtd + trans = T <: Complex ? 'C' : 'T' + Dagger.spawn_datadeps(;static, traversal) do + for k in 1:min(mt, nt) + if k > nxtmt + proot += 1 + nxtmt += mtd + end + for pt in proot:p + ibeg = 1 + (pt-1) * mtd + if pt == proot + ibeg = k + end + Dagger.@spawn coreblas_geqrt!(InOut(Ac[ibeg, k]), Out(Tc[ibeg,k])) + for n in k+1:nt + Dagger.@spawn coreblas_ormqr!('L', trans, Deps(Ac[ibeg, k], In(LowerTriangular)), In(Tc[ibeg,k]), InOut(Ac[ibeg, n])) + end + for m in ibeg+1:(pt * mtd) + Dagger.@spawn coreblas_tsqrt!(Deps(Ac[ibeg, k], InOut(UpperTriangular)), InOut(Ac[m, k]), Out(Tc[m,k])) + for n in k+1:nt + Dagger.@spawn coreblas_tsmqr!('L', trans, InOut(Ac[ibeg, n]), InOut(Ac[m, n]), In(Ac[m, k]), In(Tc[m,k])) + end + end + end + for m in 1:ceil(Int64, log2(p-proot+1)) + p1 = proot + p2 = p1 + 2^(m-1) + while p2 ≤ p + i1 = 1 + (p1-1) * mtd + i2 = 1 + (p2-1) * mtd + if p1 == proot + i1 = k + end + Dagger.@spawn coreblas_ttqrt!(Deps(Ac[i1, k], InOut(UpperTriangular)), Deps(Ac[i2, k], InOut(UpperTriangular)), Out(Tc[i2, k])) + for n in k+1:nt + Dagger.@spawn coreblas_ttmqr!('L', trans, InOut(Ac[i1, n]), InOut(Ac[i2, n]), Deps(Ac[i2, k], In(UpperTriangular)), In(Tc[i2, k])) + end + p1 += 2^m + p2 += 2^m + end + end + end + end +end + +function geqrf!(A::Dagger.DArray{T, 2}, Tm::LowerTrapezoidal{T, <:Dagger.DMatrix{T}}; static::Bool=true, traversal::Symbol=:inorder) where {T<: Number} + Ac = A.chunks + mt, nt = size(Ac) + Tc = Tm.data.chunks + trans = T <: Complex ? 'C' : 'T' + + Dagger.spawn_datadeps(;static, traversal) do + for k in 1:min(mt, nt) + Dagger.@spawn coreblas_geqrt!(InOut(Ac[k, k]), Out(Tc[k,k])) + for n in k+1:nt + Dagger.@spawn coreblas_ormqr!('L', trans, Deps(Ac[k,k], In(LowerTriangular)), In(Tc[k,k]), InOut(Ac[k, n])) + end + for m in k+1:mt + Dagger.@spawn coreblas_tsqrt!(Deps(Ac[k, k], InOut(UpperTriangular)), InOut(Ac[m, k]), Out(Tc[m,k])) + for n in k+1:nt + Dagger.@spawn coreblas_tsmqr!('L', trans, InOut(Ac[k, n]), InOut(Ac[m, n]), In(Ac[m, k]), In(Tc[m,k])) + end + end + end + end +end + +function porgqr!(trans::Char, A::Dagger.DArray{T, 2}, Tm::LowerTrapezoidal{T, <:Dagger.DMatrix{T}}, Q::Dagger.DArray{T, 2}; static::Bool=true, traversal::Symbol=:inorder) where {T<:Number} + Ac = A.chunks + Tc = Tm.data.chunks + Qc = Q.chunks + mt, nt = size(Ac) + qmt, qnt = size(Qc) + + Dagger.spawn_datadeps(;static, traversal) do + if trans == 'N' + for k in min(mt, nt):-1:1 + for m in qmt:-1:k + 1, n in k:qnt + Dagger.@spawn coreblas_tsmqr!('L', trans, InOut(Qc[k, n]), InOut(Qc[m, n]), In(Ac[m, k]), In(Tc[m, k])) + end + for n in k:qnt + Dagger.@spawn coreblas_ormqr!('L', trans, In(Ac[k, k]), + In(Tc[k, k]), InOut(Qc[k, n])) + end + end + else + for k in 1:min(mt, nt) + for n in 1:k + Dagger.@spawn coreblas_ormqr!('L', trans, In(Ac[k, k]), + In(Tc[k, k]), InOut(Qc[k, n])) + end + for m in k+1:qmt, n in 1:qnt + Dagger.@spawn coreblas_tsmqr!('L', trans, InOut(Qc[k, n]), InOut(Qc[m, n]), In(Ac[m, k]), In(Tc[m, k])) + end + end + end + end +end + +function meas_ws(A::Dagger.DArray{T, 2}, ib::Int64) where {T<: Number} + mb, nb = A.partitioning.blocksize + m, n = size(A) + MT = (mod(m,nb)==0) ? floor(Int64, (m / mb)) : floor(Int64, (m / mb) + 1) + NT = (mod(n,nb)==0) ? floor(Int64,(n / nb)) : floor(Int64, (n / nb) + 1) * 2 + lm = ib * MT; + ln = nb * NT; + lm, ln +end + +function LinearAlgebra.qr!(A::Dagger.DArray{T, 2}; ib::Int64=1, p::Int64=1) where {T<:Number} + lm, ln = meas_ws(A, ib) + Ac = A.chunks + nb = A.partitioning.blocksize[2] + mt, nt = size(Ac) + st = nb * (nt - 1) + Tm = LowerTrapezoidal(zeros, Blocks(ib, nb), T, st, lm, ln) + geqrf!(A, Tm) + return QRCompactWY(A, Tm); +end + + diff --git a/src/array/trapezoidal.jl b/src/array/trapezoidal.jl new file mode 100644 index 000000000..3497fa26a --- /dev/null +++ b/src/array/trapezoidal.jl @@ -0,0 +1,142 @@ +export LowerTrapezoidal, UnitLowerTrapezoidal, UpperTrapezoidal, UnitUpperTrapezoidal, trau!, trau, tral!, tral +import LinearAlgebra: triu!, tril!, triu, tril +abstract type AbstractTrapezoidal{T} <: AbstractMatrix{T} end + +# First loop through all methods that don't need special care for upper/lower and unit diagonal +for t in (:LowerTrapezoidal, :UnitLowerTrapezoidal, :UpperTrapezoidal, :UnitUpperTrapezoidal) + @eval begin + struct $t{T,S<:AbstractMatrix{T}} <: AbstractTrapezoidal{T} + data::S + + function $t{T,S}(data) where {T,S<:AbstractMatrix{T}} + Base.require_one_based_indexing(data) + new{T,S}(data) + end + end + $t(A::$t) = A + $t{T}(A::$t{T}) where {T} = A + $t(A::AbstractMatrix) = $t{eltype(A), typeof(A)}(A) + $t{T}(A::AbstractMatrix) where {T} = $t(convert(AbstractMatrix{T}, A)) + $t{T}(A::$t) where {T} = $t(convert(AbstractMatrix{T}, A.data)) + + AbstractMatrix{T}(A::$t) where {T} = $t{T}(A) + AbstractMatrix{T}(A::$t{T}) where {T} = copy(A) + + Base.size(A::$t) = size(A.data) + Base.axes(A::$t) = axes(A.data) + + Base.similar(A::$t, ::Type{T}) where {T} = $t(similar(parent(A), T)) + Base.similar(A::$t, ::Type{T}, dims::Dims{N}) where {T,N} = similar(parent(A), T, dims) + Base.parent(A::$t) = A.data + + Base.copy(A::$t) = $t(copy(A.data)) + + Base.real(A::$t{<:Real}) = A + Base.real(A::$t{<:Complex}) = (B = real(A.data); $t(B)) + end +end + +Base.getindex(A::UnitLowerTrapezoidal{T}, i::Integer, j::Integer) where {T} = + i > j ? ifelse(A.data[i,j] == nothing, zero(eltype(A.data)), A.data[i,j]) : ifelse(i == j, oneunit(T), zero(T)) +Base.getindex(A::LowerTrapezoidal, i::Integer, j::Integer) = +i >= j ? ifelse(A.data[i,j] == nothing, zero(eltype(A.data)), A.data[i,j]) : zero(eltype(A.data)) +Base.getindex(A::UnitUpperTrapezoidal{T}, i::Integer, j::Integer) where {T} = + i < j ? ifelse(A.data[i,j] == nothing, zero(eltype(A.data)), A.data[i,j]) : ifelse(i == j, oneunit(T), zero(T)) +Base.getindex(A::UpperTrapezoidal, i::Integer, j::Integer) = +i <= j ? ifelse(A.data[i,j] == nothing, zero(eltype(A.data)), A.data[i,j]) : zero(eltype(A.data)) + +function _DiagBuild(blockdom::Tuple, alloc::AbstractMatrix{T}, diag::Vector{Tuple{Int,Int}}, transform::Function) where {T} + diagind = findfirst(x-> x[1] in blockdom[1] && x[2] in blockdom[2], diag) + blockind = (diag[diagind][1] - first(blockdom[1]) + 1, diag[diagind][2] - first(blockdom[2]) + 1) + return Dagger.@spawn transform(alloc, blockind[2] - blockind[1]) +end + +function _GenericTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, eltype::Type, wrap, k::Integer, dims) + d = ArrayDomain(map(x->1:x, dims)) + dc = partition(p, d) + if f isa UndefInitializer + f = (eltype, x...) -> Array{eltype}(undef, x...) + end + m, n = dims + if k < 0 + diag = [(i-k, i) for i in 1:min(m, n)] + else + diag = [(i, i+k) for i in 1:min(m, n)] + end + thunks = [] + alloc(sz) = f(eltype, sz) + transform = (wrap == LowerTrapezoidal) ? tril! : triu! + compar = (wrap == LowerTrapezoidal) ? (>) : (<) + for c in dc + sz = size(c) + if any(x -> x[1] in c.indexes[1] && x[2] in c.indexes[2], diag) + push!(thunks, _DiagBuild(c.indexes, alloc(sz), diag, transform)) + else + mt, nt = k<0 ? (first(c.indexes[1]), first(c.indexes[2])-k) : (first(c.indexes[1])+k, first(c.indexes[2])) + if compar(mt, nt) + push!(thunks, Dagger.@spawn alloc(sz)) + else + push!(thunks, Dagger.@spawn zeros(eltype, sz)) + end + end + end + thunks = reshape(thunks, size(dc)) + return wrap(Dagger.DArray(eltype, d, dc, thunks, p)) +end + +LowerTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, dims::Integer...) = _GenericTrapezoidal(f, p, eltype, LowerTrapezoidal, 0, dims) +LowerTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, dims::Tuple) = _GenericTrapezoidal(f, p, eltype, LowerTrapezoidal, 0, dims) +LowerTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, eltype::Type, dims::Integer...) = _GenericTrapezoidal(f, p, eltype, LowerTrapezoidal, 0, dims) +LowerTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, eltype::Type, dims::Tuple) = _GenericTrapezoidal(f, p, eltype, LowerTrapezoidal, 0, dims) +LowerTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, k::Integer, dims::Integer...) = _GenericTrapezoidal(f, p, eltype, LowerTrapezoidal, k, dims) +LowerTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, k::Integer, dims::Tuple) = _GenericTrapezoidal(f, p, eltype, LowerTrapezoidal, k, dims) +LowerTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, eltype::Type, k::Integer, dims::Integer...) = _GenericTrapezoidal(f, p, eltype, LowerTrapezoidal, k, dims) +LowerTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, eltype::Type, k::Integer, dims::Tuple) = _GenericTrapezoidal(f, p, eltype, LowerTrapezoidal, k, dims) + +UpperTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, dims::Integer...) = _GenericTrapezoidal(f, p, eltype, UpperTrapezoidal, 0, dims) +UpperTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, dims::Tuple) = _GenericTrapezoidal(f, p, eltype, UpperTrapezoidal, 0, dims) +UpperTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, eltype::Type, dims::Integer...) = _GenericTrapezoidal(f, p, eltype, UpperTrapezoidal, 0, dims) +UpperTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, eltype::Type, dims::Tuple) = _GenericTrapezoidal(f, p, eltype, UpperTrapezoidal, 0, dims) +UpperTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, k::Integer, dims::Integer...) = _GenericTrapezoidal(f, p, eltype, UpperTrapezoidal, k, dims) +UpperTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, k::Integer, dims::Tuple) = _GenericTrapezoidal(f, p, eltype, UpperTrapezoidal, k, dims) +UpperTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, eltype::Type, k::Integer, dims::Integer...) = _GenericTrapezoidal(f, p, eltype, UpperTrapezoidal, k, dims) +UpperTrapezoidal(f::Union{Function, UndefInitializer}, p::Blocks{2}, eltype::Type, k::Integer, dims::Tuple) = _GenericTrapezoidal(f, p, eltype, UpperTrapezoidal, k, dims) + +function _GenericTra!(A::Dagger.DArray{T, 2}, wrap::Function, k::Integer) where {T} + d = A.domain + dc = A.subdomains + Ac = A.chunks + m, n = size(A) + if k < 0 + diag = [(i-k, i) for i in 1:min(m, n)] + else + diag = [(i, i+k) for i in 1:min(m, n)] + end + compar = (wrap == tril!) ? (≤) : (≥) + for ind in CartesianIndices(dc) + sz = size(dc[ind]) + if any(x -> x[1] in dc[ind].indexes[1] && x[2] in dc[ind].indexes[2], diag) + Ac[ind] = _DiagBuild(dc[ind].indexes, fetch(Ac[ind]), diag, wrap) + else + mt, nt = k<0 ? (first(dc[ind].indexes[1]), first(dc[ind].indexes[2])-k) : (first(dc[ind].indexes[1])+k, first(dc[ind].indexes[2])) + if compar(mt, nt) + Ac[ind] = Dagger.@spawn zeros(T, sz) + end + end + end + return A +end + + + +trau!(A::Dagger.DArray{T,2}, k::Integer) where {T} = _GenericTra!(A, triu!, k) +trau!(A::Dagger.DArray{T,2}) where {T} = _GenericTra!(A, triu!, 0) +trau(A::Dagger.DArray{T,2}) where {T} = trau!(copy(A)) +trau(A::Dagger.DArray{T,2}, k::Integer) where {T} = trau!(copy(A), k) + +tral!(A::Dagger.DArray{T,2}, k::Integer) where {T} = _GenericTra!(A, tril!, k) +tral!(A::Dagger.DArray{T,2}) where {T} = _GenericTra!(A, tril!, 0) +tral(A::Dagger.DArray{T,2}) where {T} = tral!(copy(A)) +tral(A::Dagger.DArray{T,2}, k::Integer) where {T} = tral!(copy(A), k) + +#TODO: map, reduce, sum, mean, prod, reducedim, collect, distribute diff --git a/src/array/triangular.jl b/src/array/triangular.jl new file mode 100644 index 000000000..47613df2d --- /dev/null +++ b/src/array/triangular.jl @@ -0,0 +1,83 @@ +export LowerTriangular, UpperTriangular, tril, triu, tril!, triu! + +function _GenericTriangular(f::Union{Function, UndefInitializer}, p::Blocks{2}, eltype::Type, compar::Function, wrap, dims) + @assert dims[1] == dims[2] "matrix is not square: dimensions are $dims (try using trapezoidal)" + d = ArrayDomain(map(x->1:x, dims)) + dc = partition(p, d) + if f isa UndefInitializer + f = (eltype, x...) -> Array{eltype}(undef, x...) + end + diag = [(i,i) for i in 1:min(size(d)...)] + thunks = [] + alloc(sz) = f(eltype, sz) + transform = (wrap == LowerTrapezoidal) ? tril! : triu! + compar = (wrap == LowerTrapezoidal) ? (>) : (<) + for c in dc + sz = size(c) + if any(x -> x[1] in c.indexes[1] && x[2] in c.indexes[2], diag) + push!(thunks, _DiagBuild(c.indexes, alloc(sz), diag, transform)) + else + if compar(first(c.indexes[1]), first(c.indexes[2])) + push!(thunks, Dagger.@spawn alloc(sz)) + else + push!(thunks, Dagger.@spawn zeros(eltype, sz)) + end + end + end + thunks = reshape(thunks, size(dc)) + return wrap(Dagger.DArray(eltype, d, dc, thunks, p)) +end + +LinearAlgebra.LowerTriangular(f::Union{Function, UndefInitializer}, p::Blocks{2}, dims::Integer...) = _GenericTriangular(f, p, eltype, LowerTriangular, dims) +LinearAlgebra.LowerTriangular(f::Union{Function, UndefInitializer}, p::Blocks{2}, dims::Tuple) = _GenericTriangular(f, p, eltype, LowerTriangular, dims) +LinearAlgebra.LowerTriangular(f::Union{Function, UndefInitializer}, p::Blocks{2}, eltype::Type, dims::Integer...) = _GenericTriangular(f, p, eltype, LowerTriangular, dims) +LinearAlgebra.LowerTriangular(f::Union{Function, UndefInitializer}, p::Blocks{2}, eltype::Type, dims::Tuple) = _GenericTriangular(f, p, eltype, LowerTriangular, dims) + +LinearAlgebra.UpperTriangular(f::Union{Function, UndefInitializer}, p::Blocks{2}, dims::Integer...) = _GenericTriangular(f, p, eltype, UpperTriangular, dims) +LinearAlgebra.UpperTriangular(f::Union{Function, UndefInitializer}, p::Blocks{2}, dims::Tuple) = _GenericTriangular(f, p, eltype, UpperTriangular, dims) +LinearAlgebra.UpperTriangular(f::Union{Function, UndefInitializer}, p::Blocks{2}, eltype::Type, dims::Integer...) = _GenericTriangular(f, p, eltype, UpperTriangular, dims) +LinearAlgebra.UpperTriangular(f::Union{Function, UndefInitializer}, p::Blocks{2}, eltype::Type, dims::Tuple) = _GenericTriangular(f, p, eltype, UpperTriangular, dims) + +function _GenericTri!(A::Dagger.DArray{T, 2}, wrap) where {T} + LinearAlgebra.checksquare(A) + d = A.domain + dc = A.subdomains + Ac = A.chunks + diag = [(i,i) for i in 1:min(size(d)...)] + compar = (wrap == tril!) ? (>) : (<) + for ind in CartesianIndices(dc) + sz = size(dc[ind]) + if any(x -> x[1] in dc[ind].indexes[1] && x[2] in dc[ind].indexes[2], diag) + Ac[ind] = _DiagBuild(dc[ind].indexes, fetch(Ac[ind]), diag, wrap) + else + if compar(first(dc[ind].indexes[2]), first(dc[ind].indexes[1])) + Ac[ind] = Dagger.@spawn zeros(T, sz) + end + end + end + return A +end + +function LinearAlgebra.triu!(A::Dagger.DArray{T,2}) where {T} + if size(A, 1) != size(A, 2) + trau!(A) + else + _GenericTri!(A, triu!) + end +end +LinearAlgebra.triu(A::Dagger.DArray{T,2}) where {T} = triu!(copy(A)) + +function LinearAlgebra.tril!(A::Dagger.DArray{T,2}) where {T} + if size(A, 1) != size(A, 2) + tral!(A) + else + _GenericTri!(A, tril!) + end +end +LinearAlgebra.tril(A::Dagger.DArray{T,2}) where {T} = tril!(copy(A)) + +#TODO: matmul + + + + diff --git a/src/memory-spaces.jl b/src/memory-spaces.jl index 1a734850b..e3e90a124 100644 --- a/src/memory-spaces.jl +++ b/src/memory-spaces.jl @@ -136,7 +136,9 @@ aliasing(x::Transpose) = aliasing(parent(x)) aliasing(x::Adjoint) = aliasing(parent(x)) struct StridedAliasing{T,N,S} <: AbstractAliasing + base_ptr::RemotePtr{Cvoid,S} ptr::RemotePtr{Cvoid,S} + base_inds::NTuple{N,UnitRange{Int}} lengths::NTuple{N,Int} strides::NTuple{N,Int} end @@ -161,10 +163,12 @@ function _memory_spans(a::StridedAliasing{T,N,S}, spans, ptr, dim) where {T,N,S} return spans end -function aliasing(x::SubArray{T}) where T +function aliasing(x::SubArray{T,N,A}) where {T,N,A<:Array} if isbitstype(T) S = CPURAMMemorySpace - return StridedAliasing{T,ndims(x),S}(RemotePtr{Cvoid}(pointer(x)), + return StridedAliasing{T,ndims(x),S}(RemotePtr{Cvoid}(pointer(parent(x))), + RemotePtr{Cvoid}(pointer(x)), + parentindices(x), size(x), strides(parent(x))) else # FIXME: Also ContiguousAliasing of container @@ -172,21 +176,21 @@ function aliasing(x::SubArray{T}) where T return UnknownAliasing() end end -#= TODO: Fix and enable strided aliasing optimization function will_alias(x::StridedAliasing{T,N,S}, y::StridedAliasing{T,N,S}) where {T,N,S} - # TODO: Upgrade Contiguous/StridedAlising to same number of dims + if x.base_ptr != y.base_ptr + # FIXME: Conservatively incorrect via `unsafe_wrap` and friends + return false + end + for dim in 1:N - # FIXME: Adjust ptrs to common base - x_span = MemorySpan{S}(x.ptr, sizeof(T)*x.strides[dim]) - y_span = MemorySpan{S}(y.ptr, sizeof(T)*y.strides[dim]) - @show dim x_span y_span - if !will_alias(x_span, y_span) + if ((x.base_inds[dim].stop) < (y.base_inds[dim].start) || (y.base_inds[dim].stop) < (x.base_inds[dim].start)) return false end end + return true end -=# +# FIXME: Upgrade Contiguous/StridedAlising to same number of dims struct TriangularAliasing{T,S} <: AbstractAliasing ptr::RemotePtr{Cvoid,S} diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl index a6b2ce6b4..cfb680bbd 100644 --- a/src/sch/Sch.jl +++ b/src/sch/Sch.jl @@ -405,7 +405,20 @@ function cleanup_proc(state, p, log_sink) delete!(WORKER_MONITOR_CHANS[wid], state.uid) end end - remote_do(_cleanup_proc, wid, state.uid, log_sink) + + # If the worker process is still alive, clean it up + if wid in workers() + try + remotecall_wait(_cleanup_proc, wid, state.uid, log_sink) + catch ex + # We allow ProcessExitedException's, which means that the worker + # shutdown halfway through cleanup. + if !(ex isa ProcessExitedException) + rethrow() + end + end + end + timespan_finish(ctx, :cleanup_proc, (;worker=wid), nothing) end diff --git a/src/sch/dynamic.jl b/src/sch/dynamic.jl index 7c52bf748..df78a1dd1 100644 --- a/src/sch/dynamic.jl +++ b/src/sch/dynamic.jl @@ -33,9 +33,18 @@ function safepoint(state) if state.halt.set # Force dynamic thunks and listeners to terminate for (inp_chan,out_chan) in values(state.worker_chans) - close(inp_chan) - close(out_chan) + # Closing these channels will fail if the worker died, which we + # allow. + try + close(inp_chan) + close(out_chan) + catch ex + if !(ex isa ProcessExitedException) + rethrow() + end + end end + # Throw out of scheduler throw(SchedulerHaltedException()) end diff --git a/src/thunk.jl b/src/thunk.jl index 177239979..017a69a9d 100644 --- a/src/thunk.jl +++ b/src/thunk.jl @@ -306,7 +306,7 @@ generated thunks. macro par(exs...) opts = exs[1:end-1] ex = exs[end] - _par(ex; lazy=true, opts=opts) + return esc(_par(ex; lazy=true, opts=opts)) end """ @@ -348,7 +348,7 @@ also passes along any options in an `Options` struct. For example, macro spawn(exs...) opts = exs[1:end-1] ex = exs[end] - _par(ex; lazy=false, opts=opts) + return esc(_par(ex; lazy=false, opts=opts)) end struct ExpandedBroadcast{F} end @@ -363,26 +363,44 @@ function replace_broadcast(fn::Symbol) end function _par(ex::Expr; lazy=true, recur=true, opts=()) - if ex.head == :call && recur - f = replace_broadcast(ex.args[1]) - if length(ex.args) >= 2 && Meta.isexpr(ex.args[2], :parameters) - args = ex.args[3:end] - kwargs = ex.args[2] - else - args = ex.args[2:end] - kwargs = Expr(:parameters) + f = nothing + body = nothing + arg1 = nothing + if recur && @capture(ex, f_(allargs__)) || @capture(ex, f_(allargs__) do cargs_ body_ end) || @capture(ex, allargs__->body_) || @capture(ex, arg1_[allargs__]) + f = replace_broadcast(f) + if arg1 !== nothing + # Indexing (A[2,3]) + f = Base.getindex + pushfirst!(allargs, arg1) + end + args = filter(arg->!Meta.isexpr(arg, :parameters), allargs) + kwargs = filter(arg->Meta.isexpr(arg, :parameters), allargs) + if !isempty(kwargs) + kwargs = only(kwargs).args + end + if body !== nothing + if f !== nothing + f = quote + ($(args...); $(kwargs...))->$f($(args...); $(kwargs...)) do $cargs + $body + end + end + else + f = quote + ($(args...); $(kwargs...))->begin + $body + end + end + end end - opts = esc.(opts) - args_ex = _par.(args; lazy=lazy, recur=false) - kwargs_ex = _par.(kwargs.args; lazy=lazy, recur=false) if lazy - return :(Dagger.delayed($(esc(f)), $Options(;$(opts...)))($(args_ex...); $(kwargs_ex...))) + return :(Dagger.delayed($f, $Options(;$(opts...)))($(args...); $(kwargs...))) else - sync_var = esc(Base.sync_varname) + sync_var = Base.sync_varname @gensym result return quote - let args = ($(args_ex...),) - $result = $spawn($(esc(f)), $Options(;$(opts...)), args...; $(kwargs_ex...)) + let + $result = $spawn($f, $Options(;$(opts...)), $(args...); $(kwargs...)) if $(Expr(:islocal, sync_var)) put!($sync_var, schedule(Task(()->wait($result)))) end @@ -390,12 +408,17 @@ function _par(ex::Expr; lazy=true, recur=true, opts=()) end end end + elseif lazy + # Recurse into the expression + return Expr(ex.head, _par_inner.(ex.args, lazy=lazy, recur=recur, opts=opts)...) else - return Expr(ex.head, _par.(ex.args, lazy=lazy, recur=recur, opts=opts)...) + throw(ArgumentError("Invalid Dagger task expression: $ex")) end end -_par(ex::Symbol; kwargs...) = esc(ex) -_par(ex; kwargs...) = ex +_par(ex; kwargs...) = throw(ArgumentError("Invalid Dagger task expression: $ex")) + +_par_inner(ex; kwargs...) = ex +_par_inner(ex::Expr; kwargs...) = _par(ex; kwargs...) """ Dagger.spawn(f, args...; kwargs...) -> DTask diff --git a/test/array/linalg/qr.jl b/test/array/linalg/qr.jl new file mode 100644 index 000000000..2a347d164 --- /dev/null +++ b/test/array/linalg/qr.jl @@ -0,0 +1,36 @@ + @testset "Tile QR: $T" for T in (Float32, Float64, ComplexF32, ComplexF64) + ## Square matrices + A = rand(T, 128, 128) + Q, R = qr(A) + DA = distribute(A, Blocks(32,32)) + DQ, DR = qr!(DA) + @test abs.(DQ) ≈ abs.(Q) + @test abs.(DR) ≈ abs.(R) + @test DQ * DR ≈ A + @test DQ' * DQ ≈ I + @test I * abs.(DQ) ≈ abs.(Q) + @test I * abs.(DQ') ≈ abs.(Q') + + ## Rectangular matrices (block and element wise) + # Tall Element and Block + A = rand(T, 128, 64) + Q, R = qr(A) + DA = distribute(A, Blocks(32,32)) + DQ, DR = qr!(DA) + @test abs.(DR) ≈ abs.(R) + @test DQ * DR ≈ A + @test DQ' * DQ ≈ I + @test I * DQ ≈ collect(DQ) + @test I * DQ' ≈ collect(DQ') + + # Wide Element and Block + A = rand(T, 64, 128) + Q, R = qr(A) + DA = distribute(A, Blocks(16,16)) + DQ, DR = qr!(DA) + @test abs.(DR) ≈ abs.(R) + @test DQ * DR ≈ A + @test DQ' * DQ ≈ I + @test I * DQ ≈ collect(DQ) + @test I * DQ' ≈ collect(DQ') +end diff --git a/test/datadeps.jl b/test/datadeps.jl index e4b4c811f..271e2c667 100644 --- a/test/datadeps.jl +++ b/test/datadeps.jl @@ -337,6 +337,59 @@ function test_datadeps(;args_chunks::Bool, test_task_dominators(logs, tid_lower2, [tid_B, tid_lower, tid_unitlower, tid_diag, tid_unitlower2]; all_tids=tids_all, nondom_check=false) test_task_dominators(logs, tid_unitupper2, [tid_B, tid_upper, tid_unitupper]; all_tids=tids_all, nondom_check=false) test_task_dominators(logs, tid_upper2, [tid_B, tid_upper, tid_unitupper, tid_diag, tid_unitupper2]; all_tids=tids_all, nondom_check=false) + + # Additional aliasing tests + views_overlap(x, y) = Dagger.will_alias(Dagger.aliasing(x), Dagger.aliasing(y)) + + A = wrap_chunk_thunk(identity, B) + + A_r1 = wrap_chunk_thunk(view, A, 1:1, 1:4) + A_r2 = wrap_chunk_thunk(view, A, 2:2, 1:4) + B_r1 = wrap_chunk_thunk(view, B, 1:1, 1:4) + B_r2 = wrap_chunk_thunk(view, B, 2:2, 1:4) + + A_c1 = wrap_chunk_thunk(view, A, 1:4, 1:1) + A_c2 = wrap_chunk_thunk(view, A, 1:4, 2:2) + B_c1 = wrap_chunk_thunk(view, B, 1:4, 1:1) + B_c2 = wrap_chunk_thunk(view, B, 1:4, 2:2) + + A_mid = wrap_chunk_thunk(view, A, 2:3, 2:3) + B_mid = wrap_chunk_thunk(view, B, 2:3, 2:3) + + @test views_overlap(A_r1, A_r1) + @test views_overlap(B_r1, B_r1) + @test views_overlap(A_c1, A_c1) + @test views_overlap(B_c1, B_c1) + + @test views_overlap(A_r1, B_r1) + @test views_overlap(A_r2, B_r2) + @test views_overlap(A_c1, B_c1) + @test views_overlap(A_c2, B_c2) + + @test !views_overlap(A_r1, A_r2) + @test !views_overlap(B_r1, B_r2) + @test !views_overlap(A_c1, A_c2) + @test !views_overlap(B_c1, B_c2) + + @test views_overlap(A_r1, A_c1) + @test views_overlap(A_r1, B_c1) + @test views_overlap(A_r2, A_c2) + @test views_overlap(A_r2, B_c2) + + for (name, mid) in ((:A_mid, A_mid), (:B_mid, B_mid)) + @test !views_overlap(A_r1, mid) + @test !views_overlap(B_r1, mid) + @test !views_overlap(A_c1, mid) + @test !views_overlap(B_c1, mid) + + @test views_overlap(A_r2, mid) + @test views_overlap(B_r2, mid) + @test views_overlap(A_c2, mid) + @test views_overlap(B_c2, mid) + end + + @test views_overlap(A_mid, A_mid) + @test views_overlap(A_mid, B_mid) end # FIXME: Deps diff --git a/test/runtests.jl b/test/runtests.jl index 3f4e1b1ca..fbab200e8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -18,6 +18,7 @@ tests = [ ("Array - MapReduce", "array/mapreduce.jl"), ("Array - LinearAlgebra - Matmul", "array/linalg/matmul.jl"), ("Array - LinearAlgebra - Cholesky", "array/linalg/cholesky.jl"), + ("Array - LinearAlgebra - QR", "array/linalg/qr.jl"), ("Caching", "cache.jl"), ("Disk Caching", "diskcaching.jl"), ("File IO", "file-io.jl"), @@ -31,6 +32,7 @@ if PROGRAM_FILE != "" && realpath(PROGRAM_FILE) == @__FILE__ pushfirst!(LOAD_PATH, joinpath(@__DIR__, "..")) using Pkg Pkg.activate(@__DIR__) + Pkg.instantiate() using ArgParse s = ArgParseSettings(description = "Dagger Testsuite") diff --git a/test/thunk.jl b/test/thunk.jl index db92ee340..95763e8b9 100644 --- a/test/thunk.jl +++ b/test/thunk.jl @@ -79,6 +79,70 @@ end @test fetch(@spawn A .+ B) ≈ A .+ B @test fetch(@spawn A .* B) ≈ A .* B end + @testset "inner macro" begin + A = rand(4) + t = @spawn sum(@view A[2:3]) + @test t isa Dagger.DTask + @test fetch(t) ≈ sum(@view A[2:3]) + end + @testset "do block" begin + A = rand(4) + + t = @spawn sum(A) do a + a + 1 + end + @test t isa Dagger.DTask + @test fetch(t) ≈ sum(a->a+1, A) + + t = @spawn sum(A; dims=1) do a + a + 1 + end + @test t isa Dagger.DTask + @test fetch(t) ≈ sum(a->a+1, A; dims=1) + + do_f = f -> f(42) + t = @spawn do_f() do x + x + 1 + end + @test t isa Dagger.DTask + @test fetch(t) == 43 + end + @testset "anonymous direct call" begin + A = rand(4) + + t = @spawn A->sum(A) + @test t isa Dagger.DTask + @test fetch(t) == sum(A) + + t = @spawn A->sum(A; dims=1) + @test t isa Dagger.DTask + @test fetch(t) == sum(A; dims=1) + end + @testset "getindex" begin + A = rand(4, 4) + + t = @spawn A[1, 2] + @test t isa Dagger.DTask + @test fetch(t) == A[1, 2] + + B = Dagger.@spawn rand(4, 4) + t = @spawn B[1, 2] + @test t isa Dagger.DTask + @test fetch(t) == fetch(B)[1, 2] + + R = Ref(42) + t = @spawn R[] + @test t isa Dagger.DTask + @test fetch(t) == 42 + end + @testset "invalid expression" begin + @test_throws LoadError eval(:(@spawn 1)) + @test_throws LoadError eval(:(@spawn begin 1 end)) + @test_throws LoadError eval(:(@spawn begin + 1+1 + 1+1 + end)) + end @testset "waiting" begin a = @spawn sleep(1) @test !isready(a)