From 1e51035273717d907868bc0e81f04fe562bb9431 Mon Sep 17 00:00:00 2001
From: "RAYNAUD Paul (raynaudp)"
Date: Wed, 18 Jan 2023 12:16:12 +0200
Subject: [PATCH] Reduce allocations in push! + modify tests

---
 src/compressed_lbfgs.jl | 63 +++++++++++++++++++++++++----------------
 test/test_clbfgs.jl     | 10 ++++---
 2 files changed, 45 insertions(+), 28 deletions(-)

diff --git a/src/compressed_lbfgs.jl b/src/compressed_lbfgs.jl
index e17669c..a0293ec 100644
--- a/src/compressed_lbfgs.jl
+++ b/src/compressed_lbfgs.jl
@@ -36,7 +36,6 @@ In addition to this structures which are circurlarly update when `k` reaches `m`
 - `inverse_intermediate_2`: a R²ᵏˣ²ᵏ matrix;
 - `intermediary_vector`: a vector ∈ Rᵏ to store intermediate solutions;
 - `sol`: a vector ∈ Rᵏ to store intermediate solutions;
-- `intermediate_structure_updated`: inform if the intermediate structures are up-to-date or not.
 This implementation is designed to work either on CPU or GPU.
 """
 mutable struct CompressedLBFGS{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}}
@@ -50,18 +49,18 @@ mutable struct CompressedLBFGS{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}}
   Lₖ::LowerTriangular{T,M} # m * m
 
   chol_matrix::M # 2m * 2m
+  intermediate_diagonal::Diagonal{T,V} # m * m
   intermediate_1::UpperTriangular{T,M} # 2m * 2m
   intermediate_2::LowerTriangular{T,M} # 2m * 2m
   inverse_intermediate_1::UpperTriangular{T,M} # 2m * 2m
   inverse_intermediate_2::LowerTriangular{T,M} # 2m * 2m
   intermediary_vector::V # 2m
   sol::V # m
-  intermediate_structure_updated::Bool
 end
 
 default_gpu() = CUDA.functional() ? true : false
-default_matrix_type(gpu::Bool, T::DataType) = gpu ? CuMatrix{T} : Matrix{T}
-default_vector_type(gpu::Bool, T::DataType) = gpu ? CuVector{T} : Vector{T}
+default_matrix_type(gpu::Bool; T::DataType=Float64) = gpu ? CuMatrix{T} : Matrix{T}
+default_vector_type(gpu::Bool; T::DataType=Float64) = gpu ? CuVector{T} : Vector{T}
 
 """
     CompressedLBFGS(n::Int; [T=Float64, m=5], gpu:Bool)
@@ -69,7 +68,7 @@ default_vector_type(gpu::Bool, T::DataType) = gpu ? CuVector{T} : Vector{T}
 A implementation of a LBFGS operator (forward), representing a `nxn` linear application.
 It considers at most `k` BFGS iterates, and fit the architecture depending if it is launched on a CPU or a GPU.
 """
-function CompressedLBFGS(n::Int; m::Int=5, T=Float64, gpu=default_gpu(), M=default_matrix_type(gpu, T), V=default_vector_type(gpu, T))
+function CompressedLBFGS(n::Int; m::Int=5, T=Float64, gpu=default_gpu(), M=default_matrix_type(gpu; T), V=default_vector_type(gpu; T))
   α = (T)(1)
   k = 0
   Sₖ = M(undef, n, m)
@@ -78,14 +77,14 @@ function CompressedLBFGS(n::Int; m::Int=5, T=Float64, gpu=default_gpu(), M=defau
   Lₖ = LowerTriangular(M(undef, m, m))
 
   chol_matrix = M(undef, m, m)
+  intermediate_diagonal = Diagonal(V(undef, m))
   intermediate_1 = UpperTriangular(M(undef, 2*m, 2*m))
   intermediate_2 = LowerTriangular(M(undef, 2*m, 2*m))
   inverse_intermediate_1 = UpperTriangular(M(undef, 2*m, 2*m))
   inverse_intermediate_2 = LowerTriangular(M(undef, 2*m, 2*m))
   intermediary_vector = V(undef, 2*m)
   sol = V(undef, 2*m)
-  intermediate_structure_updated = false
-  return CompressedLBFGS{T,M,V}(m, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, intermediary_vector, sol, intermediate_structure_updated)
+  return CompressedLBFGS{T,M,V}(m, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_diagonal, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, intermediary_vector, sol)
 end
 
 function Base.push!(op::CompressedLBFGS{T,M,V}, s::V, y::V) where {T,M,V<:AbstractVector{T}}
@@ -109,13 +108,16 @@ function Base.push!(op::CompressedLBFGS{T,M,V}, s::V, y::V) where {T,M,V<:Abstra
     # for the time being, reinstantiate completely the Lₖ matrix
     for j in 1:op.k
       for i in 1:j-1
-        op.Lₖ.data[j, i] = dot(op.Sₖ[:, j], op.Yₖ[:, i])
+        op.Lₖ.data[j, i] = dot(view(op.Sₖ,:, j), view(op.Yₖ, :, i))
       end
     end
   end
+
+  # step 4 and 6
+  precompile_iterated_structure!(op)
+
   # secant equation fails if uncommented
   # op.α = dot(y,s)/dot(s,s)
-  op.intermediate_structure_updated = false
   return op
 end
 
@@ -141,8 +143,18 @@
 
 # Algorithm 3.2 (p15)
 # step 4, Jₖ is computed only if needed
-function inverse_cholesky(op::CompressedLBFGS)
-  view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k)) .+ view(op.Lₖ, 1:op.k, 1:op.k) * inv(Diagonal(op.Dₖ[1:op.k, 1:op.k])) * transpose(view(op.Lₖ, 1:op.k, 1:op.k))
+function inverse_cholesky(op::CompressedLBFGS{T,M,V}) where {T,M,V}
+  view(op.intermediate_diagonal.diag, 1:op.k) .= inv.(view(op.Dₖ.diag, 1:op.k))
+
+  # view(op.Lₖ, 1:op.k, 1:op.k) * inv(Diagonal(op.Dₖ[1:op.k, 1:op.k])) * transpose(view(op.Lₖ, 1:op.k, 1:op.k))
+  mul!(view(op.inverse_intermediate_1, 1:op.k, 1:op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k), transpose(view(op.Lₖ, 1:op.k, 1:op.k)))
+  mul!(view(op.chol_matrix, 1:op.k, 1:op.k), view(op.Lₖ, 1:op.k, 1:op.k), view(op.inverse_intermediate_1, 1:op.k, 1:op.k))
+
+  # view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k))
+  mul!(view(op.chol_matrix, 1:op.k, 1:op.k), transpose(view(op.Sₖ, :, 1:op.k)), view(op.Sₖ, :, 1:op.k), op.α, (T)(1))
+
+  # view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k)) .+ view(op.Lₖ, 1:op.k, 1:op.k) * inv(Diagonal(op.Dₖ[1:op.k, 1:op.k])) * transpose(view(op.Lₖ, 1:op.k, 1:op.k))
+
   cholesky!(Symmetric(view(op.chol_matrix, 1:op.k, 1:op.k)))
   Jₖ = transpose(UpperTriangular(view(op.chol_matrix, 1:op.k, 1:op.k)))
   return Jₖ
@@ -152,29 +164,32 @@ end
 function precompile_iterated_structure!(op::CompressedLBFGS)
   Jₖ = inverse_cholesky(op)
 
-  view(op.intermediate_1, 1:op.k,1:op.k) .= .- view(op.Dₖ, 1:op.k, 1:op.k)^(1/2)
-  view(op.intermediate_1, 1:op.k,op.k+1:2*op.k) .= view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2) * transpose(view(op.Lₖ, 1:op.k, 1:op.k))
+  # constant update
   view(op.intermediate_1, op.k+1:2*op.k, 1:op.k) .= 0
-  view(op.intermediate_1, op.k+1:2*op.k, op.k+1:2*op.k) .= transpose(Jₖ)
-
-  view(op.intermediate_2, 1:op.k, 1:op.k) .= view(op.Dₖ, 1:op.k, 1:op.k)^(1/2)
   view(op.intermediate_2, 1:op.k, op.k+1:2*op.k) .= 0
-  view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= .- view(op.Lₖ, 1:op.k, 1:op.k) * view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2)
+  view(op.intermediate_1, op.k+1:2*op.k, op.k+1:2*op.k) .= transpose(Jₖ)
   view(op.intermediate_2, op.k+1:2*op.k, op.k+1:2*op.k) .= Jₖ
 
+  # updates related to D^(1/2)
+  view(op.intermediate_diagonal.diag, 1:op.k) .= sqrt.(view(op.Dₖ.diag, 1:op.k))
+  view(op.intermediate_1, 1:op.k,1:op.k) .= .- view(op.intermediate_diagonal, 1:op.k, 1:op.k)
+  view(op.intermediate_2, 1:op.k, 1:op.k) .= view(op.intermediate_diagonal, 1:op.k, 1:op.k)
+
+  # updates related to D^(-1/2)
+  view(op.intermediate_diagonal.diag, 1:op.k) .= (x -> 1/sqrt(x)).(view(op.Dₖ.diag, 1:op.k))
+  mul!(view(op.intermediate_1, 1:op.k,op.k+1:2*op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k), transpose(view(op.Lₖ, 1:op.k, 1:op.k)))
+  # view(op.intermediate_1, 1:op.k,op.k+1:2*op.k) .= view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2) * transpose(view(op.Lₖ, 1:op.k, 1:op.k))
+  mul!(view(op.intermediate_2, op.k+1:2*op.k, 1:op.k), view(op.Lₖ, 1:op.k, 1:op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k))
+  view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .* -1
+  # view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= .- view(op.Lₖ, 1:op.k, 1:op.k) * view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2)
+
   view(op.inverse_intermediate_1, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_1[1:2*op.k, 1:2*op.k])
   view(op.inverse_intermediate_2, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_2[1:2*op.k, 1:2*op.k])
-
-  op.intermediate_structure_updated = true
 end
 
 # Algorithm 3.2 (p15)
 function LinearAlgebra.mul!(Bv::V, op::CompressedLBFGS{T,M,V}, v::V) where {T,M,V<:AbstractVector{T}}
-  # step 1-3 mainly done by Base.push!
-
-  # steps 4 and 6, in case the intermediary structures required are not up to date
-  (!op.intermediate_structure_updated) && (precompile_iterated_structure!(op))
-
+  # step 1-4 and 6 mainly done by Base.push!
   # step 5
   mul!(view(op.sol, 1:op.k), transpose(view(op.Yₖ, :, 1:op.k)), v)
   mul!(view(op.sol, op.k+1:2*op.k), transpose(view(op.Sₖ, :, 1:op.k)), v)
diff --git a/test/test_clbfgs.jl b/test/test_clbfgs.jl
index 3f5f6de..dc929d7 100644
--- a/test/test_clbfgs.jl
+++ b/test/test_clbfgs.jl
@@ -3,13 +3,15 @@
   n=100
   n=5
   lbfgs = CompressedLBFGS(n) # m=5
-  Bv = rand(n)
+  V = LinearOperators.default_vector_type(LinearOperators.default_gpu())
+  Bv = V(rand(n))
+  s = V(rand(n))
+  mul!(Bv, lbfgs, s) # warm-up
   for i in 1:iter
-    s = rand(n)
-    y = rand(n)
+    s = V(rand(n))
+    y = V(rand(n))
     push!(lbfgs, s, y) # warmp-up computing the mandatory intermediate structures
-    mul!(Bv, lbfgs, s)
     allocs = @allocated mul!(Bv, lbfgs, s)
     @test allocs == 0
     @test Bv ≈ y
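For reference, the allocation reduction in `inverse_cholesky` comes from replacing the broadcast `op.α .* (SᵀS) .+ L * inv(D) * Lᵀ` with in-place `mul!` calls, including the five-argument `mul!` for the scaled Gram term. A minimal sketch of that pattern on plain dense arrays, assuming nothing from the package; every name in it (`chol_block!`, `C`, `tmp`, `Dinv`) is illustrative, not part of the patch:

using LinearAlgebra

# Sketch: build C = α·SᵀS + L·D⁻¹·Lᵀ without temporaries, the way the patched
# inverse_cholesky does. C and tmp are preallocated k×k buffers; Dinv is a
# preallocated Diagonal overwritten on each call (the patch reuses
# op.intermediate_diagonal in the same role).
function chol_block!(C, tmp, Dinv, S, L, d, α)
  Dinv.diag .= inv.(d)                 # D⁻¹ written into the preallocated Diagonal
  mul!(tmp, Dinv, transpose(L))        # tmp = D⁻¹·Lᵀ
  mul!(C, L, tmp)                      # C   = L·D⁻¹·Lᵀ
  mul!(C, transpose(S), S, α, one(α))  # C  += α·SᵀS  (five-argument mul!, no temporary)
  return C
end

k, n = 3, 10
S = rand(n, k); L = tril(rand(k, k), -1); d = rand(k) .+ 1
C = zeros(k, k); tmp = zeros(k, k); Dinv = Diagonal(zeros(k))
chol_block!(C, tmp, Dinv, S, L, d, 1.0)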
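The updated test above doubles as a usage example. A self-contained sketch of the same check follows; it assumes `CompressedLBFGS`, `default_gpu`, and `default_vector_type` are reachable through the `LinearOperators` module, as in the test (they may be internal rather than exported), and the helper name `check_allocations` is illustrative:

using LinearAlgebra, LinearOperators

# Usage sketch mirroring the updated test. The loop lives in a function so the
# @allocated measurement is not polluted by global-variable dispatch.
function check_allocations(n = 5, iter = 10)
  lbfgs = LinearOperators.CompressedLBFGS(n)  # m = 5 by default
  V = LinearOperators.default_vector_type(LinearOperators.default_gpu())
  Bv = V(rand(n))
  s = V(rand(n))
  mul!(Bv, lbfgs, s)                          # warm-up, as in the test
  for _ in 1:iter
    s = V(rand(n))
    y = V(rand(n))
    push!(lbfgs, s, y)                        # push! now refreshes the intermediate structures itself
    allocs = @allocated mul!(Bv, lbfgs, s)
    println("allocations in mul!: ", allocs)  # expected to be 0 with this patch
    @assert Bv ≈ y                            # secant equation for the latest (s, y) pair
  end
end

check_allocations()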