Skip to content

Commit

Permalink
Reduce allocations in push! + modify tests
Browse files Browse the repository at this point in the history
  • Loading branch information
RAYNAUD Paul (raynaudp) committed Jan 18, 2023
1 parent cab923d commit 1e51035
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 28 deletions.
63 changes: 39 additions & 24 deletions src/compressed_lbfgs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ In addition to this structures which are circurlarly update when `k` reaches `m`
- `inverse_intermediate_2`: a R²ᵏˣ²ᵏ matrix;
- `intermediary_vector`: a vector ∈ Rᵏ to store intermediate solutions;
- `sol`: a vector ∈ Rᵏ to store intermediate solutions;
- `intermediate_structure_updated`: inform if the intermediate structures are up-to-date or not.
This implementation is designed to work either on CPU or GPU.
"""
mutable struct CompressedLBFGS{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}}
Expand All @@ -50,26 +49,26 @@ mutable struct CompressedLBFGS{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}}
Lₖ::LowerTriangular{T,M} # m * m

chol_matrix::M # 2m * 2m
intermediate_diagonal::Diagonal{T,V} # m * m
intermediate_1::UpperTriangular{T,M} # 2m * 2m
intermediate_2::LowerTriangular{T,M} # 2m * 2m
inverse_intermediate_1::UpperTriangular{T,M} # 2m * 2m
inverse_intermediate_2::LowerTriangular{T,M} # 2m * 2m
intermediary_vector::V # 2m
sol::V # m
intermediate_structure_updated::Bool
end

default_gpu() = CUDA.functional() ? true : false
default_matrix_type(gpu::Bool, T::DataType) = gpu ? CuMatrix{T} : Matrix{T}
default_vector_type(gpu::Bool, T::DataType) = gpu ? CuVector{T} : Vector{T}
default_matrix_type(gpu::Bool; T::DataType=Float64) = gpu ? CuMatrix{T} : Matrix{T}
default_vector_type(gpu::Bool; T::DataType=Float64) = gpu ? CuVector{T} : Vector{T}

"""
CompressedLBFGS(n::Int; [T=Float64, m=5], gpu:Bool)
A implementation of a LBFGS operator (forward), representing a `nxn` linear application.
It considers at most `k` BFGS iterates, and fit the architecture depending if it is launched on a CPU or a GPU.
"""
function CompressedLBFGS(n::Int; m::Int=5, T=Float64, gpu=default_gpu(), M=default_matrix_type(gpu, T), V=default_vector_type(gpu, T))
function CompressedLBFGS(n::Int; m::Int=5, T=Float64, gpu=default_gpu(), M=default_matrix_type(gpu; T), V=default_vector_type(gpu; T))
α = (T)(1)
k = 0
Sₖ = M(undef, n, m)
Expand All @@ -78,14 +77,14 @@ function CompressedLBFGS(n::Int; m::Int=5, T=Float64, gpu=default_gpu(), M=defau
Lₖ = LowerTriangular(M(undef, m, m))

chol_matrix = M(undef, m, m)
intermediate_diagonal = Diagonal(V(undef, m))
intermediate_1 = UpperTriangular(M(undef, 2*m, 2*m))
intermediate_2 = LowerTriangular(M(undef, 2*m, 2*m))
inverse_intermediate_1 = UpperTriangular(M(undef, 2*m, 2*m))
inverse_intermediate_2 = LowerTriangular(M(undef, 2*m, 2*m))
intermediary_vector = V(undef, 2*m)
sol = V(undef, 2*m)
intermediate_structure_updated = false
return CompressedLBFGS{T,M,V}(m, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, intermediary_vector, sol, intermediate_structure_updated)
return CompressedLBFGS{T,M,V}(m, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_diagonal, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, intermediary_vector, sol)
end

function Base.push!(op::CompressedLBFGS{T,M,V}, s::V, y::V) where {T,M,V<:AbstractVector{T}}
Expand All @@ -109,13 +108,16 @@ function Base.push!(op::CompressedLBFGS{T,M,V}, s::V, y::V) where {T,M,V<:Abstra
# for the time being, reinstantiate completely the Lₖ matrix
for j in 1:op.k
for i in 1:j-1
op.Lₖ.data[j, i] = dot(op.Sₖ[:, j], op.Yₖ[:, i])
op.Lₖ.data[j, i] = dot(view(op.Sₖ,:, j), view(op.Yₖ, :, i))
end
end
end

# step 4 and 6
precompile_iterated_structure!(op)

# secant equation fails if uncommented
# op.α = dot(y,s)/dot(s,s)
op.intermediate_structure_updated = false
return op
end

Expand All @@ -141,8 +143,18 @@ end

# Algorithm 3.2 (p15)
# step 4, Jₖ is computed only if needed
function inverse_cholesky(op::CompressedLBFGS)
view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k)) .+ view(op.Lₖ, 1:op.k, 1:op.k) * inv(Diagonal(op.Dₖ[1:op.k, 1:op.k])) * transpose(view(op.Lₖ, 1:op.k, 1:op.k))
function inverse_cholesky(op::CompressedLBFGS{T,M,V}) where {T,M,V}
view(op.intermediate_diagonal.diag, 1:op.k) .= inv.(view(op.Dₖ.diag, 1:op.k))

# view(op.Lₖ, 1:op.k, 1:op.k) * inv(Diagonal(op.Dₖ[1:op.k, 1:op.k])) * transpose(view(op.Lₖ, 1:op.k, 1:op.k))
mul!(view(op.inverse_intermediate_1, 1:op.k, 1:op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k), transpose(view(op.Lₖ, 1:op.k, 1:op.k)))
mul!(view(op.chol_matrix, 1:op.k, 1:op.k), view(op.Lₖ, 1:op.k, 1:op.k), view(op.inverse_intermediate_1, 1:op.k, 1:op.k))

# view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k))
mul!(view(op.chol_matrix, 1:op.k, 1:op.k), transpose(view(op.Sₖ, :, 1:op.k)), view(op.Sₖ, :, 1:op.k), op.α, (T)(1))

# view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k)) .+ view(op.Lₖ, 1:op.k, 1:op.k) * inv(Diagonal(op.Dₖ[1:op.k, 1:op.k])) * transpose(view(op.Lₖ, 1:op.k, 1:op.k))

cholesky!(Symmetric(view(op.chol_matrix, 1:op.k, 1:op.k)))
Jₖ = transpose(UpperTriangular(view(op.chol_matrix, 1:op.k, 1:op.k)))
return Jₖ
Expand All @@ -152,29 +164,32 @@ end
function precompile_iterated_structure!(op::CompressedLBFGS)
Jₖ = inverse_cholesky(op)

view(op.intermediate_1, 1:op.k,1:op.k) .= .- view(op.Dₖ, 1:op.k, 1:op.k)^(1/2)
view(op.intermediate_1, 1:op.k,op.k+1:2*op.k) .= view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2) * transpose(view(op.Lₖ, 1:op.k, 1:op.k))
# constant update
view(op.intermediate_1, op.k+1:2*op.k, 1:op.k) .= 0
view(op.intermediate_1, op.k+1:2*op.k, op.k+1:2*op.k) .= transpose(Jₖ)

view(op.intermediate_2, 1:op.k, 1:op.k) .= view(op.Dₖ, 1:op.k, 1:op.k)^(1/2)
view(op.intermediate_2, 1:op.k, op.k+1:2*op.k) .= 0
view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= .- view(op.Lₖ, 1:op.k, 1:op.k) * view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2)
view(op.intermediate_1, op.k+1:2*op.k, op.k+1:2*op.k) .= transpose(Jₖ)
view(op.intermediate_2, op.k+1:2*op.k, op.k+1:2*op.k) .= Jₖ

# updates related to D^(1/2)
view(op.intermediate_diagonal.diag, 1:op.k) .= sqrt.(view(op.Dₖ.diag, 1:op.k))
view(op.intermediate_1, 1:op.k,1:op.k) .= .- view(op.intermediate_diagonal, 1:op.k, 1:op.k)
view(op.intermediate_2, 1:op.k, 1:op.k) .= view(op.intermediate_diagonal, 1:op.k, 1:op.k)

# updates related to D^(-1/2)
view(op.intermediate_diagonal.diag, 1:op.k) .= (x -> 1/sqrt(x)).(view(op.Dₖ.diag, 1:op.k))
mul!(view(op.intermediate_1, 1:op.k,op.k+1:2*op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k), transpose(view(op.Lₖ, 1:op.k, 1:op.k)))
# view(op.intermediate_1, 1:op.k,op.k+1:2*op.k) .= view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2) * transpose(view(op.Lₖ, 1:op.k, 1:op.k))
mul!(view(op.intermediate_2, op.k+1:2*op.k, 1:op.k), view(op.Lₖ, 1:op.k, 1:op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k))
view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .* -1
# view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= .- view(op.Lₖ, 1:op.k, 1:op.k) * view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2)

view(op.inverse_intermediate_1, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_1[1:2*op.k, 1:2*op.k])
view(op.inverse_intermediate_2, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_2[1:2*op.k, 1:2*op.k])

op.intermediate_structure_updated = true
end

# Algorithm 3.2 (p15)
function LinearAlgebra.mul!(Bv::V, op::CompressedLBFGS{T,M,V}, v::V) where {T,M,V<:AbstractVector{T}}
# step 1-3 mainly done by Base.push!

# steps 4 and 6, in case the intermediary structures required are not up to date
(!op.intermediate_structure_updated) && (precompile_iterated_structure!(op))

# step 1-4 and 6 mainly done by Base.push!
# step 5
mul!(view(op.sol, 1:op.k), transpose(view(op.Yₖ, :, 1:op.k)), v)
mul!(view(op.sol, op.k+1:2*op.k), transpose(view(op.Sₖ, :, 1:op.k)), v)
Expand Down
10 changes: 6 additions & 4 deletions test/test_clbfgs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
n=100
n=5
lbfgs = CompressedLBFGS(n) # m=5
Bv = rand(n)
V = LinearOperators.default_vector_type(LinearOperators.default_gpu())
Bv = V(rand(n))
s = V(rand(n))
mul!(Bv, lbfgs, s) # warm-up
for i in 1:iter
s = rand(n)
y = rand(n)
s = V(rand(n))
y = V(rand(n))
push!(lbfgs, s, y)
# warmp-up computing the mandatory intermediate structures
mul!(Bv, lbfgs, s)
allocs = @allocated mul!(Bv, lbfgs, s)
@test allocs == 0
@test Bv y
Expand Down

0 comments on commit 1e51035

Please sign in to comment.