From 65b64c850267803ecd9c2f43c7c4b7d7f6300c46 Mon Sep 17 00:00:00 2001
From: Guillaume Vigne
Date: Fri, 21 Oct 2022 11:36:01 +0200
Subject: [PATCH] Merge DFTK's implementation of LOBPCG into IterativeSolvers

---
 src/lobpcg.jl | 1367 +++++++++++++++++--------------------------------
 1 file changed, 481 insertions(+), 886 deletions(-)

diff --git a/src/lobpcg.jl b/src/lobpcg.jl
index 57eefec2..c75a1868 100644
--- a/src/lobpcg.jl
+++ b/src/lobpcg.jl
@@ -1,962 +1,557 @@
-#=
-The code below was derived from the scipy implementation of the LOBPCG algorithm in https://github.com/scipy/scipy/blob/v1.1.0/scipy/sparse/linalg/eigen/lobpcg/lobpcg.py#L109-L568.
-
-Since the link above mentions the license is BSD license, the notice for the BSD license 2.0 is hereby given below giving credit to the authors of the Python implementation.
-
-Copyright (c) 2018, Robert Cimrman, Andrew Knyazev
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * The names of the contributors may not be used to endorse or promote products
-      derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
-DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-=#
-
-export lobpcg, lobpcg!, LOBPCGIterator
+# Implementation of the LOBPCG algorithm, seeking optimal performance
+# and stability (it should be very, very hard to break, even at tight
+# tolerance criteria). The code is not 100% optimized yet, but the
+# most time-consuming parts (BLAS3 and matvecs) should be OK. The
+# implementation follows the scheme of Hetmaniuk & Lehoucq (see also
+# the refinements in Duersch et al.), with the following
+# modifications:
+
+# - Cholesky factorizations are used instead of the eigenvalue
+#   decomposition of Stathopoulos & Wu for the orthogonalization.
+#   Cholesky orthogonalization is fast but has an unwarranted bad
+#   reputation: when applied twice to a matrix X with κ(X) <~
+#   sqrt(1/ε), where ε is the machine epsilon, it produces a set of
+#   vectors that is orthogonal to machine precision, just like the
+#   eigendecomposition-based method. It can fail when κ >~ sqrt(1/ε),
+#   but that can be fixed by shifting the overlap matrix. This is
+#   very reliable while being much faster than eigendecompositions.
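+#
+#   Illustrative sketch of the "applied twice" idea (the actual ortho!
+#   below additionally handles factorization failures by shifting the
+#   overlap matrix):
+#
+#       R = cholesky(Hermitian(X'X)).U; rdiv!(X, R)   # first pass
+#       R = cholesky(Hermitian(X'X)).U; rdiv!(X, R)   # second pass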
+
+# - Termination criteria for the orthogonalization are based on cheap
+#   estimates instead of costly explicit checks.
+
+# - The default tolerances are very tight, yielding a very stable
+#   algorithm. This can be tweaked (see keyword ortho_tol) to
+#   compromise on stability for fewer Cholesky factorizations.
+
+# - Implicit product updates (reuse of matvecs) are performed whenever
+#   it is safe to do so, i.e. only with orthogonal transformations. An
+#   exception is the B matrix reuse, which seems to be OK even with very
+#   badly conditioned B matrices. The code is easy to modify if this is
+#   not the case.
+
+# - The locking is performed carefully to ensure minimal impact on the
+#   other eigenvectors (which is not the case in many, possibly all,
+#   other implementations).
+
+
+## TODO micro-optimization of buffer reuse
+## TODO write a version that doesn't assume that B is well-conditioned, and doesn't reuse B applications at all
+## TODO it seems there is a lack of orthogonalization immediately after locking, maybe investigate this to save on some Choleskys
+## TODO debug orthogonalizations when A=I
+export lobpcg

 using LinearAlgebra
-using Random
+import Base: *
+import Base.size, Base.adjoint, Base.Array

-struct LOBPCGState{TR,TL}
-    iteration::Int
-    residual_norms::TR
-    ritz_values::TL
-end
-function Base.show(io::IO, t::LOBPCGState)
-    @printf io "%8d %14e\n" t.iteration maximum(t.residual_norms)
-    return
-end
-
-const LOBPCGTrace{TR,TL} = Vector{LOBPCGState{TR,TL}}
-function Base.show(io::IO, tr::LOBPCGTrace)
-    @printf io "Iteration    Maximum residual norm \n"
-    @printf io "---------    ---------------------\n"
-    for state in tr
-        show(io, state)
-    end
-    return
-end
+# vprintln(args...) = println(args...)  # Uncomment for output
+vprintln(args...) = nothing

-struct LOBPCGResults{TL, TX, T, TR, TI, TM, TB, TTrace <: Union{LOBPCGTrace, AbstractVector{<:LOBPCGTrace}}}
+struct LOBPCGResults{TL <: AbstractArray, TX <: AbstractArray}
     λ::TL
     X::TX
-    tolerance::T
-    residual_norms::TR
-    iterations::TI
-    maxiter::TM
-    converged::TB
-    trace::TTrace
-end
-function EmptyLOBPCGResults(X::TX, k::Integer, tolerance, maxiter) where {T, TX<:AbstractMatrix{T}}
-    blocksize = size(X,2)
-    λ = Vector{T}(undef, k)
-    X = TX(undef, size(X, 1), k)
-
-    iterations = zeros(Int, ceil(Int, k/blocksize))
-    residual_norms = copy(λ)
-    converged = falses(k)
-    trace = fill(LOBPCGTrace{Vector{real(T)},Vector{T}}(), k÷blocksize+1)
-
-    return LOBPCGResults(λ, X, tolerance, residual_norms, iterations, maxiter, converged, trace)
-end
-
-function Base.append!(r1::LOBPCGResults, r2::LOBPCGResults, n1, n2=length(r2.λ))
-    n = n1 + n2
-    r1.λ[n1+1:n] .= @view r2.λ[end-n2+1:end]
-    r1.residual_norms[n1+1:n] .= @view r2.residual_norms[end-n2+1:end]
-    r1.X[:, n1+1:n] .= @view r2.X[:, end-n2+1:end]
-
-    ind = n1 ÷ length(r2.λ) + 1
-    r1.iterations[ind] = r2.iterations
-    r1.converged[n1+1:n] .= r2.converged
-    r1.trace[ind] = r2.trace
-
-    return r1
-end
-function Base.show(io::IO, r::LOBPCGResults)
-    first_two(fr) = [x for (i, x) in enumerate(fr)][1:2]
-
-    @printf io "Results of LOBPCG Algorithm\n"
-    @printf io " * Algorithm: LOBPCG - CholQR\n"
-
-    if length(join(r.λ, ",")) < 40 || length(r.λ) <= 2
-        @printf io " * λ: [%s]\n" join(r.λ, ",")
-    else
-        @printf io " * λ: [%s, ...]\n" join(first_two(r.λ), ",")
-    end
-
-    if length(join(r.residual_norms, ",")) < 40 || length(r.residual_norms) <= 2
-        @printf io " * Residual norm(s): [%s]\n" join(r.residual_norms, ",")
-    else
-        @printf io " * Residual norm(s): [%s, ...]\n" join(first_two(r.residual_norms), ",")
-    end
-    @printf io " * Convergence\n"
-    @printf io "   * Iterations: %s\n" r.iterations
-    @printf io "   * Converged: %s\n" all(r.converged)
-    @printf io "   * Iterations limit: %s\n" r.maxiter
-
-    return
-end
-
-struct Blocks{Generalized, T, TA<:AbstractArray{T}}
-    block::TA # X, R or P
-    A_block::TA # AX, AR or AP
-    B_block::TA # BX, BR or BP
-end
-Blocks(X, AX) = Blocks{false, eltype(X), typeof(X)}(X, AX, X)
-Blocks(X, AX, BX) = Blocks{true, eltype(X), typeof(X)}(X, AX, BX)
-function A_mul_X!(b::Blocks, A)
-    mul!(b.A_block, A, b.block)
-    return
-end
-function A_mul_X!(b::Blocks, A, n)
-    @views mul!(b.A_block[:, 1:n], A, b.block[:, 1:n])
-    return
-end
-function B_mul_X!(b::Blocks{true}, B)
-    mul!(b.B_block, B, b.block)
-    return
-end
-function B_mul_X!(b::Blocks{true}, B, n)
-    @views mul!(b.B_block[:, 1:n], B, b.block[:, 1:n])
-    return
-end
-function B_mul_X!(b::Blocks{false}, B, n = 0)
-    return
-end
-
-mutable struct Constraint{T, TVorM<:Union{AbstractVector{T}, AbstractMatrix{T}}, TM<:AbstractMatrix{T}, TC}
-    Y::TVorM
-    BY::TVorM
-    gram_chol::TC
-    gramYBV::TM # to be used in view
-    tmp::TM # to be used in view
+    tolerance::Real
+    residual_norms::Vector
+    residual_history::Matrix
+    iterations::Integer
+    maxiter::Integer
+    converged::Bool
+    largest::Bool
+    mvps::Integer
 end
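+# (residual_history stores one column of residual norms per outer iteration,
+# so convergence behavior can be inspected after the fact.)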
-struct BWrapper end
-struct NotBWrapper end
-Constraint(::Nothing, B, X) = Constraint(nothing, B, X, BWrapper())
-Constraint(::Nothing, B, X, ::NotBWrapper) = Constraint(nothing, B, X, BWrapper())
-function Constraint(::Nothing, B, X, ::BWrapper)
-    return Constraint{Nothing, Matrix{Nothing}, Matrix{Nothing}, Nothing}(Matrix{Nothing}(undef,0,0), Matrix{Nothing}(undef,0,0), nothing, Matrix{Nothing}(undef,0,0), Matrix{Nothing}(undef,0,0))
-end
+# Create an array of the same type as X filled with zeros, minimizing the
+# number of allocations.
+# This is mostly for GPU compatibility: we sometimes want the equivalent of
+# zeros(T, dims...), but with the same array type as an existing X
+# (e.g. a GPU array), which zeros alone cannot provide.
+function zeros_like(X::AbstractArray, T::Type=eltype(X), dims::Integer...=size(X)...)
+    Z = similar(X, T, dims...)
+    Z .= 0
+    Z
+end
+zeros_like(X::AbstractArray, dims::Integer...) = zeros_like(X, eltype(X), dims...)
+zeros_like(X::Array, T::Type=eltype(X), dims::Integer...=size(X)...) = zeros(T, dims...)

-Constraint(Y, B, X) = Constraint(Y, B, X, BWrapper())
-function Constraint(Y, B, X, ::BWrapper)
-    T = eltype(X)
-    if B isa Nothing
-        BY = Y
-    else
-        BY = similar(Y)
-        mul!(BY, B, Y)
-    end
-    return Constraint(Y, BY, X, NotBWrapper())
-end
-function Constraint(Y, BY, X, ::NotBWrapper)
-    T = eltype(X)
-    if Y isa SubArray
-        gramYBY = @view Matrix{T}(I, size(Y.parent, 2), size(Y.parent, 2))[1:size(Y, 2), 1:size(Y, 2)]
-        mul!(gramYBY, adjoint(Y), BY)
-        gramYBV = @view zeros(T, size(Y.parent, 2), size(X, 2))[1:size(Y, 2), :]
-    else
-        gramYBY = adjoint(Y)*BY
-        gramYBV = zeros(T, size(Y, 2), size(X, 2))
-    end
-    realdiag!(gramYBY)
-    gramYBY_chol = cholesky!(Hermitian(gramYBY))
-    tmp = deepcopy(gramYBV)
-    return Constraint{eltype(Y), typeof(Y), typeof(gramYBV), typeof(gramYBY_chol)}(Y, BY, gramYBY_chol, gramYBV, tmp)
-end

+"""
+Simple wrapper to represent a matrix formed by the concatenation of column blocks:
+it is mostly equivalent to hcat, but doesn't allocate the full matrix.
+LazyHcat only supports a few multiplication routines; furthermore, a multiplication
+involving this structure will always yield a plain array (and not a LazyHcat structure).
+LazyHcat is a lightweight subset of BlockArrays.jl's functionality, but has the
+advantage of being able to store GPU arrays (BlockArrays is heavily built on Julia's CPU Array).
+"""
+struct LazyHcat{T <: Number, D <: Tuple} <: AbstractMatrix{T}
+    blocks::D
+end
+
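+# Example (illustrative): for n×m blocks X, R, P, LazyHcat(X, R, P) behaves
+# like hcat(X, R, P) in the products used below (Y' * Y, Y' * M, Y * c)
+# without allocating the n×3m concatenation.
+function LazyHcat(arrays::AbstractArray...)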
+ @assert length(arrays) != 0 + n_ref = size(arrays[1], 1) + @assert all(size.(arrays, 1) .== n_ref) -function update!(c::Constraint, X, BX) - sizeY = size(c.Y, 2) - sizeX = size(X, 2) - c.Y.parent[:, sizeY+1:sizeY+sizeX] .= X - if X !== BX - c.BY.parent[:, sizeY+1:sizeY+sizeX] .= BX - end - sizeY += sizeX - Y = @view c.Y.parent[:, 1:sizeY] - BY = @view c.BY.parent[:, 1:sizeY] - c.Y = Y - c.BY = BY - gram_chol = c.gram_chol - new_factors = @view gram_chol.factors.parent[1:sizeY, 1:sizeY] - c.gram_chol = typeof(gram_chol)(new_factors, gram_chol.uplo, convert(LinearAlgebra.BlasInt, 0)) - c.gramYBV = @view c.gramYBV.parent[1:sizeY, :] - c.tmp = @view c.tmp.parent[1:sizeY, :] - return c -end + T = promote_type(map(eltype, arrays)...) -function (constr!::Constraint{Nothing})(X, X_temp) - nothing + LazyHcat{T, typeof(arrays)}(arrays) end -function (constr!::Constraint)(X, X_temp) - @views if size(constr!.Y, 2) > 0 - sizeX = size(X, 2) - sizeY = size(constr!.Y, 2) - gramYBV_view = constr!.gramYBV[1:sizeY, 1:sizeX] - mul!(gramYBV_view, adjoint(constr!.BY), X) - tmp_view = constr!.tmp[1:sizeY, 1:sizeX] - ldiv!(tmp_view, constr!.gram_chol, gramYBV_view) - mul!(X_temp, constr!.Y, tmp_view) - @inbounds X .= X .- X_temp - end - nothing +function Base.size(A::LazyHcat) + n = size(A.blocks[1], 1) + m = sum(size(block, 2) for block in A.blocks) + (n,m) end -struct RPreconditioner{TM, T, TA<:AbstractArray{T}} - M::TM - buffer::TA - RPreconditioner{TM, T, TA}(M, X) where {TM, T, TA<:AbstractArray{T}} = new(M, similar(X)) -end -RPreconditioner(M, X) = RPreconditioner{typeof(M), eltype(X), typeof(X)}(M, X) +Base.Array(A::LazyHcat) = hcat(A.blocks...) -function (precond!::RPreconditioner{Nothing})(X) - nothing -end -function (precond!::RPreconditioner)(X) - bs = size(X, 2) - @views ldiv!(precond!.buffer[:, 1:bs], precond!.M, X) - # Just returning buffer would be cheaper but struct at call site must be mutable - @inbounds @views X .= precond!.buffer[:, 1:bs] - nothing -end +Base.adjoint(A::LazyHcat) = Adjoint(A) -struct BlockGram{Generalized, TA} - XAX::TA - XAR::TA - XAP::TA - RAR::TA - RAP::TA - PAP::TA -end -function BlockGram(XBlocks::Blocks{Generalized, T}) where {Generalized, T} - sizeX = size(XBlocks.block, 2) - XAX = zeros(T, sizeX, sizeX) - XAP = zeros(T, sizeX, sizeX) - XAR = zeros(T, sizeX, sizeX) - RAR = zeros(T, sizeX, sizeX) - RAP = zeros(T, sizeX, sizeX) - PAP = zeros(T, sizeX, sizeX) - return BlockGram{Generalized, Matrix{T}}(XAX, XAR, XAP, RAR, RAP, PAP) -end -XAX!(BlockGram, XBlocks) = mul!(BlockGram.XAX, adjoint(XBlocks.block), XBlocks.A_block) -XAP!(BlockGram, XBlocks, PBlocks, n) = @views mul!(BlockGram.XAP[:, 1:n], adjoint(XBlocks.block), PBlocks.A_block[:, 1:n]) -XAR!(BlockGram, XBlocks, RBlocks, n) = @views mul!(BlockGram.XAR[:, 1:n], adjoint(XBlocks.block), RBlocks.A_block[:, 1:n]) -RAR!(BlockGram, RBlocks, n) = @views mul!(BlockGram.RAR[1:n, 1:n], adjoint(RBlocks.block[:, 1:n]), RBlocks.A_block[:, 1:n]) -RAP!(BlockGram, RBlocks, PBlocks, n) = @views mul!(BlockGram.RAP[1:n, 1:n], adjoint(RBlocks.A_block[:, 1:n]), PBlocks.block[:, 1:n]) -PAP!(BlockGram, PBlocks, n) = @views mul!(BlockGram.PAP[1:n, 1:n], adjoint(PBlocks.block[:, 1:n]), PBlocks.A_block[:, 1:n]) -XBP!(BlockGram, XBlocks, PBlocks, n) = @views mul!(BlockGram.XAP[:, 1:n], adjoint(XBlocks.block), PBlocks.B_block[:, 1:n]) -XBR!(BlockGram, XBlocks, RBlocks, n) = @views mul!(BlockGram.XAR[:, 1:n], adjoint(XBlocks.block), RBlocks.B_block[:, 1:n]) -RBP!(BlockGram, RBlocks, PBlocks, n) = @views mul!(BlockGram.RAP[1:n, 1:n], 
adjoint(RBlocks.B_block[:, 1:n]), PBlocks.block[:, 1:n]) -#XBX!(BlockGram, XBlocks) = mul!(BlockGram.XAX, adjoint(XBlocks.block), XBlocks.B_block) -#RBR!(BlockGram, RBlocks, n) = @views mul!(BlockGram.RAR[1:n, 1:n], adjoint(RBlocks.block[:, 1:n]), RBlocks.B_block[:, 1:n]) -#PBP!(BlockGram, PBlocks, n) = @views mul!(BlockGram.PAP[1:n, 1:n], adjoint(PBlocks.block[:, 1:n]), PBlocks.B_block[:, 1:n]) - -function I!(G, xr) - @inbounds for j in xr, i in xr - G[i, j] = ifelse(i==j, 1, 0) - end - return -end +@views function Base.:*(Aadj::Adjoint{T, <: LazyHcat}, B::LazyHcat) where {T} + A = Aadj.parent + rows = size(A)[2] + cols = size(B)[2] + ret = similar(A.blocks[1], rows, cols) -function (g::BlockGram)(gram, lambda, n1::Int, n2::Int, n3::Int) - xr = 1:n1 - rr = n1+1:n1+n2 - pr = n1+n2+1:n1+n2+n3 - @inbounds @views begin - if n1 > 0 - #gram[xr, xr] .= g.XAX[1:n1, 1:n1] - gram[xr, xr] .= Diagonal(lambda[1:n1]) + orow = 0 # row offset + for (iA, blA) in enumerate(A.blocks) + ocol = 0 # column offset + for (iB, blB) in enumerate(B.blocks) + ret[orow .+ (1:size(blA, 2)), ocol .+ (1:size(blB, 2))] .= blA' * blB + ocol += size(blB, 2) end - if n2 > 0 - gram[rr, rr] .= g.RAR[1:n2, 1:n2] - gram[xr, rr] .= g.XAR[1:n1, 1:n2] - conj!(transpose!(gram[rr, xr], g.XAR[1:n1, 1:n2])) - end - if n3 > 0 - gram[pr, pr] .= g.PAP[1:n3, 1:n3] - gram[rr, pr] .= g.RAP[1:n2, 1:n3] - gram[xr, pr] .= g.XAP[1:n1, 1:n3] - conj!(transpose!(gram[pr, rr], g.RAP[1:n2, 1:n3])) - conj!(transpose!(gram[pr, xr], g.XAP[1:n1, 1:n3])) - end - end - return -end -function (g::BlockGram)(gram, n1::Int, n2::Int, n3::Int, normalized::Bool=true) - xr = 1:n1 - rr = n1+1:n1+n2 - pr = n1+n2+1:n1+n2+n3 - @views if n1 > 0 - if normalized - I!(gram, xr) - #else - # @inbounds gram[xr, xr] .= g.XAX[1:n1, 1:n1] - end - end - @views if n2 > 0 - if normalized - I!(gram, rr) - #else - # @inbounds gram[rr, rr] .= g.RAR[1:n2, 1:n2] - end - @inbounds gram[xr, rr] .= g.XAR[1:n1, 1:n2] - @inbounds conj!(transpose!(gram[rr, xr], g.XAR[1:n1, 1:n2])) - end - @views if n3 > 0 - if normalized - I!(gram, pr) - #else - # @inbounds gram[pr, pr] .= g.PAP[1:n3, 1:n3] - end - @inbounds gram[rr, pr] .= g.RAP[1:n2, 1:n3] - @inbounds gram[xr, pr] .= g.XAP[1:n1, 1:n3] - @inbounds conj!(transpose!(gram[pr, rr], g.RAP[1:n2, 1:n3])) - @inbounds conj!(transpose!(gram[pr, xr], g.XAP[1:n1, 1:n3])) + orow += size(blA, 2) end - return + ret end -abstract type AbstractOrtho end -struct CholQR{TA} <: AbstractOrtho - gramVBV::TA # to be used in view -end +Base.:*(Aadj::Adjoint{T, <: LazyHcat}, B::AbstractMatrix) where {T} = Aadj * LazyHcat(B) -function rdiv!(A, B::UpperTriangular) - s = size(A, 2) - @inbounds @views A[:,1] .= A[:, 1] ./ B[1,1] - @inbounds @views for i in 2:s - for j in 1:i-1 - A[:,i] .= A[:,i] .- A[:,j] .* B[j,i] - end - A[:,i] .= A[:,i] ./ B[i,i] +@views function *(Ablock::LazyHcat, B::AbstractMatrix) + res = Ablock.blocks[1] * B[1:size(Ablock.blocks[1], 2), :] # First multiplication + offset = size(Ablock.blocks[1], 2) + for block in Ablock.blocks[2:end] + mul!(res, block, B[offset .+ (1:size(block, 2)), :], 1, 1) + offset += size(block, 2) end - return A + res end -realdiag!(M) = M -function realdiag!(M::AbstractMatrix{TC}) where TC <: Complex - @inbounds for i in 1:minimum(size(M)) - M[i,i] = real(M[i,i]) - end - return M +function LinearAlgebra.mul!(res::AbstractMatrix, Ablock::LazyHcat, + B::AbstractVecOrMat, α::Number, β::Number) + mul!(res, Ablock*B, I, α, β) end -function (ortho!::CholQR)(XBlocks::Blocks{Generalized}, sizeX = -1; update_AX=false, 
update_BX=false) where Generalized
-    useview = sizeX != -1
-    if sizeX == -1
-        sizeX = size(XBlocks.block, 2)
-    end
-    X = XBlocks.block
-    BX = XBlocks.B_block # Assumes it is premultiplied
-    AX = XBlocks.A_block
-    @views gram_view = ortho!.gramVBV[1:sizeX, 1:sizeX]
-    @views if useview
-        mul!(gram_view, adjoint(X[:, 1:sizeX]), BX[:, 1:sizeX])
-    else
-        mul!(gram_view, adjoint(X), BX)
-    end
-    realdiag!(gram_view)
-    cholf = cholesky!(Hermitian(gram_view))
-    R = cholf.factors
-    @views if useview
-        rdiv!(X[:, 1:sizeX], UpperTriangular(R))
-        update_AX && rdiv!(AX[:, 1:sizeX], UpperTriangular(R))
-        Generalized && update_BX && rdiv!(BX[:, 1:sizeX], UpperTriangular(R))
-    else
-        rdiv!(X, UpperTriangular(R))
-        update_AX && rdiv!(AX, UpperTriangular(R))
-        Generalized && update_BX && rdiv!(BX, UpperTriangular(R))
-    end
-
-    return
-end
+# Perform a Rayleigh-Ritz step for the first (or last) N eigenvectors.
+function rayleigh_ritz(X, AX, N, largest)
+    XAX = X' * AX
+    @assert all(!isnan, XAX)
+    F = eigen(Hermitian(XAX))
+    if largest
+        start = size(XAX, 1) - N + 1
+        F.vectors[:,start:end], F.values[start:end]
+    else
+        F.vectors[:,1:N], F.values[1:N]
+    end
+end
+
+# B-orthogonalize X (in place) using only one B apply.
+# This uses an unstable method which is only OK if X is already
+# orthogonal (not B-orthogonal) and B is relatively well-conditioned
+# (which implies that X'BX is relatively well-conditioned, and
+# therefore that it is safe to Cholesky-factorize it and reuse the B apply).
+function B_ortho!(X, BX)
+    O = Hermitian(X'*BX)
+    U = cholesky(O).U
+    @assert all(!isnan, U)
+    rdiv!(X, U)
+    rdiv!(BX, U)
+end
+
+normest(M) = maximum(abs.(diag(M))) + norm(M - Diagonal(diag(M)))
+# Orthogonalizes X to tol.
+# Returns the new X, the number of Cholesky factorizations performed, and the
+# growth factor by which small perturbations of X may have been magnified.
+function ortho!(X::AbstractArray{T}; tol=2eps(real(T))) where {T}
+    local R
+
+    # # Uncomment for "gold standard"
+    # U,S,V = svd(X)
+    # return U*V', 1, 1
+
+    growth_factor = 1
+
+    success = false
+    nchol = 0
+    while true
+        O = Hermitian(X'X)
+        try
+            R = cholesky(O).U
+            nchol += 1
+            success = true
+        catch err
+            @assert isa(err, PosDefException)
+            vprintln("fail")
+            # see https://arxiv.org/pdf/1809.11085.pdf for a nice analysis
+            # We are not being very clever here, but this should happen
+            # very rarely, so it should be OK
+            α = 100
+            nbad = 0
+            while true
+                O += α*eps(real(T))*norm(X)^2*I
+                α *= 10
+                try
+                    R = cholesky(O).U
+                    nchol += 1
+                    break
+                catch err
+                    @assert isa(err, PosDefException)
+                end
+                nbad += 1
+                if nbad > 10
+                    error("Cholesky shifting is failing badly, this should never happen")
+                end
+            end
+            success = false
+        end
+        invR = inv(R)
+        @assert all(!isnan, invR)
+        rmul!(X, invR)  # we do not use X/R because we use invR next
+
+        # We would like growth_factor *= opnorm(inv(R)) but it's too
+        # expensive, so we use an upper bound which is sharp enough to
+        # be close to 1 when R is close to I, which we need for the
+        # outer loop to terminate:
+        # ||invR|| = ||D + E|| ≤ ||D|| + ||E|| ≤ ||D|| + ||E||_F,
+        # where ||.|| is the 2-norm and ||.||_F the Frobenius norm

-struct LOBPCGIterator{Generalized, T, TA, TB, TL<:AbstractVector{T}, TR<:AbstractVector, TPerm<:AbstractVector{Int}, TV<:AbstractArray{T}, TBlocks<:Blocks{Generalized, T}, TO<:AbstractOrtho, TP, TC, TG, TM, TTrace}
-    A::TA
-    B::TB
-    ritz_values::TL
-    λperm::TPerm
-    λ::TL
-    V::TV
-    residuals::TR
-    largest::Bool
-    XBlocks::TBlocks
-    tempXBlocks::TBlocks
-    PBlocks::TBlocks
-    activePBlocks::TBlocks # to be used in view
-    
RBlocks::TBlocks - activeRBlocks::TBlocks # to be used in view - iteration::Base.RefValue{Int} - currentBlockSize::Base.RefValue{Int} - ortho!::TO - precond!::TP - constr!::TC - gramABlock::TG - gramBBlock::TG - gramA::TV - gramB::TV - activeMask::TM - trace::TTrace -end + norminvR = normest(invR) + growth_factor *= norminvR -""" - LOBPCGIterator(A, largest::Bool, X, P=nothing, C=nothing) -> iterator + # condR = 1/LAPACK.trcon!('I', 'U', 'N', Array(R)) + condR = normest(R)*norminvR # in practice this seems to be an OK estimate -# Arguments + vprintln("Ortho(X) success? $success ", eps(real(T))*condR^2, " < $tol") -- `A`: linear operator; -- `X`: initial guess of the Ritz vectors; to be overwritten by the eigenvectors; -- `largest`: `true` if largest eigenvalues are desired and false if smallest; -- `P`: preconditioner of residual vectors, must overload `ldiv!`; -- `C`: constraint to deflate the residual and solution vectors orthogonal - to a subspace; must overload `mul!`. -""" -LOBPCGIterator(A, largest::Bool, X, P=nothing, C=nothing) = LOBPCGIterator(A, nothing, largest, X, P, C) + # a good a posteriori error is that X'X - I is eps()*κ(R)^2; + # in practice this seems to be sometimes very overconservative + success && eps(real(T))*condR^2 < tol && break -""" - LOBPCGIterator(A, B, largest::Bool, X, P=nothing, C=nothing) -> iterator + nchol > 10 && error("Ortho(X) is failing badly, this should never happen") + end -# Arguments + # @assert norm(X'X - I) < tol -- `A`: linear operator; -- `B`: linear operator; -- `X`: initial guess of the Ritz vectors; to be overwritten by the eigenvectors; -- `largest`: `true` if largest eigenvalues are desired and false if smallest; -- `P`: preconditioner of residual vectors, must overload `ldiv!`; -- `C`: constraint to deflate the residual and solution vectors orthogonal - to a subspace; must overload `mul!`; -""" -function LOBPCGIterator(A, B, largest::Bool, X, P=nothing, C=nothing) - precond! = RPreconditioner(P, X) - constr! = Constraint(C, B, X) - return LOBPCGIterator(A, B, largest, X, precond!, constr!) -end -function LOBPCGIterator(A, B, largest::Bool, X, precond!::RPreconditioner, constr!::Constraint) - T = eltype(X) - nev = size(X, 2) - if B isa Nothing - XBlocks = Blocks(X, similar(X)) - tempXBlocks = Blocks(copy(X), similar(X)) - RBlocks = Blocks(similar(X), similar(X)) - activeRBlocks = Blocks(similar(X), similar(X)) - PBlocks = Blocks(similar(X), similar(X)) - activePBlocks = Blocks(similar(X), similar(X)) - else - XBlocks = Blocks(X, similar(X), similar(X)) - tempXBlocks = Blocks(copy(X), similar(X), similar(X)) - RBlocks = Blocks(similar(X), similar(X), similar(X)) - activeRBlocks = Blocks(similar(X), similar(X), similar(X)) - PBlocks = Blocks(similar(X), similar(X), similar(X)) - activePBlocks = Blocks(similar(X), similar(X), similar(X)) - end - ritz_values = zeros(T, nev*3) - λ = zeros(T, nev) - λperm = zeros(Int, nev*3) - V = zeros(T, nev*3, nev*3) - residuals = fill(real(T)(NaN), nev) - iteration = Ref(1) - currentBlockSize = Ref(nev) - generalized = !(B isa Nothing) - ortho! 
= CholQR(zeros(T, nev, nev)) - - gramABlock = BlockGram(XBlocks) - gramBBlock = BlockGram(XBlocks) - - gramA = zeros(T, 3*nev, 3*nev) - gramB = zeros(T, 3*nev, 3*nev) - - activeMask = ones(Bool, nev) - trace = LOBPCGTrace{Vector{real(T)},Vector{T}}() - - return LOBPCGIterator{generalized, T, typeof(A), typeof(B), typeof(λ), typeof(residuals), typeof(λperm), typeof(V), typeof(XBlocks), typeof(ortho!), typeof(precond!), typeof(constr!), typeof(gramABlock), typeof(activeMask), typeof(trace)}(A, B, ritz_values, λperm, λ, V, residuals, largest, XBlocks, tempXBlocks, PBlocks, activePBlocks, RBlocks, activeRBlocks, iteration, currentBlockSize, ortho!, precond!, constr!, gramABlock, gramBBlock, gramA, gramB, activeMask, trace) -end -function LOBPCGIterator(A, largest::Bool, X, nev::Int, P=nothing, C=nothing) - LOBPCGIterator(A, nothing, largest, X, nev, P, C) -end -function LOBPCGIterator(A, B, largest::Bool, X, nev::Int, P=nothing, C=nothing) - T = eltype(X) - n = size(X, 1) - sizeX = size(X, 2) - if C isa Nothing - sizeC = 0 - new_C = typeof(X)(undef, n, (nev÷sizeX)*sizeX) - else - sizeC = size(C,2) - new_C = typeof(C)(undef, n, sizeC+(nev÷sizeX)*sizeX) - new_C[:,1:sizeC] .= C - end - if B isa Nothing - new_BC = new_C - else - new_BC = similar(new_C) - end - Y = @view new_C[:, 1:sizeC] - BY = @view new_BC[:, 1:sizeC] - if !(B isa Nothing) - mul!(BY, B, Y) - end - constr! = Constraint(Y, BY, X, NotBWrapper()) - precond! = RPreconditioner(P, X) - return LOBPCGIterator(A, B, largest, X, precond!, constr!) + X, nchol, growth_factor end -function ortho_AB_mul_X!(blocks::Blocks, ortho!, A, B, bs=-1) - # Finds BX - bs == -1 ? B_mul_X!(blocks, B) : B_mul_X!(blocks, B, bs) - # Orthonormalizes X and updates BX - bs == -1 ? ortho!(blocks, update_BX=true) : ortho!(blocks, bs, update_BX=true) - # Updates AX - bs == -1 ? A_mul_X!(blocks, A) : A_mul_X!(blocks, A, bs) - return -end -function residuals!(iterator) - sizeX = size(iterator.XBlocks.block, 2) - @views mul!(iterator.RBlocks.block, iterator.XBlocks.B_block, Diagonal(iterator.ritz_values[1:sizeX])) - @inbounds iterator.RBlocks.block .= iterator.XBlocks.A_block .- iterator.RBlocks.block - # Finds residual norms - @inbounds for j in 1:size(iterator.RBlocks.block, 2) - iterator.residuals[j] = 0 - for i in 1:size(iterator.RBlocks.block, 1) - x = iterator.RBlocks.block[i,j] - iterator.residuals[j] += real(x*conj(x)) +# Randomize the columns of X if the norm is below tol +function drop!(X::AbstractArray{T}, tol=2eps(real(T))) where {T} + dropped = Int[] + for i=1:size(X,2) + n = norm(@views X[:,i]) + if n <= tol + X[:,i] = randn(T, size(X,1)) + push!(dropped, i) end - iterator.residuals[j] = sqrt(iterator.residuals[j]) end - return -end - -function update_mask!(iterator, residualTolerance) - sizeX = size(iterator.XBlocks.block, 2) - # Update active vectors mask - @inbounds @views iterator.activeMask .= iterator.residuals[1:sizeX] .> residualTolerance - iterator.currentBlockSize[] = sum(iterator.activeMask) - return + dropped end -function update_active!(mask, bs::Int, blockPairs...) - @inbounds @views for (activeblock, block) in blockPairs - activeblock[:, 1:bs] .= block[:, mask] +# Find X that is orthogonal, and B-orthogonal to Y, up to a tolerance tol. +function ortho!(X::AbstractArray{T}, Y, BY; tol=2eps(real(T))) where {T} + # normalize to try to cheaply improve conditioning + Threads.@threads for i=1:size(X,2) + n = norm(@views X[:,i]) + @views X[:,i] ./= n end - return -end -function precond_constr!(block, temp_block, bs, precond!, constr!) 
- @views precond!(block[:, 1:bs]) - # Constrain the active residual vectors to be B-orthogonal to Y - @views constr!(block[:, 1:bs], temp_block[:, 1:bs]) - return -end -function block_grams_1x1!(iterator) - # Finds gram matrix X'AX - XAX!(iterator.gramABlock, iterator.XBlocks) - return -end -function block_grams_2x2!(iterator, bs) - sizeX = size(iterator.XBlocks.block, 2) - #XAX!(iterator.gramABlock, iterator.XBlocks) - XAR!(iterator.gramABlock, iterator.XBlocks, iterator.activeRBlocks, bs) - RAR!(iterator.gramABlock, iterator.activeRBlocks, bs) - XBR!(iterator.gramBBlock, iterator.XBlocks, iterator.activeRBlocks, bs) - @views iterator.gramABlock(iterator.gramA, iterator.ritz_values[1:sizeX], sizeX, bs, 0) - iterator.gramBBlock(iterator.gramB, sizeX, bs, 0, true) - - return -end -function block_grams_3x3!(iterator, bs) - # Find R'AR, P'AP, X'AR, X'AP and R'AP - sizeX = size(iterator.XBlocks.block, 2) - #XAX!(iterator.gramABlock, iterator.XBlocks) - XAR!(iterator.gramABlock, iterator.XBlocks, iterator.activeRBlocks, bs) - XAP!(iterator.gramABlock, iterator.XBlocks, iterator.activePBlocks, bs) - RAR!(iterator.gramABlock, iterator.activeRBlocks, bs) - RAP!(iterator.gramABlock, iterator.activeRBlocks, iterator.activePBlocks, bs) - PAP!(iterator.gramABlock, iterator.activePBlocks, bs) - # Find X'BR, X'BP and P'BR - XBR!(iterator.gramBBlock, iterator.XBlocks, iterator.activeRBlocks, bs) - XBP!(iterator.gramBBlock, iterator.XBlocks, iterator.activePBlocks, bs) - RBP!(iterator.gramBBlock, iterator.activeRBlocks, iterator.activePBlocks, bs) - # Update the gram matrix [X R P]' A [X R P] - @views iterator.gramABlock(iterator.gramA, iterator.ritz_values[1:sizeX], sizeX, bs, bs) - # Update the gram matrix [X R P]' B [X R P] - iterator.gramBBlock(iterator.gramB, sizeX, bs, bs, true) - - return -end - -function sub_problem!(iterator, sizeX, bs1, bs2) - subdim = sizeX+bs1+bs2 - @views if bs1 == 0 - gramAview = iterator.gramABlock.XAX[1:subdim, 1:subdim] - # Source of type instability - realdiag!(gramAview) - eigf = eigen!(Hermitian(gramAview)) - else - gramAview = iterator.gramA[1:subdim, 1:subdim] - gramBview = iterator.gramB[1:subdim, 1:subdim] - # Source of type instability - realdiag!(gramAview) - realdiag!(gramBview) - eigf = eigen!(Hermitian(gramAview), Hermitian(gramBview)) - end - # Selects extremal eigenvalues and corresponding vectors - @views partialsortperm!(iterator.λperm[1:subdim], eigf.values, 1:subdim; rev=iterator.largest) - @inbounds @views iterator.ritz_values[1:sizeX] .= eigf.values[iterator.λperm[1:sizeX]] - @inbounds @views iterator.V[1:subdim, 1:sizeX] .= eigf.vectors[:, iterator.λperm[1:sizeX]] - return -end - -function update_X_P!(iterator::LOBPCGIterator{Generalized}, bs1, bs2) where Generalized - sizeX = size(iterator.XBlocks.block, 2) - @views begin - x_eigview = iterator.V[1:sizeX, 1:sizeX] - r_eigview = iterator.V[sizeX+1:sizeX+bs1, 1:sizeX] - p_eigview = iterator.V[sizeX+bs1+1:sizeX+bs1+bs2, 1:sizeX] - r_blockview = iterator.activeRBlocks.block[:, 1:bs1] - ra_blockview = iterator.activeRBlocks.A_block[:, 1:bs1] - p_blockview = iterator.activePBlocks.block[:, 1:bs2] - pa_blockview = iterator.activePBlocks.A_block[:, 1:bs2] - if Generalized - rb_blockview = iterator.activeRBlocks.B_block[:, 1:bs1] - pb_blockview = iterator.activePBlocks.B_block[:, 1:bs2] - end - end - if bs1 > 0 - mul!(iterator.PBlocks.block, r_blockview, r_eigview) - mul!(iterator.PBlocks.A_block, ra_blockview, r_eigview) - if Generalized - mul!(iterator.PBlocks.B_block, rb_blockview, r_eigview) - end - end - 
if bs2 > 0 - mul!(iterator.tempXBlocks.block, p_blockview, p_eigview) - mul!(iterator.tempXBlocks.A_block, pa_blockview, p_eigview) - if Generalized - mul!(iterator.tempXBlocks.B_block, pb_blockview, p_eigview) - end - @inbounds iterator.PBlocks.block .= iterator.PBlocks.block .+ iterator.tempXBlocks.block - @inbounds iterator.PBlocks.A_block .= iterator.PBlocks.A_block .+ iterator.tempXBlocks.A_block - if Generalized - @inbounds iterator.PBlocks.B_block .= iterator.PBlocks.B_block .+ iterator.tempXBlocks.B_block - end - end - block = iterator.XBlocks.block - tempblock = iterator.tempXBlocks.block - mul!(tempblock, block, x_eigview) - block = iterator.XBlocks.A_block - tempblock = iterator.tempXBlocks.A_block - mul!(tempblock, block, x_eigview) - if Generalized - block = iterator.XBlocks.B_block - tempblock = iterator.tempXBlocks.B_block - mul!(tempblock, block, x_eigview) - end - @inbounds begin - if bs1 > 0 - iterator.XBlocks.block .= iterator.tempXBlocks.block .+ iterator.PBlocks.block - iterator.XBlocks.A_block .= iterator.tempXBlocks.A_block .+ iterator.PBlocks.A_block - if Generalized - iterator.XBlocks.B_block .= iterator.tempXBlocks.B_block .+ iterator.PBlocks.B_block - end - else - iterator.XBlocks.block .= iterator.tempXBlocks.block - iterator.XBlocks.A_block .= iterator.tempXBlocks.A_block - if Generalized - iterator.XBlocks.B_block .= iterator.tempXBlocks.B_block - end + niter = 1 + ninners = zeros(Int,0) + while true + BYX = BY' * X + mul!(X, Y, BYX, -1, 1) # X -= Y*BY'X + # If the orthogonalization has produced results below 2eps, we drop them + # This is to be able to orthogonalize eg [1;0] against [e^iθ;0], + # as can happen in extreme cases in the ortho!(cP, cX) + dropped = drop!(X) + if dropped != [] + X[:, dropped] .-= Y * (BY' * X[:, dropped]) end - end - return -end -function (iterator::LOBPCGIterator{Generalized})(residualTolerance, log) where {Generalized} - sizeX = size(iterator.XBlocks.block, 2) - iteration = iterator.iteration[] - if iteration == 1 - ortho_AB_mul_X!(iterator.XBlocks, iterator.ortho!, iterator.A, iterator.B) - # Finds gram matrix X'AX - block_grams_1x1!(iterator) - sub_problem!(iterator, sizeX, 0, 0) - # Updates Ritz vectors X and updates AX and BX accordingly - update_X_P!(iterator, 0, 0) - residuals!(iterator) - update_mask!(iterator, residualTolerance) - elseif iteration == 2 - bs = iterator.currentBlockSize[] - # Update active R blocks - update_active!(iterator.activeMask, bs, (iterator.activeRBlocks.block, iterator.RBlocks.block)) - # Precondition and constrain the active residual vectors - precond_constr!(iterator.activeRBlocks.block, iterator.tempXBlocks.block, bs, iterator.precond!, iterator.constr!) 
- # Orthonormalizes R[:,1:bs] and finds AR[:,1:bs] and BR[:,1:bs] - ortho_AB_mul_X!(iterator.activeRBlocks, iterator.ortho!, iterator.A, iterator.B, bs) - # Find [X R] A [X R] and [X R]' B [X R] - block_grams_2x2!(iterator, bs) - # Solve the Rayleigh-Ritz sub-problem - sub_problem!(iterator, sizeX, bs, 0) - update_X_P!(iterator, bs, 0) - residuals!(iterator) - update_mask!(iterator, residualTolerance) - else - # Update active blocks - bs = iterator.currentBlockSize[] - # Update active R and P blocks - update_active!(iterator.activeMask, bs, - (iterator.activeRBlocks.block, iterator.RBlocks.block), - (iterator.activePBlocks.block, iterator.PBlocks.block), - (iterator.activePBlocks.A_block, iterator.PBlocks.A_block), - (iterator.activePBlocks.B_block, iterator.PBlocks.B_block)) - # Precondition and constrain the active residual vectors - precond_constr!(iterator.activeRBlocks.block, iterator.tempXBlocks.block, bs, iterator.precond!, iterator.constr!) - # Orthonormalizes R[:,1:bs] and finds AR[:,1:bs] and BR[:,1:bs] - ortho_AB_mul_X!(iterator.activeRBlocks, iterator.ortho!, iterator.A, iterator.B, bs) - # Orthonormalizes P and updates AP - iterator.ortho!(iterator.activePBlocks, bs, update_AX=true, update_BX=true) - # Update the gram matrix [X R P]' A [X R P] and [X R P]' B [X R P] - block_grams_3x3!(iterator, bs) - # Solve the Rayleigh-Ritz sub-problem - sub_problem!(iterator, sizeX, bs, bs) - # Updates Ritz vectors X and updates AX and BX accordingly - # And updates P, AP and BP - update_X_P!(iterator, bs, bs) - residuals!(iterator) - update_mask!(iterator, residualTolerance) - end - if log - return LOBPCGState(iteration, iterator.residuals[1:sizeX], iterator.ritz_values[1:sizeX]) - else - return LOBPCGState(iteration, nothing, nothing) - end + if norm(BYX) < tol && niter > 1 + push!(ninners, 0) + break + end + X, ninner, growth_factor = ortho!(X, tol=tol) + push!(ninners, ninner) + + # norm(BY'X) < tol && break should be the proper check, but + # usually unnecessarily costly. Instead, we predict the error + # according to the growth factor of the orthogonalization. + # Assuming BY'Y = 0 to machine precision, after the + # Y-orthogonalization, BY'X is O(eps), so BY'X will be bounded + # by O(eps * growth_factor). + + # If we're at a fixed point, growth_factor is 1 and if tol > + # eps(), the loop will terminate, even if BY'Y != 0 + growth_factor*eps(real(T)) < tol && break + + niter > 10 && error("Ortho(X,Y) is failing badly, this should never happen") + niter += 1 + end + vprintln("ortho choleskys: ", ninners) # get how many Choleskys are performed + + # @assert (norm(BY'X)) < tol + # @assert (norm(X'X-I)) < tol + + X +end + +# Return the results of the computations in a nicer struct. +function final_retval(X, AX, tolerance, resid_history, niter, maxiter, converged, + largest, n_matvec) + λ = real(diag(X' * AX)) + residuals = AX .- X*Diagonal(λ) + LOBPCGResults( + λ, + X, + tolerance, + [norm(residuals[:, i]) for i in 1:size(residuals, 2)], + resid_history[:, 1:niter+1], + niter, + maxiter, + converged, + largest, + n_matvec + ) end default_tolerance(::Type{T}) where {T<:Number} = eps(real(T))^(real(T)(3)/10) +default_tolerance(::Type{Float64}) = 1e-10 -""" -The Locally Optimal Block Preconditioned Conjugate Gradient Method (LOBPCG) - -Finds the `nev` extremal eigenvalues and their corresponding eigenvectors satisfying `AX = λBX`. - -`A` and `B` may be generic types but `Base.mul!(C, AorB, X)` must be defined for vectors and strided matrices `X` and `C`. 
`size(A, i::Int)` and `eltype(A)` must also be defined for `A`.
-
-    lobpcg(A, [B,] largest, nev; kwargs...) -> results
-
-# Arguments
-
-- `A`: linear operator;
-- `B`: linear operator;
-- `largest`: `true` if largest eigenvalues are desired and false if smallest;
-- `nev`: number of eigenvalues desired.
-
-## Keywords
-
-- `log::Bool`: default is `false`; if `true`, `results.trace` will store iterations
-    states; if `false` only `results.trace` will be empty;
-
-- `P`: preconditioner of residual vectors, must overload `ldiv!`;
-
-- `C`: constraint to deflate the residual and solution vectors orthogonal
-    to a subspace; must overload `mul!`;
-
-- `maxiter`: maximum number of iterations; default is 200;
-
-- `tol::Real`: tolerance to which residual vector norms must be under.
-
-# Output
-
-- `results`: a `LOBPCGResults` struct. `r.λ` and `r.X` store the eigenvalues and eigenvectors.
-"""
 function lobpcg(A, largest::Bool, nev::Int; kwargs...)
-    lobpcg(A, nothing, largest, nev; kwargs...)
+    lobpcg(A, I, largest, nev; kwargs...)
 end
+
 function lobpcg(A, B, largest::Bool, nev::Int; kwargs...)
-    lobpcg(A, B, largest, rand(eltype(A), size(A, 1), nev); not_zeros=true, kwargs...)
+    lobpcg(A, B, largest, rand(eltype(A), size(A, 1), nev); kwargs...)
 end

-"""
-    lobpcg(A, [B,] largest, X0; kwargs...) -> results
-
-# Arguments
-
-- `A`: linear operator;
-- `B`: linear operator;
-- `largest`: `true` if largest eigenvalues are desired and false if smallest;
-- `X0`: Initial guess, will not be modified. The number of columns is the number of eigenvectors desired.
-
-## Keywords
-
-- `not_zeros`: default is `false`. If `true`, `X0` will be assumed to not have any all-zeros column.
-
-- `log::Bool`: default is `false`; if `true`, `results.trace` will store iterations
-    states; if `false` only `results.trace` will be empty;
-
-- `P`: preconditioner of residual vectors, must overload `ldiv!`;
-
-- `C`: constraint to deflate the residual and solution vectors orthogonal
-    to a subspace; must overload `mul!`;
-
-- `maxiter`: maximum number of iterations; default is 200;
-
-- `tol::Real`: tolerance to which residual vector norms must be under.
-
-# Output
-
-- `results`: a `LOBPCGResults` struct. `r.λ` and `r.X` store the eigenvalues and eigenvectors.
-"""
 function lobpcg(A, largest::Bool, X0; kwargs...)
-    lobpcg(A, nothing, largest, X0; kwargs...)
-end
-function lobpcg(A, B, largest, X0;
-                not_zeros=false, log=false, P=nothing, maxiter=200,
-                C=nothing, tol::Real=default_tolerance(eltype(X0)))
-    X = copy(X0)
-    n = size(X, 1)
-    sizeX = size(X, 2)
-    sizeX > n && throw("X column dimension exceeds the row dimension")
-    3*sizeX > n && throw("The LOBPCG algorithms is not stable to use when the matrix size is less than 3 times the block size. Please use a dense solver instead.")
-
-    iterator = LOBPCGIterator(A, B, largest, X, P, C)
-
-    return lobpcg!(iterator, log=log, tol=tol, maxiter=maxiter, not_zeros=not_zeros)
+    lobpcg(A, I, largest, X0; kwargs...)
 end

+### The algorithm is Xn+1 = rayleigh_ritz(hcat(Xn, A*Xn, Xn - Xn-1))
+### We follow the strategy of Hetmaniuk and Lehoucq, and maintain a B-orthonormal basis Y = (X,R,P)
+### After each rayleigh_ritz step, the B-orthonormal X and P are deduced by an orthogonal rotation from Y
+### R is then recomputed, and orthonormalized explicitly wrt BX and BP
+### We reuse applications of A/B when it is safe to do so, i.e. only with orthogonal transformations
+function lobpcg(A, B, largest, X;
+                precon=I, maxiter=200, miniter=1, tol::Real=default_tolerance(eltype(X)),
+                ortho_tol=2eps(real(eltype(X))), n_conv_check=nothing, display_progress=false)
+    N, M = size(X)
+
+    # If N is too small, we will likely get in trouble
+    error_message(verb) = "The eigenproblem is too small, and the iterative " *
+                          "eigensolver $verb fail; increase the number of " *
+                          "degrees of freedom, or use a dense eigensolver."
+    N > 3M    || error(error_message("will"))
+    N >= 3M+5 || @warn error_message("might")
+
+    n_conv_check === nothing && (n_conv_check = M)
+    resid_history = zeros(real(eltype(X)), M, maxiter+1)
+
+    # B-orthogonalize X
+    X = ortho!(copy(X), tol=ortho_tol)[1]
+    if B != I
+        BX = similar(X)
+        BX = mul!(BX, B, X)
+        B_ortho!(X, BX)
+    end
+
+    n_matvec = M   # Count number of matrix-vector products
+    AX = similar(X)
+    AX = mul!(AX, A, X)
+    @assert all(!isnan, AX)
-"""
-    lobpcg!(iterator::LOBPCGIterator; kwargs...) -> results
-
-# Arguments
-
-- `iterator::LOBPCGIterator`: a struct having all the variables required
-    for the LOBPCG algorithm.
-
-## Keywords
-
-- `not_zeros`: default is `false`. If `true`, the initial Ritz vectors will be assumed to not have any all-zeros column.
-
-- `log::Bool`: default is `false`; if `true`, `results.trace` will store iterations
-    states; if `false` only `results.trace` will be empty;
-
-- `maxiter`: maximum number of iterations; default is 200;
-
-- `tol::Real`: tolerance to which residual vector norms must be under.
-
-# Output
-
-- `results`: a `LOBPCGResults` struct. 
`r.λ` and `r.X` store the eigenvalues and eigenvectors. - -""" -function lobpcg!(iterator::LOBPCGIterator; log=false, maxiter=200, not_zeros=false, - tol::Real=default_tolerance(eltype(iterator.XBlocks.block))) - X = iterator.XBlocks.block - iterator.constr!(iterator.XBlocks.block, iterator.tempXBlocks.block) - if !not_zeros - @views for j in 1:size(X,2) - if all(x -> x==0, X[:, j]) - @inbounds X[:,j] .= rand.() + lobpcg(A, I, largest, X0; kwargs...) +end + +### The algorithm is Xn+1 = rayleigh_ritz(hcat(Xn, A*Xn, Xn-Xn-1)) +### We follow the strategy of Hetmaniuk and Lehoucq, and maintain a B-orthonormal basis Y = (X,R,P) +### After each rayleigh_ritz step, the B-orthonormal X and P are deduced by an orthogonal rotation from Y +### R is then recomputed, and orthonormalized explicitly wrt BX and BP +### We reuse applications of A/B when it is safe to do so, ie only with orthogonal transformations +function lobpcg(A, B, largest, X; + precon=I, maxiter=200, miniter=1, tol::Real=default_tolerance(eltype(X)), + ortho_tol=2eps(real(eltype(X))), n_conv_check=nothing, display_progress=false) + N, M = size(X) + + # If N is too small, we will likely get in trouble + error_message(verb) = "The eigenproblem is too small, and the iterative " * + "eigensolver $verb fail; increase the number of " * + "degrees of freedom, or use a dense eigensolver." + N > 3M || error(error_message("will")) + N >= 3M+5 || @warn error_message("might") + + n_conv_check === nothing && (n_conv_check = M) + resid_history = zeros(real(eltype(X)), M, maxiter+1) + + # B-orthogonalize X + X = ortho!(copy(X), tol=ortho_tol)[1] + if B != I + BX = similar(X) + BX = mul!(BX, B, X) + B_ortho!(X, BX) + end + + n_matvec = M # Count number of matrix-vector products + AX = similar(X) + AX = mul!(AX, A, X) + @assert all(!isnan, AX) + # full_X/AX/BX will always store the full (including locked) X. + # X/AX/BX only point to the active part + P = zero(X) + AP = zero(X) + R = zero(X) + AR = zero(X) + if B != I + BR = zero(X) + # BX was already computed + BP = zero(X) + else + # The B arrays are just pointers to the same data + BR = R + BX = X + BP = P + end + nlocked = 0 + niter = 0 # the first iteration is fake + λs = @views [(X[:, n]'*AX[:, n]) / (X[:, n]'BX[:, n]) for n=1:M] + λs = oftype(X[:, 1], λs) # GPU computation only: offload to GPU + new_X = X + new_AX = AX + new_BX = BX + full_X = X + full_AX = AX + full_BX = BX + + while true + if niter > 0 # first iteration is just to compute the residuals (no X update) + ### Perform the Rayleigh-Ritz + mul!(AR, A, R) + n_matvec += size(R, 2) + + # Form Rayleigh-Ritz subspace + if niter > 1 + Y = LazyHcat(X, R, P) + AY = LazyHcat(AX, AR, AP) + BY = LazyHcat(BX, BR, BP) # data shared with (X, R, P) in non-general case + else + Y = LazyHcat(X, R) + AY = LazyHcat(AX, AR) + BY = LazyHcat(BX, BR) # data shared with (X, R) in non-general case end + cX, λs = rayleigh_ritz(Y, AY, M-nlocked, largest) + + # Update X. By contrast to some other implementations, we + # wait on updating P because we have to know which vectors + # to lock (and therefore the residuals) before computing P + # only for the unlocked vectors. This results in better convergence. + new_X = Y * cX + new_AX = AY * cX # no accuracy loss, since cX orthogonal + new_BX = (B == I) ? 
new_X : BY * cX
+        end

+        ### Compute new residuals
+        new_R = new_AX .- new_BX .* λs'
+        # When to precondition is actually a good question. Doing it here
+        # seems sensible, but it could plausibly be done before or after.
+        @views for i=1:size(X,2)
+            resid_history[i + nlocked, niter+1] = norm(new_R[:, i])
+        end
+        vprintln(niter, " ", resid_history[:, niter+1])
+        if precon !== I
+            ldiv!(precon, new_R)
+        end

+        ### Compute number of locked vectors
+        prev_nlocked = nlocked
+        if niter ≥ miniter  # No locking if below miniter
+            for i=nlocked+1:M
+                if resid_history[i, niter+1] < tol
+                    nlocked += 1
+                    vprintln("locked $nlocked")
+                else
+                    # We lock in order, assuming that the lowest
+                    # eigenvectors converge first; might be tricky otherwise
+                    break
+                end
+            end
+        end

+        if display_progress
+            println("Iter $niter, converged $(nlocked)/$(n_conv_check), resid ",
+                    norm(resid_history[1:n_conv_check, niter+1]))
+        end

+        if nlocked >= n_conv_check  # Converged!
+            X .= new_X  # Update the part of X which is still active
+            AX .= new_AX
+            return final_retval(full_X, full_AX, tol, resid_history, niter, maxiter, true,
+                                largest, n_matvec)
+        end
+        newly_locked = nlocked - prev_nlocked
+        active = newly_locked+1:size(X,2)  # newly active vectors

+        if niter > 0
+            ### compute P = Y*cP only for the newly active vectors
+            Xn_indices = newly_locked+1:M-prev_nlocked
+            # TODO understand this for a potential save of an
+            # orthogonalization, see Hetmaniuk & Lehoucq, and Duersch et al.
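+            # (cP below holds the Y-basis coefficients of the new P block for
+            # the newly active vectors; the commented-out lines sketch an
+            # alternative construction that might avoid the ortho! call below.)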
+            # cP = copy(cX)
+            # cP[Xn_indices,:] .= 0
+
+            lenXn = length(Xn_indices)
+            e = zeros_like(X, size(cX, 1), M - prev_nlocked)
+            lower_diag = one(similar(X, lenXn, lenXn))
+            # e is zero everywhere except for ones on one of its lower diagonals
+            e[Xn_indices[1]:last(Xn_indices), 1:lenXn] = lower_diag
+
+            cP = cX .- e
+            cP = cP[:, Xn_indices]
+            # orthogonalize against all Xn (including newly locked)
+            ortho!(cP, cX, cX, tol=ortho_tol)
+
+            # Get new P
+            new_P  = Y * cP
+            new_AP = AY * cP
+            if B != I
+                new_BP = BY * cP
+            else
+                new_BP = new_P
+            end
+        end
+
+        # Update all X, even newly locked
+        X  .= new_X
+        AX .= new_AX
+        if B != I
+            BX .= new_BX
+        end
+
+        # Quick sanity check
+        for i = 1:size(X, 2)
+            @views if abs(BX[:, i]'X[:, i] - 1) >= sqrt(eps(real(eltype(X))))
+                error("lobpcg is badly failing to keep the vectors normalized; this should never happen")
+            end
+        end
+
+        # Restrict all views to active
+        @views begin
+            X = X[:, active]
+            AX = AX[:, active]
+            BX = BX[:, active]
+            R = new_R[:, active]
+            AR = AR[:, active]
+            BR = BR[:, active]
+            P = P[:, active]
+            AP = AP[:, active]
+            BP = BP[:, active]
+            λs = λs[active]
+        end
+
+        # Update newly active P
+        if niter > 0
+            P .= new_P
+            AP .= new_AP
+            if B != I
+                BP .= new_BP
+            end
+        end
+
+        # Orthogonalize R wrt all X, newly active P
+        if niter > 0
+            Z  = LazyHcat(full_X, P)
+            BZ = LazyHcat(full_BX, BP)  # data shared with (full_X, P) in non-general case
+        else
+            Z  = full_X
+            BZ = full_BX
+        end
+        ortho!(R, Z, BZ; tol=ortho_tol)
+        if B != I
+            mul!(BR, B, R)
+            B_ortho!(R, BR)
+        end
+
+        niter < maxiter || break
+        niter = niter + 1
+    end
+
+    final_retval(full_X, full_AX, tol, resid_history, maxiter, maxiter, false,
+                 largest, n_matvec)
+end
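
Example usage (illustrative only, not part of the patch):

    using LinearAlgebra, SparseArrays

    n = 1000
    # SPD 1-D Laplacian test matrix
    A = spdiagm(0 => 2ones(n), -1 => -ones(n-1), 1 => -ones(n-1))

    # five smallest eigenpairs of A*x = λ*x
    r = lobpcg(A, false, 5; tol=1e-8)
    r.λ, r.X        # eigenvalues and eigenvectors
    r.converged     # whether all requested pairs locked
    r.mvps          # number of matrix-vector products

    # generalized problem A*x = λ*B*x with an explicit initial guess
    B  = Diagonal(1 .+ rand(n))
    X0 = randn(n, 5)
    r2 = lobpcg(A, B, false, X0; tol=1e-8, display_progress=true)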