# L-29 MCS 572 Fri 1 Nov 2024 : mtjacobi.jl

# A multithreaded version of the method of Jacobi.
# The dimension must be a multiple of the number of threads.
# Call as julia -t 4 mtjacobi 1000
# to run on a problem of dimension 1000 with 4 threads.

using LinearAlgebra
using Base.Threads
using SyncBarriers
using Printf

"""
    function jacobi(A::Matrix{Float64},
                    b::Vector{Float64}, x0::Vector{Float64},
                    N::Int, tol::Float64, verbose=true)

does at most N iterations of Jacobi matrix, to solve A*x = b,
starting at the vector x0.  Stops when the norm of the update
vector is less than the tolerance tol.
If verbose, then the norm of each update is written to screen.
"""
function jacobi(A::Matrix{Float64},
                b::Vector{Float64}, x0::Vector{Float64},
                N::Int, tol::Float64, verbose=true)
    nt = nthreads()
    size = convert(Int, length(b)/nt) # size of each strip
    nit = N
    x = deepcopy(x0)
    D = Diagonal(A)
    ndx = zeros(nt)
    barrier = reduce_barrier(+, Float64, nt)
    @threads for i=1:nt
        tdx = threadid()
        idxstart = 1 + (tdx-1)*size
        idxend = tdx*size
        mystrip = A[idxstart:idxend, :]
        mydiag = D[idxstart:idxend, idxstart:idxend]
        myrhs = b[idxstart:idxend]
        for k=1:N
            if tdx == 1
                nit = k
            end
            @inbounds dx = mydiag^(-1)*(myrhs - mystrip*x)
            @inbounds x[idxstart:idxend] = x[idxstart:idxend] + dx
            ndx[tdx] = norm(dx)
            normdx = reduce!(barrier[tdx], ndx[tdx])
            if verbose
                if tdx == 1
                    sdx = @sprintf("%.2e", normdx)
                    println(sdx)
                end
            end
            if normdx < tol
               break
            end
        end
    end
    return (nit, x)
end

"""
    function main()

tests Jacobi's method on a diagonally dominant matrix.
The dimension is taken from the command line arguments if provided.
The dimension must be a multiple of the number of threads.
"""
function main()
    nt = nthreads()
    if length(ARGS) > 0
        dim = parse(Int, ARGS[1])
    else
        dim = 10*nt
    end
    I = ones(dim, dim)
    A = I + dim*Diagonal(I)
    b = 2*dim*ones(dim)
    x0 = zeros(dim)
    N = dim*dim
    tol = 1.0e-4
    n, z = jacobi(A, b, x0, N, tol, false)
    println("number of iterations : ", n)
    println("the error : ", norm(z - ones(dim)))
end

main()