# L-20 MCS 572 Fri 11 Oct 2024 : dotproj.jl

# Illustration using shared memory with CUDA.jl,
# copied from the documentation.

using CUDA

"""
    function dot(a,b,c, N, threadsPerBlock, blocksPerGrid)

computes the dot product of two vectors a and b of length N
and places the result in c, using shared memory.
"""
function dot(a,b,c, N, threadsPerBlock, blocksPerGrid)

    # Set up shared memory cache for this current block.
    cache = @cuDynamicSharedMem(Int64, threadsPerBlock)

    # Initialise some variables.
    tid = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
    totalThreads = blockDim().x * gridDim().x
    cacheIndex = threadIdx().x - 1
    temp = 0

    # Iterate over vector to do dot product in parallel way
    while tid < N
        temp += a[tid + 1] * b[tid + 1]
        tid += totalThreads
    end

    # set cache values
    cache[cacheIndex + 1] = temp

    # synchronise threads
    sync_threads()

    # In the step below, we add up all of the values stored in the cache
    i::Int = blockDim().x ÷ 2
    while i!=0
        if cacheIndex < i
            cache[cacheIndex + 1] += cache[cacheIndex + i + 1]
        end
        sync_threads()
        i = i ÷ 2
    end

    # cache[1] now contains the sum of vector dot product calculations done in
    # this block, so we write it to c
    if cacheIndex == 0
        c[blockIdx().x] = cache[1]
    end

    return nothing
end

function sum_squares(x)
    return (x * (x + 1) * (2 * x + 1) / 6)
end

function main()

    # Initialise variables
    N::Int64 = 33 * 1024
    threadsPerBlock::Int64 = 256
    blocksPerGrid::Int64 = min(32, (N + threadsPerBlock - 1) / threadsPerBlock)

    # input arrays on the host
    a_h = [i for i=1:N]
    b_h = [2*i for i=1:N]

    # Create a,b and c
    a = CuArray(a_h)
    b = CuArray(b_h)
    c = CUDA.CuArray(fill(0, blocksPerGrid))

    # Execute the kernel. Note the shmem argument - this is necessary to allocate
    # space for the cache we allocate on the gpu with @cuDynamicSharedMem
    @cuda blocks = blocksPerGrid threads = threadsPerBlock shmem =
    (threadsPerBlock * sizeof(Int64)) dot(a,b,c, N, threadsPerBlock, blocksPerGrid)

    # Copy c back from the gpu (device) to the host
    c = Array(c)

    local result = 0

    # Sum the values in c
    for i in 1:blocksPerGrid
        result += c[i]
    end

    # Check whether output is correct
    println("Does GPU value ", result, " = ", 2 * sum_squares(N - 1))
end

main()