# L-20 MCS 572 Fri 11 Oct 2024 : dotproj.jl # Illustration using shared memory with CUDA.jl, # copied from the documentation. using CUDA """ function dot(a,b,c, N, threadsPerBlock, blocksPerGrid) computes the dot product of two vectors a and b of length N and places the result in c, using shared memory. """ function dot(a,b,c, N, threadsPerBlock, blocksPerGrid) # Set up shared memory cache for this current block. cache = @cuDynamicSharedMem(Int64, threadsPerBlock) # Initialise some variables. tid = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x totalThreads = blockDim().x * gridDim().x cacheIndex = threadIdx().x - 1 temp = 0 # Iterate over vector to do dot product in parallel way while tid < N temp += a[tid + 1] * b[tid + 1] tid += totalThreads end # set cache values cache[cacheIndex + 1] = temp # synchronise threads sync_threads() # In the step below, we add up all of the values stored in the cache i::Int = blockDim().x ÷ 2 while i!=0 if cacheIndex < i cache[cacheIndex + 1] += cache[cacheIndex + i + 1] end sync_threads() i = i ÷ 2 end # cache[1] now contains the sum of vector dot product calculations done in # this block, so we write it to c if cacheIndex == 0 c[blockIdx().x] = cache[1] end return nothing end function sum_squares(x) return (x * (x + 1) * (2 * x + 1) / 6) end function main() # Initialise variables N::Int64 = 33 * 1024 threadsPerBlock::Int64 = 256 blocksPerGrid::Int64 = min(32, (N + threadsPerBlock - 1) / threadsPerBlock) # input arrays on the host a_h = [i for i=1:N] b_h = [2*i for i=1:N] # Create a,b and c a = CuArray(a_h) b = CuArray(b_h) c = CUDA.CuArray(fill(0, blocksPerGrid)) # Execute the kernel. Note the shmem argument - this is necessary to allocate # space for the cache we allocate on the gpu with @cuDynamicSharedMem @cuda blocks = blocksPerGrid threads = threadsPerBlock shmem = (threadsPerBlock * sizeof(Int64)) dot(a,b,c, N, threadsPerBlock, blocksPerGrid) # Copy c back from the gpu (device) to the host c = Array(c) local result = 0 # Sum the values in c for i in 1:blocksPerGrid result += c[i] end # Check whether output is correct println("Does GPU value ", result, " = ", 2 * sum_squares(N - 1)) end main()