# L-22 MCS 572 Wed 16 Oct 2024 : gpusum32cuda.jl
# Summing 32 numbers with 32 threads,
# using loop unrolling, with a different order.

using CUDA

"""
    gpusum32!(a)

Sum the first 32 numbers of the device array `a` in place.
On return `a[1]` contains the sum; the other entries hold partial sums.

The kernel is meant to be launched with 32 threads in a single block.
Thread `i` reads up to `a[i+16]`, so `a` must hold at least 48 elements;
the caller pads the data for that reason.  The value of `a[1]` depends
only on `a[1:32]`.

Each unrolled step is split into a read phase and a write phase separated
by barriers, because in every step thread `i` reads `a[i+s]` while thread
`i+s` updates that same entry.  Without the barriers the result would rely
on all threads executing in lock-step, which is not guaranteed.
"""
function gpusum32!(a)
    i = threadIdx().x

    # stride 16
    t = a[i+16]        # read the partner's value before anyone overwrites it
    sync_threads()     # all reads of this step are done
    a[i] += t
    sync_threads()     # all writes of this step are visible

    # stride 8
    t = a[i+8]
    sync_threads()
    a[i] += t
    sync_threads()

    # stride 4
    t = a[i+4]
    sync_threads()
    a[i] += t
    sync_threads()

    # stride 2
    t = a[i+2]
    sync_threads()
    a[i] += t
    sync_threads()

    # stride 1
    t = a[i+1]
    sync_threads()
    a[i] += t

    return nothing
end

a_h = Float32.(1:32)
z_h = zeros(Float32, 32) # padding with zeros, keeps a[i+16] in bounds
x_h = vcat(a_h, z_h)
println("the numbers to sum : ", x_h)
x_d = CuArray(x_h)
# run with 32 threads
@cuda threads=32 gpusum32!(x_d)
println("the summed numbers : ", Array(x_d))