# L-22 MCS 572 Wed 16 Oct 2024 : gpusum32metal.jl
# Summing 32 numbers with 32 threads,
# using loop unrolling, with zero padding.

using Metal

"""
    gpusum32!(a)

Sum the first 32 numbers of the device array `a` in place,
using one thread per number. On return `a[1]` contains the sum.

The reduction is unrolled into five steps with strides 1, 2, 4, 8, 16.
Thread `i` reads up to `a[i+16]`, so `a` must hold at least 48 entries
and the entries after the 32nd must be zero (zero padding).

The kernel must be launched with 32 threads in a single threadgroup:
the barriers below only synchronize threads of the same threadgroup.
The other entries of `a` are overwritten with partial sums.
"""
function gpusum32!(a)
    i = thread_position_in_grid_1d()

    # In every step, thread i reads a[i+s] while thread i+s updates that
    # same entry. To avoid relying on lockstep execution, each step first
    # loads the partner value into a local, waits until all threads have
    # loaded, then updates a[i], and waits again before the next stride.

    t = a[i+1]
    threadgroup_barrier(Metal.MemoryFlagDevice)
    a[i] += t
    threadgroup_barrier(Metal.MemoryFlagDevice)

    t = a[i+2]
    threadgroup_barrier(Metal.MemoryFlagDevice)
    a[i] += t
    threadgroup_barrier(Metal.MemoryFlagDevice)

    t = a[i+4]
    threadgroup_barrier(Metal.MemoryFlagDevice)
    a[i] += t
    threadgroup_barrier(Metal.MemoryFlagDevice)

    t = a[i+8]
    threadgroup_barrier(Metal.MemoryFlagDevice)
    a[i] += t
    threadgroup_barrier(Metal.MemoryFlagDevice)

    # Last step: no later step reads the updated entries,
    # so no trailing barrier is needed.
    t = a[i+16]
    threadgroup_barrier(Metal.MemoryFlagDevice)
    a[i] += t

    return nothing
end

# the 32 numbers 1.0, 2.0, ..., 32.0 on the host
a_h = Float32.(1:32)
# padding with zeros, so reads beyond index 32 contribute nothing
z_h = zeros(Float32, 32)
x_h = vcat(a_h, z_h)
println("the numbers to sum : ", x_h)

# copy the padded vector to the device
x_d = MtlArray(x_h)

# run with 32 threads
@metal threads=32 gpusum32!(x_d)

# copy back to the host; the first entry holds the sum
println("the summed numbers : ", Array(x_d))