I am trying to learn PyOpenCl.
I am following various tutorials/examples that I found online, and I have been trying to put together a simple matrix multiplication.
I fail to see why I cannot get the right results: it seems to me that the for cycle in my kernel is not being executed (the output C_flat is always zeros) or maybe I am managing in the wrong way some memory.
Can anyone give me any suggestions?
Thanks a lot!
Here is the code:
import numpy as np
import pyopencl as cl
import time
def create_input_memory(context, input_arrays):
return [(array, cl.Buffer(context, flags=cl.mem_flags.READ_ONLY, size=array.nbytes))
for array in input_arrays]
def create_output_memory(context, output_arrays):
return [(array, cl.Buffer(context, flags=cl.mem_flags.WRITE_ONLY, size=array.nbytes))
for array in output_arrays]
def matrix_multiply_gpu(A, B):
A_height, A_width = A.shape[0], A.shape[1]
B_height, B_width = B.shape[0], B.shape[1]
C = np.zeros((A_height, B_width))
A_flat = A.flatten()
B_flat = B.flatten()
C_flat = C.flatten()
print(C_flat)
kernel_source = """
kernel void mul(int Wa, int Ha, int Wb, int Hb,
global float *input_a,
global float *input_b,
global float *result){
/* ROW MAJOR notation (I imagine the "GPU matrix") --> no, just model*/
int row = get_global_id(0);
int col = get_global_id(1);
float sum = 0.0f;
for (int i = 0; i < Wa; i++){
sum += input_a[row * Wa + i] * input_b[i * Wb + col];
}
result[row * Wb + col] = sum;
}
"""
platforms = cl.get_platforms()
context = cl.Context(dev_type=cl.device_type.GPU,
properties=[(cl.context_properties.PLATFORM, platforms[0])])
gpu_program_source = cl.Program(context, kernel_source)
gpu_program = gpu_program_source.build()
input_tuples = create_input_memory(context,
(A_flat, B_flat))
output_tuples = create_output_memory(context, (C_flat,))
gpu_queue = cl.CommandQueue(context)
kernel_arguments = [buffer for (_,buffer) in input_tuples]
kernel_arguments += [buffer for (_,buffer) in output_tuples]
gpu_program.mul(gpu_queue, (1024,), (32,),
np.int32(A_height), np.int32(A_width), np.int32(B_height),
np.int32(B_width), *kernel_arguments)
for (array, buffer) in output_tuples:
cl.enqueue_copy(gpu_queue, src=buffer, dest=array)
#wait for everyone to finish
gpu_queue.finish()
return C_flat
if __name__=='__main__':
A, B = np.ones((100, 100)), np.ones((100, 100))
C = matrix_multiply_gpu(A, B)
print("
", C, "
")
question from:
https://stackoverflow.com/questions/65847754/pyopencl-simple-matrix-multiplication