I have to implement the Floyd algorithm using OpenCL. It works fine, but only with n < 268. When n >= 268, I get an "Access violation reading location" error when calling clEnqueueWriteBuffer (the buffer_distances one, in the loop).
Here is my code:
graphe is an adjacency matrix, and distances is the distance matrix.
// Read the matrix dimension n from standard input.
// The return value of scanf is not checked, so n is used as-is even if
// the input could not be parsed.
int n;
printf("enter n value: ");
scanf("%d", &n);
printf("\n");
// Number of elements in an n x n matrix.
int n2 = n * n;
// Byte count later used for the OpenCL buffers and transfers.
// NOTE(review): this is computed with sizeof(int*), while the two host
// arrays below are allocated with sizeof(int); the sizes differ on any
// platform where a pointer is wider than an int.
int matSize = n2 * sizeof(int*);
int* graphe = malloc(sizeof(int) * n2);
int* distances = malloc(sizeof(int) * n2);
// mat[i,j] => mat[i*n + j]
// Only graphe is tested, and a failure is merely reported: execution
// continues with the NULL pointer. distances is not tested at all.
if (graphe == NULL)
printf("malloc failed\n");
// init_graphe and copy are defined elsewhere; presumably they fill the
// adjacency matrix and duplicate it into distances -- TODO confirm.
init_graphe(graphe, n);
copy(graphe, distances, n);
Initialization of OpenCL variables:
// Set up OpenCL: load the kernel source, pick the first platform, create a
// context over all of its devices, a command queue on the first device,
// the device buffers, and the compiled "floyd" kernel.
//
// NOTE: `status` is overwritten by every call below and never inspected,
// so a failure in any step goes unnoticed here.

// load_kernel is defined elsewhere; presumably it returns the text of the
// given file as a NUL-terminated string -- TODO confirm.
char* programSource = load_kernel("kernel.cl");
cl_int status;

// STEP 1: Discover and initialize the platforms
// First call only queries the count, second call fills the array.
cl_uint numPlatforms = 0;
cl_platform_id* platforms = NULL;
status = clGetPlatformIDs(0, NULL, &numPlatforms);
printf("Number of platforms = %d\n", numPlatforms);
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
// Scratch buffer reused for the platform name and every device name.
char Name[1000];
clGetPlatformInfo(platforms[0], CL_PLATFORM_NAME, sizeof(Name), Name, NULL);
printf("Name of platform : %s\n", Name);
fflush(stdout);

// STEP 2: Discover and initialize the devices
// Same two-call pattern as for the platforms; only platforms[0] is used.
cl_uint numDevices = 0;
cl_device_id* devices = NULL;
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
printf("Number of devices = %d\n", (int)numDevices);
devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
for (int i = 0; i < numDevices; i++) {
    clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(Name), Name, NULL);
    printf("Name of device %d: %s\n\n", i, Name);
}

// STEP 3: Create a context
// The context spans every device found on the platform.
fflush(stdout);
cl_context context = NULL;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);

// STEP 4: Create a command queue
// Commands are submitted to devices[0] only.
fflush(stdout);
cl_command_queue cmdQueue;
cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);

// STEP 5: Create device buffers
// Two matSize-byte matrix buffers plus one int-sized buffer each for n and k.
fflush(stdout);
cl_mem buffer_graphe;
cl_mem buffer_n;
cl_mem buffer_distances;
cl_mem buffer_k;
buffer_graphe = clCreateBuffer(context, CL_MEM_READ_WRITE, matSize, NULL, &status);
buffer_n = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &status);
buffer_distances = clCreateBuffer(context, CL_MEM_READ_WRITE, matSize, NULL, &status);
buffer_k = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &status);
fflush(stdout);

// STEP 6: Create and compile the program
// The source is passed as a single NUL-terminated string (lengths == NULL).
cl_program program = clCreateProgramWithSource(context, 1, (const char**)&programSource, NULL, &status);
printf("Compilation\n");
fflush(stdout);
status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);

// STEP 8: Create the kernel
cl_kernel kernel = NULL;
fflush(stdout);
kernel = clCreateKernel(program, "floyd", &status);
// One work-item per matrix cell (i, j).
size_t globalWorkSize[2] = { n, n };
// Only the first two entries are given explicitly; the third is
// zero-initialized.
size_t localWorkSize[3] = { 20,20 };
Execution of the kernel:
clock_t start = clock();
int k;
for (k = 0; k < n; k ) {
status = clEnqueueWriteBuffer(cmdQueue, buffer_graphe, CL_TRUE, 0, matSize, graphe, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buffer_n, CL_TRUE, 0, sizeof(int), &n, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
status = clEnqueueWriteBuffer(cmdQueue, buffer_k, CL_TRUE, 0, sizeof(int), &k, 0, NULL, NULL);
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buffer_graphe);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&buffer_n);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buffer_distances);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&buffer_k);
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
clFinish(cmdQueue);
status = clEnqueueReadBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
clFinish(cmdQueue);
}
and the kernel:
/*
 * One relaxation step of the Floyd algorithm for a fixed intermediate
 * vertex *k. Each work-item handles one cell (i, j) of the distance matrix.
 *
 * graphe    - not read or written by this kernel.
 * n         - pointer to the matrix dimension.
 * distances - flat, row-major (*n) x (*n) matrix, updated in place.
 * k         - pointer to the index of the intermediate vertex.
 *
 * There is no bounds check: the caller must launch exactly the index
 * range that fits inside the matrix.
 */
void kernel floyd(global int* graphe, global int* n, global int* distances, global int* k)
{
    int i = get_global_id(0);
    int j = get_global_id(1);

    /* Row-major flat indices: cell (r, c) lives at r * (*n) + c. */
    int ij = i * (*n) + j;
    int ik = i * (*n) + (*k);
    int kj = (*k) * (*n) + j;

    /* Keep the shorter of the current i->j distance and the path via k. */
    if (distances[ik] + distances[kj] < distances[ij]) {
        distances[ij] = distances[ik] + distances[kj];
    }
}
CodePudding user response:
You have:
int matSize = n2 * sizeof(int*);
…
int* distances = malloc(sizeof(int) * n2);
…
status = clEnqueueWriteBuffer(cmdQueue, buffer_distances, CL_TRUE, 0, matSize, distances, 0, NULL, NULL);
- Say n2 is 100.
- matSize will be 800 on a 64-bit system. (sizeof(int*) = 8)
- You allocate 400 bytes of memory for your distances array. (sizeof(int) = 4, typically)
- You then copy 800 bytes (matSize) from distances into your OpenCL buffer. This overflows the end of the array. Whoops.
The bug is of course the use of sizeof(int*): you've got an array of ints, not an array of pointers, so this should be sizeof(int), which is what you're correctly doing in the malloc call. (I can't quite fathom why you're not using matSize there.) Although what you should probably be using is cl_int, or one of the explicitly-sized types (int32_t in this case), because types in OpenCL kernels have very specific definitions which may or may not match those in host C code.
Additional Notes:
- I'm not 100% convinced your data dependencies are safe here. No work-item should be reading an array entry that another is writing in the same kernel enqueueing batch. It seems to me that ij (written) for one of the work-items will be equal to ik (read) for the others in the row? Similar deal with ij and kj.
- There's no need to read and re-write the distances buffer between iterations, if you're not modifying it on the host. Neither does graphe need re-writing every time if it's not changing.
- You can pass scalar arguments such as k and n without a buffer. status = clSetKernelArg(kernel, 1, sizeof(n), &n); works fine if you change your kernel signature's argument to int n. (No dereference needed in the kernel then.)
- A local work size of 20x20 is likely not optimal. If you're not using local memory or barriers, don't bother setting a local size at all.
- You should be able to remove the clFinish calls, and you can change the buffer writes to be non-blocking once you've moved them outside the loop. This might give you an additional small speedup.
CodePudding user response:
I think I found the solution: I replaced malloc with calloc, and now it works.
