Giovedì 20 dicembre 2017
Esempio di reverse a gruppi visto in classe.
#include <stdio.h>
#include <stdlib.h>
#include "clut.h"
// Name of the kernel function looked up in the compiled OpenCL program.
#define KERNEL_NAME "reverseloc"
// Work-group size; also the length of each block that gets reversed.
#define LOCAL_SIZE 64
// Total number of elements: a whole number of LOCAL_SIZE-sized blocks.
// Parenthesized so that expressions such as `x % N` or `x / N` group correctly.
#define N (LOCAL_SIZE * 1000000)
// Set to 1 to print the resulting vector, one block per paragraph.
#define DEBUG 0
/*
 * Host driver for the "reverseloc" OpenCL kernel.
 *
 * Fills a host vector with 1..N, copies it into a device buffer, runs the
 * kernel with work-groups of LOCAL_SIZE items, reads the buffer back and
 * checks that every block of LOCAL_SIZE consecutive elements has been
 * reversed. Prints the duration reported by clut_get_duration() followed by
 * an "[OK]" or "[Error]" verdict.
 *
 * Returns EXIT_SUCCESS when the verification passes, EXIT_FAILURE otherwise.
 * Setup failures are reported through clut_panic()/clut_check_err()
 * (presumably these terminate the program -- confirm in clut.h).
 */
int main(void) {
    int err;            // OpenCL error code
    cl_kernel kernel;   // kernel object to execute
    cl_mem dv;          // vector on the device
    cl_event evt;       // event attached to the kernel launch, used for timing
    clut_device dev;
    int i, n = N;

    clut_open_device(&dev, "reverseloc.cl");

    // host vector, initialized to 1, 2, ..., n
    int *v = malloc((size_t)n * sizeof *v);
    if (!v) {
        clut_panic("failed to allocate memory on host");
    }
    for (i = 0; i < n; ++i) {
        v[i] = i + 1;
    }

    // create the compute kernel
    kernel = clCreateKernel(dev.program, KERNEL_NAME, &err);
    clut_check_err(err, "failed to create kernel");

    // allocate the vector on the device as a copy of the host vector
    dv = clCreateBuffer(dev.context,
                        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                        (size_t)n * sizeof *v, v, NULL);
    if (!dv) {
        clut_panic("failed to allocate input matrix on device memory");
    }

    // set the arguments of the compute kernel
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &dv);
    err |= clSetKernelArg(kernel, 1, sizeof(int), &n);
    clut_check_err(err, "failed to set kernel arguments");

    // execute the kernel over a 1D range; the global size is rounded up to
    // the next multiple of the work-group size
    size_t local_dim[]  = { LOCAL_SIZE };
    size_t global_dim[] = {
        (((size_t)n + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE
    };
    err = clEnqueueNDRangeKernel(dev.queue, kernel, 1, NULL,
                                 global_dim, local_dim, 0, NULL, &evt);
    clut_check_err(err, "failed to execute kernel");

    // copy the result from device to host (blocking read)
    err = clEnqueueReadBuffer(dev.queue, dv, CL_TRUE, 0,
                              (size_t)n * sizeof *v, v, 0, NULL, NULL);
    clut_check_err(err, "failed to read output result");

    // kernel execution time as reported by clut for the launch event
    double t = clut_get_duration(evt);

    // print & check results
    printf("Duration: %f\n", t);
    for (i = 0; i < n; ++i) {
#if DEBUG
        printf("%d%s", v[i], (i + 1) % LOCAL_SIZE == 0 ? "\n\n" : " ");
#endif
        // the value i+1, originally at index i, must now sit at the mirrored
        // position inside its LOCAL_SIZE-wide block
        if (v[i + LOCAL_SIZE - 2 * (i % LOCAL_SIZE) - 1] != i + 1) {
            break;
        }
    }
    int ok = (i == n);
    printf("%s Sayonara.\n", ok ? "[OK]" : "[Error]");

    // cleanup (the event was previously never released)
    clReleaseEvent(evt);
    clReleaseMemObject(dv);
    clReleaseKernel(kernel);
    free(v);
    clut_close_device(&dev);

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
// Reverses, in place, each block of get_local_size(0) consecutive elements
// of v: the element at local position l of a work-group moves to local
// position s-1-l of the same group.
//
//   v  vector in global memory, read and written
//   n  number of valid elements in v
//
// Every work-item reaches the barrier, including those with g >= n: the
// original early return placed before the barrier meant that, in a partial
// last work-group, only some work-items reached it. Writes whose destination
// would fall at or beyond n are skipped, so a partial last group never
// touches memory outside v. When n is a multiple of the work-group size the
// result is identical to the original kernel.
__kernel void reverseloc(__global int* v, int n) {
    int g = get_global_id(0);
    int s = get_local_size(0);
    int l = get_local_id(0);    // can also be obtained as l = g % s

    int dst = g + s - 2*l - 1;  // mirrored position inside the same group
    int in_range = g < n;

    int elem = 0;
    if (in_range) elem = v[g];

    // synchronization among the work-items of the same work-group: try
    // removing it to see whether the result is still correct
    barrier(CLK_GLOBAL_MEM_FENCE);

    if (in_range && dst < n) v[dst] = elem;
}