Additions:
barrier(CLK_GLOBAL_MEM_FENCE); // sincronizzazione tra i work item nello stesso work group: provare a toglierlo per vedere se il risultato è ancora corretto
Deletions:
barrier(CLK_GLOBAL_MEM_FENCE); // sincronizzazione tra i work item nello stesso work group
Additions:
int l = get_local_id(0); // ottenibile anche come l = g % s
Deletions:
int l = get_local_id(0); // ottenibile anche come l = g % s
Additions:
int l = get_local_id(0); // ottenibile anche come l = g % s
Deletions:
int l = get_local_id(0); // ottenibile anche come l = g % s
Additions:
int l = get_local_id(0); // ottenibile anche come l = g % s
barrier(CLK_GLOBAL_MEM_FENCE); // sincronizzazione tra i work item nello stesso work group
Deletions:
int l = get_local_id(0);
barrier(CLK_GLOBAL_MEM_FENCE);
Additions:
Esempio reverse a gruppi visto in classe.
%%(c;reverseloc.c)
#include <stdio.h>
#include <stdlib.h>
#include "clut.h"
// Name of the __kernel function to look up in the compiled program.
#define KERNEL_NAME "reverseloc"
// Work-group size; the vector is reversed in chunks of this many elements.
#define LOCAL_SIZE 64
// Total number of elements. Parenthesized so that the macro behaves as a
// single value inside larger expressions (e.g. x % N, x / N).
#define N (LOCAL_SIZE*1000000)
// Set to 1 to print the resulting vector, one group per paragraph.
#define DEBUG 0
/*
 * Host driver for the "reverseloc" kernel.
 *
 * Fills a vector with 1..n, copies it to the device, runs the kernel with
 * work-groups of LOCAL_SIZE items, reads the vector back and checks that
 * every chunk of LOCAL_SIZE consecutive elements has been reversed in place.
 * Prints the kernel duration and "[OK]" or "[Error]".
 *
 * The clut_* helpers come from clut.h; NOTE(review): clut_check_err and
 * clut_panic are presumed to abort the program on failure -- confirm.
 */
int main(void) {
    int err;            // error code
    cl_kernel kernel;   // execution kernel
    cl_mem dv;          // vector on device
    cl_event evt;       // performance measurement event
    clut_device dev;
    int i, n = N;

    clut_open_device(&dev, "reverseloc.cl");

    int *v = malloc((size_t)n * sizeof *v);
    if (!v) clut_panic("failed to allocate memory on host");

    // v = 1, 2, ..., n so that each value encodes its original position
    for (i = 1; i <= n; ++i) v[i-1] = i;

    // create the compute kernel
    kernel = clCreateKernel(dev.program, KERNEL_NAME, &err);
    clut_check_err(err, "failed to create kernel");

    // allocate the vector on the device as a copy of the host vector
    dv = clCreateBuffer(dev.context,
                        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                        (size_t)n * sizeof *v, v, &err);
    if (!dv) clut_panic("failed to allocate input matrix on device memory");
    clut_check_err(err, "failed to allocate input matrix on device memory");

    // set the arguments to our compute kernel
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &dv);
    err |= clSetKernelArg(kernel, 1, sizeof(int), &n);
    clut_check_err(err, "failed to set kernel arguments");

    // execute the kernel over the 1D range of the input vector
    size_t local_dim[]  = { LOCAL_SIZE };
    size_t global_dim[] = { n };
    // round up to a multiple of the work-group size
    global_dim[0] = ((global_dim[0]+LOCAL_SIZE-1)/LOCAL_SIZE)*LOCAL_SIZE;
    err = clEnqueueNDRangeKernel(dev.queue, kernel, 1,
                                 NULL, global_dim, local_dim, 0, NULL, &evt);
    clut_check_err(err, "failed to execute kernel");

    // copy result from device to host (blocking read)
    err = clEnqueueReadBuffer(dev.queue, dv, CL_TRUE, 0,
                              (size_t)n * sizeof *v, v, 0, NULL, NULL);
    clut_check_err(err, "failed to read output result");

    // kernel execution time, as reported by clut for the kernel's event
    double t = clut_get_duration(evt);

    // print & check results
    printf("Duration: %f\n", t);
    for (i = 0; i < n; ++i) {
#if DEBUG
        printf("%d%s", v[i], (i+1)%LOCAL_SIZE == 0 ? "\n\n" : " ");
#endif
        // value i+1 started at index i; after the per-group reversal it
        // must sit at the mirrored index inside the same group
        if (v[i+LOCAL_SIZE-2*(i%LOCAL_SIZE)-1] != i+1) break;
    }
    printf("%s Sayonara.\n", i==n ? "[OK]" : "[Error]");

    // cleanup
    clReleaseEvent(evt);
    clReleaseMemObject(dv);
    clReleaseKernel(kernel);
    free(v);
    clut_close_device(&dev);
    return 0;
}
%%(c;reverseloc.cl)
/*
 * Reverses v in place, independently inside each work-group-sized chunk:
 * the work-item with local id l swaps roles with the one at the mirrored
 * position of the same group.
 *
 * v: vector in global memory, modified in place
 * n: number of valid elements in v
 *
 * Every work-item reaches the barrier, including those with g >= n.
 * OpenCL requires all work-items of a group to execute a barrier, so an
 * early return before it is undefined when a group straddles n.
 * A trailing group with fewer than s valid elements is reversed over its
 * valid elements only, so no write lands at or beyond index n.
 */
__kernel void reverseloc(__global int* v, int n) {
    int g = get_global_id(0);
    int l = get_local_id(0);
    int s = get_local_size(0);

    int base  = g - l;      // global index of this group's first element
    int count = n - base;   // valid elements handled by this group
    if (count > s) count = s;

    int in_range = g < n;
    int elem = 0;
    if (in_range) elem = v[g];

    // all reads of the group's chunk must complete before any write to it
    barrier(CLK_GLOBAL_MEM_FENCE);

    // for a full group this equals v[g+s-2*l-1]
    if (in_range) v[base + count - 1 - l] = elem;
}
Deletions:
Esempio reverse a gruppi.
%%(c;)