Код: Выделить всё
__global__ void kernel_a(int* result, int* array, int* parents, int parents_length, int result_length, int* partial) {
extern __shared__ int shared[];
int idx = threadIdx.x;
int thread_id = blockIdx.x * blockDim.x + idx;
if (thread_id < parents_length) {
shared[idx] = array[thread_id];
} else {
shared[idx] = 0;
}
__syncthreads();
for (int stride = 1; stride < blockDim.x; stride *= 2) {
int val = 0;
if (idx >= stride && thread_id < parents_length && parents[thread_id] == parents[thread_id - stride]) {
val = shared[idx - stride];
}
shared[idx] += val;
__syncthreads();
}
if (thread_id < parents_length) {
int parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == parents_length - 1 || parents[thread_id] != parents[thread_id + 1]) {
partial[blockIdx.x * result_length + parent] = shared[idx];
}
}
}
// for handling block boundary carry over
__global__ void kernel_b(int* result, int* array, int* parents, int parents_length, int result_length, int* partial) {
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_id < result_length) {
int sum = 0;
int blocks = (parents_length + blockDim.x - 1) / blockDim.x;
for (int i = 0; i < blocks; ++i) {
sum += partial[i * result_length + thread_id];
}
result[thread_id] = sum;
}
}
Подробнее здесь: https://stackoverflow.com/questions/786 ... um-in-cuda