Cuda Copy Substructs - Цифровое Кемерово

Cuda Copy Substructs ⇐ C++

1 сообщение • Страница 1 из 1

Anonymous

Цитата

Сообщение Anonymous » 01 апр 2025, 07:39

У меня есть структура в CUDA, которая содержит другие структуры (подструктуры, которые, в свою очередь, тоже содержат подструктуры) и указатели на динамически выделенную память. Я хочу скопировать всю структуру, включая её подструктуры, с устройства на хост с помощью cudaMemcpy. Как сделать это правильно, избежав поверхностного копирования и проблем с памятью? Например:
// Copy the top-level Population struct from device to host. This is a plain
// byte copy: any pointer member inside it is copied as a raw pointer value.
gpuErrchk(cudaMemcpy(h_population, d_population, sizeof(Population), cudaMemcpyDeviceToHost));
// Host buffer that receives the array of Network structs.
// NOTE(review): the malloc result is not checked before it is used below.
Network* h_networks = (Network*)malloc(sizeof(Network) * population_size);
// Copy the Network array itself. Pointer members inside each Network are
// copied by value, so they still hold whatever addresses the device stored.
gpuErrchk(cudaMemcpy(h_networks, d_networks, sizeof(Network) * population_size,cudaMemcpyDeviceToHost));
// Re-point the host Population at the host copy of the Network array.
h_population->Networks = h_networks;

std::cout
Функция ядра:
/**
 * Builds the initial population: every network gets `input_num` input
 * neurons, `output_num` output neurons and one connection for every
 * (input, output) pair.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per network. Threads whose
 * global index is >= pop_num exit immediately, so the grid may be larger
 * than the population.
 *
 * Neuron layout inside net->Neurons:
 *   [0, input_num)                      input neurons  (type 0, bias 0)
 *   [input_num, input_num + output_num) output neurons (type 2, random bias)
 *
 * Connection (i, j) is stored at index j + output_num * i, with from = i
 * (input index) and to = j (output index).
 * NOTE(review): `to` is the index of the output neuron within the output
 * group, exactly as in the previous version; if consumers treat it as an
 * absolute index into Neurons they must add input_num - confirm with callers.
 *
 * Memory: the per-network arrays are allocated with the device-side
 * cudaMalloc, i.e. from the device heap. According to the CUDA programming
 * guide such pointers cannot be used with host-side runtime calls
 * (cudaMemcpy, cudaFree); reading them back needs a gather kernel that copies
 * into buffers allocated from the host. If an allocation fails, both array
 * pointers of that network are left as nullptr so the failure is detectable.
 *
 * @param pop        device pointer to the population; pop->Networks must
 *                   point to at least pop_num Network slots
 * @param pop_num    number of networks to initialize
 * @param input_num  number of input neurons per network
 * @param output_num number of output neurons per network
 */
__global__ void CreateBasePopulation(Population* pop, int pop_num, int input_num, int output_num) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= pop_num) return;

    Network* net = &pop->Networks[idx];
    net->num_neurons = input_num + output_num;
    net->num_connections = input_num * output_num;
    net->fitness = 0.0f;
    // The Network slots come from uninitialized memory; start from a known state.
    net->Neurons = nullptr;
    net->Connections = nullptr;

    // Seeded from the clock, so results differ from run to run.
    curandState state;
    curand_init(clock64(), idx, 0, &state);

    cudaError_t neurons_status = cudaMalloc(&(net->Neurons), sizeof(Neuron) * net->num_neurons);
    cudaError_t connections_status = cudaMalloc(&(net->Connections), sizeof(Connection) * net->num_connections);

    // A zero-sized request may legitimately yield nullptr; anything else must not.
    bool neurons_ok = (neurons_status == cudaSuccess) &&
                      (net->Neurons != nullptr || net->num_neurons == 0);
    bool connections_ok = (connections_status == cudaSuccess) &&
                          (net->Connections != nullptr || net->num_connections == 0);
    if (!neurons_ok || !connections_ok) {
        // Release whichever half succeeded and leave the network empty
        // instead of dereferencing a null pointer below.
        if (neurons_status == cudaSuccess && net->Neurons != nullptr) cudaFree(net->Neurons);
        if (connections_status == cudaSuccess && net->Connections != nullptr) cudaFree(net->Connections);
        net->Neurons = nullptr;
        net->Connections = nullptr;
        return;
    }

    // Loop-invariant scale of the output bias range.
    const float bias_scale = sqrtf((float)input_num);

    // Output neurons live after the inputs. Previously both groups started at
    // index 0, so the inputs overwrote the outputs and the tail of the array
    // was never initialized.
    for (int i = 0; i < output_num; ++i) {
        Neuron* out = &net->Neurons[input_num + i];
        out->type = 2;
        out->bias = ((2.0f * bias_scale * curand_uniform(&state)) - bias_scale) / output_num;
        out->output = 0.0f;
        out->input_sum = 0.0f;
    }

    for (int i = 0; i < input_num; ++i) {
        Neuron* in = &net->Neurons[i];
        in->type = 0;
        in->bias = 0.0f;
        in->output = 0.0f;
        in->input_sum = 0.0f;

        // Connect input i to every output with a random weight in (-1, 1].
        for (int j = 0; j < output_num; ++j) {
            int offset = j + (output_num * i);
            Connection* conn = &net->Connections[offset];
            conn->from = i;
            conn->to = j;
            conn->innovationid = offset;
            conn->enabled = true;
            conn->weight = (2.0f * curand_uniform(&state)) - 1.0f;
        }
    }
}
Выделение памяти и шаги cudaMemcpy на хосте:
// Wraps a CUDA runtime call and reports a failure together with the call
// site. The do/while(0) form makes the macro behave as a single statement, so
// `if (c) gpuErrchk(x); else ...` parses as written.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Reports a failed CUDA runtime call and optionally terminates the process.
 *
 * @param code  status returned by the CUDA call
 * @param file  source file of the call site
 * @param line  source line of the call site
 * @param abort when true (default), exit with `code` as the process status
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return;

    // Diagnostics go to stderr so they are unbuffered and not mixed into
    // regular program output.
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}

int main() {
int population_size = 1024;
int input_num = 390;
int output_num = 3;

size_t heap_size_needed = population_size * ((input_num + output_num) * sizeof(Neuron) +
input_num * output_num * sizeof(Connection));
size_t heap_size = heap_size_needed + heap_size_needed /5;
gpuErrchk(cudaDeviceSetLimit(cudaLimitMallocHeapSize, heap_size));

Population* h_population = (Population*)malloc(sizeof(Population));
if (!h_population) {
std::cerr generation_id = 1;

Population* d_population;
gpuErrchk(cudaMalloc(&d_population, sizeof(Population)));

Network* d_networks;
gpuErrchk(cudaMalloc(&d_networks, sizeof(Network) * population_size));

gpuErrchk(cudaMemcpy(&(h_population->Networks), &d_networks, sizeof(Network*), cudaMemcpyHostToHost));
gpuErrchk(cudaMemcpy(d_population, h_population, sizeof(Population), cudaMemcpyHostToDevice));

int threadsPerBlock = 512;
int blocks = (population_size + threadsPerBlock - 1) / threadsPerBlock;

CreateBasePopulation(d_population, population_size, input_num, output_num);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());

gpuErrchk(cudaMemcpy(h_population, d_population, sizeof(Population), cudaMemcpyDeviceToHost));
Network* h_networks = (Network*)malloc(sizeof(Network) * population_size);
gpuErrchk(cudaMemcpy(h_networks, d_networks, sizeof(Network) * population_size, cudaMemcpyDeviceToHost));
h_population->Networks = h_networks;

for (int i = 0; i < population_size; i++) {
//Connection* d_connections;
//gpuErrchk(cudaMemcpy(&d_connections, &(d_networks.Connections), sizeof(Connection*), cudaMemcpyDeviceToHost));

int num_connections = h_networks.num_connections;
Connection* d_connections = nullptr;
gpuErrchk(cudaMemcpy(&d_connections, &(d_networks[i].Connections), sizeof(Connection*), cudaMemcpyDeviceToHost));

if (d_connections != nullptr) {
int num_connections = h_networks[i].num_connections;
Connection* h_connections = (Connection*)malloc(sizeof(Connection) * num_connections);

gpuErrchk(cudaMemcpy(h_connections, d_connections, sizeof(Connection) * num_connections, cudaMemcpyDeviceToHost));

h_networks[i].Connections = h_connections;
} else {
std::cerr

Подробнее здесь: https://stackoverflow.com/questions/79537359/cuda-copy-substructs

1743482388

Anonymous

У меня есть структура в CUDA, которая содержит другие структуры (подструктуры, которые, в свою очередь, тоже содержат подструктуры) и указатели на динамически выделенную память. Я хочу скопировать всю структуру, включая её подструктуры, с устройства на хост с помощью cudaMemcpy. Как сделать это правильно, избежав поверхностного копирования и проблем с памятью? Например:
// Copy the top-level Population struct from device to host. This is a plain
// byte copy: any pointer member inside it is copied as a raw pointer value.
gpuErrchk(cudaMemcpy(h_population, d_population, sizeof(Population), cudaMemcpyDeviceToHost));
// Host buffer that receives the array of Network structs.
// NOTE(review): the malloc result is not checked before it is used below.
Network* h_networks = (Network*)malloc(sizeof(Network) * population_size);
// Copy the Network array itself. Pointer members inside each Network are
// copied by value, so they still hold whatever addresses the device stored.
gpuErrchk(cudaMemcpy(h_networks, d_networks, sizeof(Network) * population_size,cudaMemcpyDeviceToHost));
// Re-point the host Population at the host copy of the Network array.
h_population->Networks = h_networks;

std::cout 
Функция ядра:
/**
 * Builds the initial population: every network gets `input_num` input
 * neurons, `output_num` output neurons and one connection for every
 * (input, output) pair.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per network. Threads whose
 * global index is >= pop_num exit immediately, so the grid may be larger
 * than the population.
 *
 * Neuron layout inside net->Neurons:
 *   [0, input_num)                      input neurons  (type 0, bias 0)
 *   [input_num, input_num + output_num) output neurons (type 2, random bias)
 *
 * Connection (i, j) is stored at index j + output_num * i, with from = i
 * (input index) and to = j (output index).
 * NOTE(review): `to` is the index of the output neuron within the output
 * group, exactly as in the previous version; if consumers treat it as an
 * absolute index into Neurons they must add input_num - confirm with callers.
 *
 * Memory: the per-network arrays are allocated with the device-side
 * cudaMalloc, i.e. from the device heap. According to the CUDA programming
 * guide such pointers cannot be used with host-side runtime calls
 * (cudaMemcpy, cudaFree); reading them back needs a gather kernel that copies
 * into buffers allocated from the host. If an allocation fails, both array
 * pointers of that network are left as nullptr so the failure is detectable.
 *
 * @param pop        device pointer to the population; pop->Networks must
 *                   point to at least pop_num Network slots
 * @param pop_num    number of networks to initialize
 * @param input_num  number of input neurons per network
 * @param output_num number of output neurons per network
 */
__global__ void CreateBasePopulation(Population* pop, int pop_num, int input_num, int output_num) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= pop_num) return;

    Network* net = &pop->Networks[idx];
    net->num_neurons = input_num + output_num;
    net->num_connections = input_num * output_num;
    net->fitness = 0.0f;
    // The Network slots come from uninitialized memory; start from a known state.
    net->Neurons = nullptr;
    net->Connections = nullptr;

    // Seeded from the clock, so results differ from run to run.
    curandState state;
    curand_init(clock64(), idx, 0, &state);

    cudaError_t neurons_status = cudaMalloc(&(net->Neurons), sizeof(Neuron) * net->num_neurons);
    cudaError_t connections_status = cudaMalloc(&(net->Connections), sizeof(Connection) * net->num_connections);

    // A zero-sized request may legitimately yield nullptr; anything else must not.
    bool neurons_ok = (neurons_status == cudaSuccess) &&
                      (net->Neurons != nullptr || net->num_neurons == 0);
    bool connections_ok = (connections_status == cudaSuccess) &&
                          (net->Connections != nullptr || net->num_connections == 0);
    if (!neurons_ok || !connections_ok) {
        // Release whichever half succeeded and leave the network empty
        // instead of dereferencing a null pointer below.
        if (neurons_status == cudaSuccess && net->Neurons != nullptr) cudaFree(net->Neurons);
        if (connections_status == cudaSuccess && net->Connections != nullptr) cudaFree(net->Connections);
        net->Neurons = nullptr;
        net->Connections = nullptr;
        return;
    }

    // Loop-invariant scale of the output bias range.
    const float bias_scale = sqrtf((float)input_num);

    // Output neurons live after the inputs. Previously both groups started at
    // index 0, so the inputs overwrote the outputs and the tail of the array
    // was never initialized.
    for (int i = 0; i < output_num; ++i) {
        Neuron* out = &net->Neurons[input_num + i];
        out->type = 2;
        out->bias = ((2.0f * bias_scale * curand_uniform(&state)) - bias_scale) / output_num;
        out->output = 0.0f;
        out->input_sum = 0.0f;
    }

    for (int i = 0; i < input_num; ++i) {
        Neuron* in = &net->Neurons[i];
        in->type = 0;
        in->bias = 0.0f;
        in->output = 0.0f;
        in->input_sum = 0.0f;

        // Connect input i to every output with a random weight in (-1, 1].
        for (int j = 0; j < output_num; ++j) {
            int offset = j + (output_num * i);
            Connection* conn = &net->Connections[offset];
            conn->from = i;
            conn->to = j;
            conn->innovationid = offset;
            conn->enabled = true;
            conn->weight = (2.0f * curand_uniform(&state)) - 1.0f;
        }
    }
}
Выделение памяти и шаги cudaMemcpy на хосте:
// Wraps a CUDA runtime call and reports a failure together with the call
// site. The do/while(0) form makes the macro behave as a single statement, so
// `if (c) gpuErrchk(x); else ...` parses as written.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Reports a failed CUDA runtime call and optionally terminates the process.
 *
 * @param code  status returned by the CUDA call
 * @param file  source file of the call site
 * @param line  source line of the call site
 * @param abort when true (default), exit with `code` as the process status
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return;

    // Diagnostics go to stderr so they are unbuffered and not mixed into
    // regular program output.
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}

int main() {
int population_size = 1024;
int input_num = 390;
int output_num = 3;

size_t heap_size_needed = population_size * ((input_num + output_num) * sizeof(Neuron) +
input_num * output_num * sizeof(Connection));
size_t heap_size = heap_size_needed + heap_size_needed /5;
gpuErrchk(cudaDeviceSetLimit(cudaLimitMallocHeapSize, heap_size));

Population* h_population = (Population*)malloc(sizeof(Population));
if (!h_population) {
std::cerr generation_id = 1;

Population* d_population;
gpuErrchk(cudaMalloc(&d_population, sizeof(Population)));

Network* d_networks;
gpuErrchk(cudaMalloc(&d_networks, sizeof(Network) * population_size));

gpuErrchk(cudaMemcpy(&(h_population->Networks), &d_networks, sizeof(Network*), cudaMemcpyHostToHost));
gpuErrchk(cudaMemcpy(d_population, h_population, sizeof(Population), cudaMemcpyHostToDevice));

int threadsPerBlock = 512;
int blocks = (population_size + threadsPerBlock - 1) / threadsPerBlock;

CreateBasePopulation(d_population, population_size, input_num, output_num);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());

gpuErrchk(cudaMemcpy(h_population, d_population, sizeof(Population), cudaMemcpyDeviceToHost));
Network* h_networks = (Network*)malloc(sizeof(Network) * population_size);
gpuErrchk(cudaMemcpy(h_networks, d_networks, sizeof(Network) * population_size, cudaMemcpyDeviceToHost));
h_population->Networks = h_networks;

for (int i = 0; i < population_size; i++) {
//Connection* d_connections;
//gpuErrchk(cudaMemcpy(&d_connections, &(d_networks[i].Connections), sizeof(Connection*), cudaMemcpyDeviceToHost));

int num_connections = h_networks[i].num_connections;
Connection* d_connections = nullptr;
gpuErrchk(cudaMemcpy(&d_connections, &(d_networks[i].Connections), sizeof(Connection*), cudaMemcpyDeviceToHost));

if (d_connections != nullptr) {
int num_connections = h_networks[i].num_connections;
Connection* h_connections = (Connection*)malloc(sizeof(Connection) * num_connections);

gpuErrchk(cudaMemcpy(h_connections, d_connections, sizeof(Connection) * num_connections, cudaMemcpyDeviceToHost));

h_networks[i].Connections = h_connections;
} else {
std::cerr 

Подробнее здесь: [url]https://stackoverflow.com/questions/79537359/cuda-copy-substructs[/url]