У меня есть структура в CUDA, которая содержит другие структуры (подструктуры, у которых, в свою очередь, тоже есть подструктуры) и указатели на динамически выделенную память. Я хочу скопировать всю структуру, включая её подструктуры, с устройства на хост с помощью cudaMemcpy. Как сделать это правильно, чтобы не получить поверхностные копии или проблемы с памятью? Например:
gpuErrchk(cudaMemcpy(h_population, d_population, sizeof(Population), cudaMemcpyDeviceToHost));
Network* h_networks = (Network*)malloc(sizeof(Network) * population_size);
gpuErrchk(cudaMemcpy(h_networks, d_networks, sizeof(Network) * population_size,cudaMemcpyDeviceToHost));
h_population->Networks = h_networks;
std::cout
Функция ядра:
/**
 * Builds the initial, fully connected network for each member of the population.
 *
 * One thread handles one Network, selected by the flat 1D thread index;
 * threads with idx >= pop_num return immediately. For its network the thread:
 *   - stores the neuron/connection counts and resets fitness to 0,
 *   - allocates the Neurons and Connections arrays from device code,
 *   - fills input neurons (type 0) at indices [0, input_num),
 *   - fills output neurons (type 2) at indices
 *     [input_num, input_num + output_num) with a randomised bias,
 *   - creates one enabled connection per (input, output) pair with weight
 *     2 * curand_uniform - 1.
 *
 * If a required allocation fails, both array pointers are left null and both
 * counts are set to 0, so the host can detect the failure instead of the
 * kernel writing through a null pointer.
 *
 * NOTE(review): per the CUDA programming guide, in-kernel cudaMalloc draws
 * from the device malloc heap (sized by cudaLimitMallocHeapSize). Pointers
 * obtained this way are not valid arguments to host-side cudaMemcpy/cudaFree.
 * If the host must copy these arrays back, allocate them from the host with
 * cudaMalloc, or stage them through a kernel into a host-allocated buffer.
 *
 * @param pop        device pointer to the population; pop->Networks must hold
 *                   at least pop_num elements
 * @param pop_num    number of networks to initialise
 * @param input_num  number of input neurons per network
 * @param output_num number of output neurons per network
 */
__global__ void CreateBasePopulation(Population* pop, int pop_num, int input_num, int output_num) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= pop_num) return;

    Network* net = &pop->Networks[idx];
    net->num_neurons = input_num + output_num;
    net->num_connections = input_num * output_num;
    net->fitness = 0.0f;
    // Start from a known state so a failed allocation is observable as nullptr.
    net->Neurons = nullptr;
    net->Connections = nullptr;

    // Seeded from the clock, so runs are not reproducible; idx selects the subsequence.
    curandState state;
    curand_init(clock64(), idx, 0, &state);

    cudaError_t neurons_status = cudaMalloc(&(net->Neurons), sizeof(Neuron) * net->num_neurons);
    cudaError_t connections_status = cudaMalloc(&(net->Connections), sizeof(Connection) * net->num_connections);

    // An empty array is never dereferenced below, so only non-empty ones must succeed.
    const bool neurons_ok = net->num_neurons <= 0 ||
                            (neurons_status == cudaSuccess && net->Neurons != nullptr);
    const bool connections_ok = net->num_connections <= 0 ||
                                (connections_status == cudaSuccess && net->Connections != nullptr);
    if (!neurons_ok || !connections_ok) {
        if (net->Neurons != nullptr) cudaFree(net->Neurons);
        if (net->Connections != nullptr) cudaFree(net->Connections);
        net->Neurons = nullptr;
        net->Connections = nullptr;
        net->num_neurons = 0;
        net->num_connections = 0;
        return;
    }

    // Output neurons live after the inputs. The original wrote them at
    // [0, output_num), where the input loop below overwrote them and left the
    // tail of the array uninitialised.
    const float bias_span = sqrtf((float)input_num);
    for (int i = 0; i < output_num; ++i) {
        Neuron* out = &net->Neurons[input_num + i];
        out->type = 2;
        out->bias = ((2.0f * bias_span * curand_uniform(&state)) - bias_span) / output_num;
        out->output = 0.0f;
        out->input_sum = 0.0f;
    }

    for (int i = 0; i < input_num; ++i) {
        net->Neurons[i].type = 0;
        net->Neurons[i].bias = 0.0f;
        net->Neurons[i].output = 0.0f;
        net->Neurons[i].input_sum = 0.0f;
        for (int j = 0; j < output_num; ++j) {
            int offset = j + (output_num * i);
            net->Connections[offset].from = i;
            // NOTE(review): `to` holds the output's ordinal j, not its slot in
            // Neurons (input_num + j); kept as in the original - confirm
            // which one the consumers expect.
            net->Connections[offset].to = j;
            net->Connections[offset].innovationid = offset;
            net->Connections[offset].enabled = true;
            net->Connections[offset].weight = (2.0f * curand_uniform(&state)) - 1.0f;
        }
    }
}
Выделение памяти и шаги memcpy на хосте:
// Wraps a CUDA runtime call and reports the call site if it did not succeed.
// The do/while(0) form makes the macro behave as a single statement, so it is
// safe inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Reports a failed CUDA runtime call and optionally terminates the process.
 *
 * @param code  status returned by the CUDA call
 * @param file  source file of the call site (supplied by gpuErrchk)
 * @param line  source line of the call site (supplied by gpuErrchk)
 * @param abort when true (the default), exits with `code` as the exit status
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return;

    // Diagnostics go to stderr so they are not mixed into regular program output.
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
int main() {
int population_size = 1024;
int input_num = 390;
int output_num = 3;
size_t heap_size_needed = population_size * ((input_num + output_num) * sizeof(Neuron) +
input_num * output_num * sizeof(Connection));
size_t heap_size = heap_size_needed + heap_size_needed /5;
gpuErrchk(cudaDeviceSetLimit(cudaLimitMallocHeapSize, heap_size));
Population* h_population = (Population*)malloc(sizeof(Population));
if (!h_population) {
std::cerr generation_id = 1;
Population* d_population;
gpuErrchk(cudaMalloc(&d_population, sizeof(Population)));
Network* d_networks;
gpuErrchk(cudaMalloc(&d_networks, sizeof(Network) * population_size));
gpuErrchk(cudaMemcpy(&(h_population->Networks), &d_networks, sizeof(Network*), cudaMemcpyHostToHost));
gpuErrchk(cudaMemcpy(d_population, h_population, sizeof(Population), cudaMemcpyHostToDevice));
int threadsPerBlock = 512;
int blocks = (population_size + threadsPerBlock - 1) / threadsPerBlock;
CreateBasePopulation(d_population, population_size, input_num, output_num);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_population, d_population, sizeof(Population), cudaMemcpyDeviceToHost));
Network* h_networks = (Network*)malloc(sizeof(Network) * population_size);
gpuErrchk(cudaMemcpy(h_networks, d_networks, sizeof(Network) * population_size, cudaMemcpyDeviceToHost));
h_population->Networks = h_networks;
for (int i = 0; i < population_size; i++) {
//Connection* d_connections;
//gpuErrchk(cudaMemcpy(&d_connections, &(d_networks.Connections), sizeof(Connection*), cudaMemcpyDeviceToHost));
int num_connections = h_networks.num_connections;
Connection* d_connections = nullptr;
gpuErrchk(cudaMemcpy(&d_connections, &(d_networks[i].Connections), sizeof(Connection*), cudaMemcpyDeviceToHost));
if (d_connections != nullptr) {
int num_connections = h_networks[i].num_connections;
Connection* h_connections = (Connection*)malloc(sizeof(Connection) * num_connections);
gpuErrchk(cudaMemcpy(h_connections, d_connections, sizeof(Connection) * num_connections, cudaMemcpyDeviceToHost));
h_networks[i].Connections = h_connections;
} else {
std::cerr
Подробнее здесь: https://stackoverflow.com/questions/79537359/cuda-copy-substructs
Cuda Copy Substructs ⇐ C++
Программы на C++. Форум разработчиков
1743482388
Anonymous
У меня есть структура в CUDA, которая содержит другие структуры (подструктуры, у которых, в свою очередь, тоже есть подструктуры) и указатели на динамически выделенную память. Я хочу скопировать всю структуру, включая её подструктуры, с устройства на хост с помощью cudaMemcpy. Как сделать это правильно, чтобы не получить поверхностные копии или проблемы с памятью? Например:
gpuErrchk(cudaMemcpy(h_population, d_population, sizeof(Population), cudaMemcpyDeviceToHost));
Network* h_networks = (Network*)malloc(sizeof(Network) * population_size);
gpuErrchk(cudaMemcpy(h_networks, d_networks, sizeof(Network) * population_size,cudaMemcpyDeviceToHost));
h_population->Networks = h_networks;
std::cout
Функция ядра:
/**
 * Builds the initial, fully connected network for each member of the population.
 *
 * One thread handles one Network, selected by the flat 1D thread index;
 * threads with idx >= pop_num return immediately. For its network the thread:
 *   - stores the neuron/connection counts and resets fitness to 0,
 *   - allocates the Neurons and Connections arrays from device code,
 *   - fills input neurons (type 0) at indices [0, input_num),
 *   - fills output neurons (type 2) at indices
 *     [input_num, input_num + output_num) with a randomised bias,
 *   - creates one enabled connection per (input, output) pair with weight
 *     2 * curand_uniform - 1.
 *
 * If a required allocation fails, both array pointers are left null and both
 * counts are set to 0, so the host can detect the failure instead of the
 * kernel writing through a null pointer.
 *
 * NOTE(review): per the CUDA programming guide, in-kernel cudaMalloc draws
 * from the device malloc heap (sized by cudaLimitMallocHeapSize). Pointers
 * obtained this way are not valid arguments to host-side cudaMemcpy/cudaFree.
 * If the host must copy these arrays back, allocate them from the host with
 * cudaMalloc, or stage them through a kernel into a host-allocated buffer.
 *
 * @param pop        device pointer to the population; pop->Networks must hold
 *                   at least pop_num elements
 * @param pop_num    number of networks to initialise
 * @param input_num  number of input neurons per network
 * @param output_num number of output neurons per network
 */
__global__ void CreateBasePopulation(Population* pop, int pop_num, int input_num, int output_num) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= pop_num) return;

    Network* net = &pop->Networks[idx];
    net->num_neurons = input_num + output_num;
    net->num_connections = input_num * output_num;
    net->fitness = 0.0f;
    // Start from a known state so a failed allocation is observable as nullptr.
    net->Neurons = nullptr;
    net->Connections = nullptr;

    // Seeded from the clock, so runs are not reproducible; idx selects the subsequence.
    curandState state;
    curand_init(clock64(), idx, 0, &state);

    cudaError_t neurons_status = cudaMalloc(&(net->Neurons), sizeof(Neuron) * net->num_neurons);
    cudaError_t connections_status = cudaMalloc(&(net->Connections), sizeof(Connection) * net->num_connections);

    // An empty array is never dereferenced below, so only non-empty ones must succeed.
    const bool neurons_ok = net->num_neurons <= 0 ||
                            (neurons_status == cudaSuccess && net->Neurons != nullptr);
    const bool connections_ok = net->num_connections <= 0 ||
                                (connections_status == cudaSuccess && net->Connections != nullptr);
    if (!neurons_ok || !connections_ok) {
        if (net->Neurons != nullptr) cudaFree(net->Neurons);
        if (net->Connections != nullptr) cudaFree(net->Connections);
        net->Neurons = nullptr;
        net->Connections = nullptr;
        net->num_neurons = 0;
        net->num_connections = 0;
        return;
    }

    // Output neurons live after the inputs. The original wrote them at
    // [0, output_num), where the input loop below overwrote them and left the
    // tail of the array uninitialised.
    const float bias_span = sqrtf((float)input_num);
    for (int i = 0; i < output_num; ++i) {
        Neuron* out = &net->Neurons[input_num + i];
        out->type = 2;
        out->bias = ((2.0f * bias_span * curand_uniform(&state)) - bias_span) / output_num;
        out->output = 0.0f;
        out->input_sum = 0.0f;
    }

    for (int i = 0; i < input_num; ++i) {
        net->Neurons[i].type = 0;
        net->Neurons[i].bias = 0.0f;
        net->Neurons[i].output = 0.0f;
        net->Neurons[i].input_sum = 0.0f;
        for (int j = 0; j < output_num; ++j) {
            int offset = j + (output_num * i);
            net->Connections[offset].from = i;
            // NOTE(review): `to` holds the output's ordinal j, not its slot in
            // Neurons (input_num + j); kept as in the original - confirm
            // which one the consumers expect.
            net->Connections[offset].to = j;
            net->Connections[offset].innovationid = offset;
            net->Connections[offset].enabled = true;
            net->Connections[offset].weight = (2.0f * curand_uniform(&state)) - 1.0f;
        }
    }
}
Выделение памяти и шаги memcpy на хосте:
// Wraps a CUDA runtime call and reports the call site if it did not succeed.
// The do/while(0) form makes the macro behave as a single statement, so it is
// safe inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Reports a failed CUDA runtime call and optionally terminates the process.
 *
 * @param code  status returned by the CUDA call
 * @param file  source file of the call site (supplied by gpuErrchk)
 * @param line  source line of the call site (supplied by gpuErrchk)
 * @param abort when true (the default), exits with `code` as the exit status
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess)
        return;

    // Diagnostics go to stderr so they are not mixed into regular program output.
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
int main() {
int population_size = 1024;
int input_num = 390;
int output_num = 3;
size_t heap_size_needed = population_size * ((input_num + output_num) * sizeof(Neuron) +
input_num * output_num * sizeof(Connection));
size_t heap_size = heap_size_needed + heap_size_needed /5;
gpuErrchk(cudaDeviceSetLimit(cudaLimitMallocHeapSize, heap_size));
Population* h_population = (Population*)malloc(sizeof(Population));
if (!h_population) {
std::cerr generation_id = 1;
Population* d_population;
gpuErrchk(cudaMalloc(&d_population, sizeof(Population)));
Network* d_networks;
gpuErrchk(cudaMalloc(&d_networks, sizeof(Network) * population_size));
gpuErrchk(cudaMemcpy(&(h_population->Networks), &d_networks, sizeof(Network*), cudaMemcpyHostToHost));
gpuErrchk(cudaMemcpy(d_population, h_population, sizeof(Population), cudaMemcpyHostToDevice));
int threadsPerBlock = 512;
int blocks = (population_size + threadsPerBlock - 1) / threadsPerBlock;
CreateBasePopulation(d_population, population_size, input_num, output_num);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_population, d_population, sizeof(Population), cudaMemcpyDeviceToHost));
Network* h_networks = (Network*)malloc(sizeof(Network) * population_size);
gpuErrchk(cudaMemcpy(h_networks, d_networks, sizeof(Network) * population_size, cudaMemcpyDeviceToHost));
h_population->Networks = h_networks;
for (int i = 0; i < population_size; i++) {
//Connection* d_connections;
//gpuErrchk(cudaMemcpy(&d_connections, &(d_networks[i].Connections), sizeof(Connection*), cudaMemcpyDeviceToHost));
int num_connections = h_networks[i].num_connections;
Connection* d_connections = nullptr;
gpuErrchk(cudaMemcpy(&d_connections, &(d_networks[i].Connections), sizeof(Connection*), cudaMemcpyDeviceToHost));
if (d_connections != nullptr) {
int num_connections = h_networks[i].num_connections;
Connection* h_connections = (Connection*)malloc(sizeof(Connection) * num_connections);
gpuErrchk(cudaMemcpy(h_connections, d_connections, sizeof(Connection) * num_connections, cudaMemcpyDeviceToHost));
h_networks[i].Connections = h_connections;
} else {
std::cerr
Подробнее здесь: [url]https://stackoverflow.com/questions/79537359/cuda-copy-substructs[/url]
Ответить
1 сообщение
• Страница 1 из 1
Перейти
- Кемерово-IT
- ↳ Javascript
- ↳ C#
- ↳ JAVA
- ↳ Elasticsearch aggregation
- ↳ Python
- ↳ Php
- ↳ Android
- ↳ Html
- ↳ Jquery
- ↳ C++
- ↳ IOS
- ↳ CSS
- ↳ Excel
- ↳ Linux
- ↳ Apache
- ↳ MySql
- Детский мир
- Для души
- ↳ Музыкальные инструменты даром
- ↳ Печатная продукция даром
- Внешняя красота и здоровье
- ↳ Одежда и обувь для взрослых даром
- ↳ Товары для здоровья
- ↳ Физкультура и спорт
- Техника - даром!
- ↳ Автомобилистам
- ↳ Компьютерная техника
- ↳ Плиты: газовые и электрические
- ↳ Холодильники
- ↳ Стиральные машины
- ↳ Телевизоры
- ↳ Телефоны, смартфоны, планшеты
- ↳ Швейные машинки
- ↳ Прочая электроника и техника
- ↳ Фототехника
- Ремонт и интерьер
- ↳ Стройматериалы, инструмент
- ↳ Мебель и предметы интерьера даром
- ↳ Сантехника
- Другие темы
- ↳ Разное даром
- ↳ Давай меняться!
- ↳ Отдам\возьму за копеечку
- ↳ Работа и подработка в Кемерове
- ↳ Давай с тобой поговорим...
Мобильная версия