Оптимизация этого цикла для достижения лучшей производительности в CUDA.

Оптимизация этого цикла для достижения лучшей производительности в CUDA. ⇐ C++

1 сообщение • Страница 1 из 1

Гость

Оптимизация этого цикла для достижения лучшей производительности в CUDA.

Цитата

Сообщение Гость » 09 мар 2024, 19:22

I would like to optimize this loop because its performance is very poor. On each iteration I call a kernel that simply splits the nodes into two lists: the list of nodes that have at least one edge pointing into the current list of leaves, and the list containing all the other nodes. I keep going until I reach the root node. As a result there is a lot of allocation and deallocation, and I don't know whether this is the best way to do it (surely not):

Код: Выделить всё

    bool flag = true;
while(flag){

int counterNonLeaves = 0;

if(index == 0){
counterNonLeaves = (numNodes - allLen[index]);
blockSize = min(128, counterNonLeaves);
blockCount = (counterNonLeaves + blockSize - 1) / blockSize;
}
else{
counterNonLeaves = (allLen[index-1] - allLen[index]) - 1;
blockSize = min(128, counterNonLeaves);
blockCount = (counterNonLeaves + blockSize - 1) / blockSize;
}

// Local structures
Vertex* d_localNonLeaves;
Vertex* d_localLeaves;
Vertex* d_oldLeaves;
Vertex* d_oldNonLeaves;
int* lastLen;
cudaMalloc((void**)&d_localNonLeaves, (counterNonLeaves/2) * sizeof(Vertex));
cudaMalloc((void**)&d_localLeaves, ((counterNonLeaves/2)+1) * sizeof(Vertex));
cudaMalloc((void**)&d_oldLeaves, allLen[index] * sizeof(Vertex));
cudaMalloc((void**)&d_oldNonLeaves, (allLen[index]-1) * sizeof(Vertex));
cudaMalloc((void**)&lastLen, sizeof(int));
cudaMemset(lastLen, 0, 1 * sizeof(int));

// I take the current reference of "leaves" and "nonLeaves"
copyArrayHostToDevice(maxBis[index], d_oldLeaves, allLen[index]);
copyArrayHostToDevice(nonLeaves, d_oldNonLeaves, (allLen[index]-1));

index++;

maxBis = (Vertex**)realloc(maxBis, (index+1) * sizeof(Vertex*));
maxBis[index] = (Vertex*)malloc(((counterNonLeaves/2)+1) * sizeof(Vertex));
allLen = (int*)realloc(allLen, (index+1) * sizeof(int));
nonLeaves = (Vertex*)realloc(nonLeaves, (counterNonLeaves/2) * sizeof(Vertex));

// Second kernel
paige_tarjan_kernel(d_localNonLeaves, counterNonLeaves, d_localLeaves, d_oldLeaves, d_oldNonLeaves, allLen[index-1], lastLen);
cudaDeviceSynchronize();

// Copy back to the host
cudaMemcpy(&allLen[index], lastLen, sizeof(int), cudaMemcpyDeviceToHost);
copyArrayDeviceToHost(d_localLeaves, maxBis[index], allLen[index]);
copyArrayDeviceToHost(d_localNonLeaves, nonLeaves, (counterNonLeaves/2));

// Check to see if I arrived at the end of the cycle
if(allLen[index] == 1){
index++;
flag = false;
}

cudaFree(d_localNonLeaves);
cudaFree(d_localLeaves);
cudaFree(d_oldLeaves);
cudaFree(d_oldNonLeaves);
cudaFree(lastLen);
}

Assume that a preprocessing step has already run before this loop and has stored in

Код: Выделить всё

maxBis[0]

the starting leaves, and in

Код: Выделить всё

nonLeaves

the other nodes.

Источник: https://stackoverflow.com/questions/781 ... ce-in-cuda

1710001346

Гость


I would like to optimize this loop because its performance is very poor. On each iteration I call a kernel that simply splits the nodes into two lists: the list of nodes that have at least one edge pointing into the current list of leaves, and the list containing all the other nodes. I keep going until I reach the root node. As a result there is a lot of allocation and deallocation, and I don't know whether this is the best way to do it (surely not):
[code]    bool flag = true;
while(flag){

int counterNonLeaves = 0;

if(index == 0){
counterNonLeaves = (numNodes - allLen[index]);
blockSize = min(128, counterNonLeaves);
blockCount = (counterNonLeaves + blockSize - 1) / blockSize;
}
else{
counterNonLeaves = (allLen[index-1] - allLen[index]) - 1;
blockSize = min(128, counterNonLeaves);
blockCount = (counterNonLeaves + blockSize - 1) / blockSize;
}

// Local structures
Vertex* d_localNonLeaves;
Vertex* d_localLeaves;
Vertex* d_oldLeaves;
Vertex* d_oldNonLeaves;
int* lastLen;
cudaMalloc((void**)&d_localNonLeaves, (counterNonLeaves/2) * sizeof(Vertex));
cudaMalloc((void**)&d_localLeaves, ((counterNonLeaves/2)+1) * sizeof(Vertex));
cudaMalloc((void**)&d_oldLeaves, allLen[index] * sizeof(Vertex));
cudaMalloc((void**)&d_oldNonLeaves, (allLen[index]-1) * sizeof(Vertex));
cudaMalloc((void**)&lastLen, sizeof(int));
cudaMemset(lastLen, 0, 1 * sizeof(int));

// I take the current reference of "leaves" and "nonLeaves"
copyArrayHostToDevice(maxBis[index], d_oldLeaves, allLen[index]);
copyArrayHostToDevice(nonLeaves, d_oldNonLeaves, (allLen[index]-1));

index++;

maxBis = (Vertex**)realloc(maxBis, (index+1) * sizeof(Vertex*));
maxBis[index] = (Vertex*)malloc(((counterNonLeaves/2)+1) * sizeof(Vertex));
allLen = (int*)realloc(allLen, (index+1) * sizeof(int));
nonLeaves = (Vertex*)realloc(nonLeaves, (counterNonLeaves/2) * sizeof(Vertex));

// Second kernel
paige_tarjan_kernel(d_localNonLeaves, counterNonLeaves, d_localLeaves, d_oldLeaves, d_oldNonLeaves, allLen[index-1], lastLen);
cudaDeviceSynchronize();

// Copy back to the host
cudaMemcpy(&allLen[index], lastLen, sizeof(int), cudaMemcpyDeviceToHost);
copyArrayDeviceToHost(d_localLeaves, maxBis[index], allLen[index]);
copyArrayDeviceToHost(d_localNonLeaves, nonLeaves, (counterNonLeaves/2));

// Check to see if I arrived at the end of the cycle
if(allLen[index] == 1){
index++;
flag = false;
}

cudaFree(d_localNonLeaves);
cudaFree(d_localLeaves);
cudaFree(d_oldLeaves);
cudaFree(d_oldNonLeaves);
cudaFree(lastLen);
}
[/code]
Assume that a preprocessing step has already run before this loop: it stored the starting leaves in [code]maxBis[0][/code] and the other nodes in [code]nonLeaves[/code].
 

Источник: [url]https://stackoverflow.com/questions/78133051/optimization-of-this-cycle-to-achieve-better-performance-in-cuda[/url]

Ответить Пред. тема След. тема

1 сообщение • Страница 1 из 1

Быстрый ответ

Заголовок:

Имя пользователя:

Изменение регистра текста:

Смайлики

Ещё смайлики…

К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми. Можно прикреплять файлы, перетаскивая их в окно сообщения.

Максимально разрешённый размер вложения: 15 МБ.

Имя файла:

Комментарий к файлу:

Имя файла	Комментарий к файлу	Размер	Статус

Похожие темы

Ответы

Просмотры

Последнее сообщение

Как использовать `ForkJoinPool`/`Executors.newCachedThreadPool()` для достижения лучшей производительности?

Последнее сообщение Anonymous « 28 янв 2025, 14:14
Добавлено в форуме JAVA

Anonymous » 28 янв 2025, 14:14 » в форуме JAVA

Я столкнулся с проблемой в моем приложении, и искусственный пример будет выглядеть следующим образом:
fun main(args: Array) {
val start = System.currentTimeMillis()
Internal().doWork()
println( Duration is ${(System.currentTimeMillis() -...

0 Ответы

13 Просмотры

Последнее сообщение Anonymous
28 янв 2025, 14:14
Как использовать `ForkJoinPool`/`Executors.newWorkStealingPool()` для достижения лучшей производительности?

Последнее сообщение Anonymous « 28 янв 2025, 14:30
Добавлено в форуме JAVA

Anonymous » 28 янв 2025, 14:30 » в форуме JAVA

Я столкнулся с проблемой в моем приложении, и искусственный пример будет выглядеть следующим образом:
fun main(args: Array) {
val start = System.currentTimeMillis()
Internal().doWork()
println( Duration is ${(System.currentTimeMillis() -...

0 Ответы

12 Просмотры

Последнее сообщение Anonymous
28 янв 2025, 14:30
Как использовать `ForkJoinPool`/`Executors.newWorkStealingPool()` для достижения лучшей производительности?

Последнее сообщение Anonymous « 28 янв 2025, 18:57
Добавлено в форуме JAVA

Anonymous » 28 янв 2025, 18:57 » в форуме JAVA

Я столкнулся с проблемой в моем приложении, и искусственный пример будет выглядеть следующим образом:
fun main(args: Array) {
val start = System.currentTimeMillis()
Internal().doWork()
println( Duration is ${(System.currentTimeMillis() -...

0 Ответы

12 Просмотры

Последнее сообщение Anonymous
28 янв 2025, 18:57
Оптимизация вывода модели Florence-2 с помощью ONNXRuntime C++ — оптимизация производительности для цикла генерации

Последнее сообщение Anonymous « 23 дек 2024, 07:15
Добавлено в форуме C++

Anonymous » 23 дек 2024, 07:15 » в форуме C++

Я работаю над оптимизацией цикла вывода для модели Флоренции-2, используя ONNXRuntime на C++. Модель выполняет задачи, связанные с визуальным языком, и мне нужно оптимизировать цикл генерации, который обрабатывает часть модели, генерирующую язык....

0 Ответы

31 Просмотры

Последнее сообщение Anonymous
23 дек 2024, 07:15
Оптимизация вывода модели Florence-2 с помощью ONNXRuntime C++ - оптимизация производительности для цикла генерации [зак

Последнее сообщение Anonymous « 23 дек 2024, 14:29
Добавлено в форуме C++

Anonymous » 23 дек 2024, 14:29 » в форуме C++

Я работаю над оптимизацией цикла вывода для модели Флоренции-2, используя ONNXRuntime на C++. Модель выполняет задачи, связанные с визуальным языком, и мне нужно оптимизировать цикл генерации, который обрабатывает часть модели, генерирующую язык....

0 Ответы

39 Просмотры

Последнее сообщение Anonymous
23 дек 2024, 14:29

Вернуться в «C++»