Почему оптимизация рендеринга OpenCL замедлила мой код? - Цифровое Кемерово

Почему оптимизация рендеринга OpenCL замедлила мой код? ⇐ C++

Ответить

1 сообщение • Страница 1 из 1

Гость

Почему оптимизация рендеринга OpenCL замедлила мой код?

Цитата

Сообщение Гость » 09 мар 2024, 17:37

I'm trying to optimize my 3D rendering OpenCL code. Currently, I render a 3D triangle by dividing its bounding box into 2x2 tiles and doing necessary computations for each pixel inside the tile. This turned out to be slightly faster than giving each work item just a single pixel, because based on the barycentric coordinates of point (x,y), it's easy to update them and get the coordinates for (x+1,y) etc. Here's the implementation of this idea:

Код: Выделить всё

typedef float3 vec3;

// Kernel: each work item handles one 2x2-pixel tile of a triangle's bounding
// box and, for pixels that pass its tests, stores f into depthBuffer and clr
// into colorArray.
//
// Parameters:
//   depthBuffer   - per-pixel buffer indexed by `index`; receives f
//   colorArray    - per-pixel buffer indexed by `index`; receives clr
//   v1, v2, v3    - (x,y,1/z) coordinates of the triangle's projection onto the screen
//   clr           - value written to colorArray for accepted pixels
//   screen_width  - added to `index` to move down one row
//   screen_height - used only in the initial `index` computation
//   minX, minY    - top-left corner of the triangle's bounding box
//   inv           - precomputed reciprocal of the barycentric denominator (see below)
//
// NOTE(review): this listing appears to have lost text during extraction.
// Each inside test stops after `lambda1+lambda2`, `f` is used without a
// visible definition, `pos` is never declared, and the braces do not balance.
// Restore the missing text from the original post before compiling.
__kernel void draw(__global float* depthBuffer,__global int* colorArray,
vec3 v1, vec3 v2, vec3 v3, int clr,
int screen_width, int screen_height, int minX, int minY, float inv) {

// Do all the computations for a 2x2px tile.
// inv is the precomputed reciprocal of the denominator in the barycentric coords formula:
// 1/(v1.x*v2.y - v1.x*v3.y - v1.y*v2.x + v1.y*v3.x + v2.x*v3.y - v2.y*v3.x)

// x,y of the top-left corner of this work item's tile:
int x = 2*(get_global_id(0))+minX; // minX,minY - top left of the triangle's bounding box
int y = 2*(get_global_id(1))+minY; // 2 is the tile size

float lambda1 = (-v1.x*v3.y + v1.x*y + v1.y*v3.x - v1.y*x - v3.x*y + v3.y*x)*inv; // barycentric coordinates of the top-left corner of the tile
float lambda2 = (v1.x*v2.y - v1.x*y - v1.y*v2.x + v1.y*x + v2.x*y - v2.y*x)*inv;
float dx1 = (v3.y - v1.y)*inv, dx2 = (v1.y - v2.y)*inv; // the barycentric coordinates change by these amounts when x increases by 1
// NOTE(review): `pos` is not declared anywhere in this listing, and the x,y
// computed above are not used here - confirm against the original source.
int index = (pos.y + (screen_height / 2)) * screen_width + (pos.x + (screen_width / 2));
// Test whether the top-left pixel (x,y) is inside the triangle's projection.
// NOTE(review): the condition below is truncated; the rest of the inside test
// and the definition of `f` are missing.
if(lambda1>=0 && lambda2>=0 && lambda1+lambda2 depthBuffer[index]) {
depthBuffer[index] = f;
colorArray[index] = clr;
}
}

// Step one pixel to the right: (x+1,y).
++index; // update pixel index
lambda1 += dx1; // barycentric coordinates for point (x+1,y)
lambda2 += dx2;
// NOTE(review): truncated condition, as above.
if(lambda1>=0 && lambda2>=0 && lambda1+lambda2 depthBuffer[index]) {
depthBuffer[index] = f;
colorArray[index] = clr;
}
}

// Step one row down: (x+1,y+1).
index+=screen_width;
lambda1 += (v1.x - v3.x)*inv; // same idea as dx1,dx2, but for y increasing by 1
lambda2 += (v2.x - v1.x)*inv; // now at (x+1,y+1)
// NOTE(review): truncated condition, as above.
if(lambda1>=0 && lambda2>=0 && lambda1+lambda2 depthBuffer[index]) {
depthBuffer[index] = f;
colorArray[index] = clr;
}
}

// Step one pixel back to the left: (x,y+1).
--index;
lambda1 -= dx1; // now at (x,y+1)
lambda2 -= dx2;
// NOTE(review): truncated condition, as above.
if(lambda1>=0 && lambda2>=0 && lambda1+lambda2 depthBuffer[index]) {
depthBuffer[index] = f;
colorArray[index] = clr;
}
}
}

I tried to make it even faster, so I did something similar for bigger tiles (4x4, ..., 8x8), but the results were very underwhelming. My idea was to check whether any of the tile's corners lies inside the triangle's projection. If not, the tile is skipped and no computations are done for the remaining pixels inside it. Otherwise, I do the computations for all the pixels inside the tile, in the following order:

Here's the code:

Код: Выделить всё

// Kernel: variant of draw in which each work item handles one tileW x tileH
// tile (4x4 here) of the triangle's bounding box, storing f into depthBuffer
// and clr into colorArray for accepted pixels.
//
// Parameters:
//   depthBuffer   - per-pixel buffer indexed by `index`; receives f
//   colorArray    - per-pixel buffer indexed by `index`; receives clr
//   v1, v2, v3    - triangle vertices; only .x and .y are read in the visible code
//   clr           - value written to colorArray for accepted pixels
//   screen_width  - added to `index` to move down one row
//   screen_height - not referenced in the visible code
//   minX, minY    - offsets added to the tile origin
//   inv           - scale factor applied to the barycentric numerators and deltas
//
// NOTE(review): this listing appears to have lost text during extraction.
// The long `if` line below is several statements fused together; `index`,
// `f`, `tx`, `ty` and `level` have no visible declaration; and the braces do
// not balance. Restore the missing text from the original post before compiling.
__kernel void draw(__global float * depthBuffer, __global int * colorArray,
vec3 v1, vec3 v2, vec3 v3, int clr,
int screen_width, int screen_height, int minX, int minY, float inv) {
int tileW = 4, tileH = 4;
// Top-left pixel of this work item's tile.
int x = tileW * (get_global_id(0)) + minX;
int y = tileH * (get_global_id(1)) + minY;

// Barycentric coordinates of the tile's top-left pixel.
float lambda1 = (-v1.x * v3.y + v1.x * y + v1.y * v3.x - v1.y * x - v3.x * y + v3.y * x) * inv;
float lambda2 = (v1.x * v2.y - v1.x * y - v1.y * v2.x + v1.y * x + v2.x * y - v2.y * x) * inv;
// Keep the top-left values; they are needed if the tile overlaps the triangle's projection.
float old1 = lambda1, old2 = lambda2;
float dx1 = (v3.y - v1.y) * inv, dx2 = (v1.y - v2.y) * inv; // change per +1 step in x
float dy1 = (v1.x - v3.x) * inv, dy2 = (v2.x - v1.x) * inv; // change per +1 step in y

// NOTE(review): the next line is garbled. It looks like the remains of the
// tile-corner tests, the loop header and the per-pixel inside/depth test
// fused into one line - confirm against the original post.
if (!(lambda1 >= 0 && lambda2 >= 0 && lambda1 + lambda2 = 0 && lambda2 >= 0 && lambda1 + lambda2 = 0 && lambda2 >= 0 && lambda1 + lambda2 = 0 && lambda2 >= 0 && lambda1 + lambda2 = 0 && lambda2 >= 0 && lambda1 + lambda2  depthBuffer[index]) {
depthBuffer[index] = f;
colorArray[index] = clr;
}
}
// Advance to the next pixel. `level` selects the horizontal direction:
// 0 moves right, non-zero moves left. At the end of a row (right edge when
// moving right, left edge when moving left) step down one row and flip it.
if ((!level && tx == tileW - 1) || (level && tx == 0)) {
++ty;
index += screen_width;
lambda1 += dy1;
lambda2 += dy2;
level ^= 1;
} else if (!level) {
// Moving right along the current row.
++tx;
++index;
lambda1 += dx1;
lambda2 += dx2;
} else {
// Moving left along the current row.
--tx;
--index;
lambda1 -= dx1;
lambda2 -= dx2;
}
}
}

The performance was at best slightly slower than the previous approach, and only when I set tile size to 2x2 (which defeats the purpose). For bigger tile sizes it just was getting worse and worse.
I don't understand these results. Fewer pixels that lie outside the triangle are tested, and in the second version the full barycentric-coordinate formula is evaluated for fewer pixels (only for the top-left pixel of each tile).
Can someone explain why this is happening and how to further optimize my code? I'm running this on Ubuntu with the integrated GPU that comes with i5-1240p.

Источник: https://stackoverflow.com/questions/781 ... ode-slower

1709995054

Гость


I'm trying to optimize my 3D rendering OpenCL code. Currently, I render a 3D triangle by dividing its bounding box into 2x2 tiles and doing necessary computations for each pixel inside the tile. This turned out to be slightly faster than giving each work item just a single pixel, because based on the barycentric coordinates of point (x,y), it's easy to update them and get the coordinates for (x+1,y) etc. Here's the implementation of this idea:
[code]typedef float3 vec3;

// Kernel: each work item handles one 2x2-pixel tile of a triangle's bounding
// box and, for pixels that pass its tests, stores f into depthBuffer and clr
// into colorArray.
//
// Parameters:
//   depthBuffer   - per-pixel buffer indexed by `index`; receives f
//   colorArray    - per-pixel buffer indexed by `index`; receives clr
//   v1, v2, v3    - (x,y,1/z) coordinates of the triangle's projection onto the screen
//   clr           - value written to colorArray for accepted pixels
//   screen_width  - added to `index` to move down one row
//   screen_height - used only in the initial `index` computation
//   minX, minY    - top-left corner of the triangle's bounding box
//   inv           - precomputed reciprocal of the barycentric denominator (see below)
//
// NOTE(review): this listing appears to have lost text during extraction.
// Each inside test stops after `lambda1+lambda2`, `f` is used without a
// visible definition, `pos` is never declared, and the braces do not balance.
// Restore the missing text from the original post before compiling.
__kernel void draw(__global float* depthBuffer,__global int* colorArray,
vec3 v1, vec3 v2, vec3 v3, int clr,
int screen_width, int screen_height, int minX, int minY, float inv) {

// Do all the computations for a 2x2px tile.
// inv is the precomputed reciprocal of the denominator in the barycentric coords formula:
// 1/(v1.x*v2.y - v1.x*v3.y - v1.y*v2.x + v1.y*v3.x + v2.x*v3.y - v2.y*v3.x)

// x,y of the top-left corner of this work item's tile:
int x = 2*(get_global_id(0))+minX; // minX,minY - top left of the triangle's bounding box
int y = 2*(get_global_id(1))+minY; // 2 is the tile size

float lambda1 = (-v1.x*v3.y + v1.x*y + v1.y*v3.x - v1.y*x - v3.x*y + v3.y*x)*inv; // barycentric coordinates of the top-left corner of the tile
float lambda2 = (v1.x*v2.y - v1.x*y - v1.y*v2.x + v1.y*x + v2.x*y - v2.y*x)*inv;
float dx1 = (v3.y - v1.y)*inv, dx2 = (v1.y - v2.y)*inv; // the barycentric coordinates change by these amounts when x increases by 1
// NOTE(review): `pos` is not declared anywhere in this listing, and the x,y
// computed above are not used here - confirm against the original source.
int index = (pos.y + (screen_height / 2)) * screen_width + (pos.x + (screen_width / 2));
// Test whether the top-left pixel (x,y) is inside the triangle's projection.
// NOTE(review): the condition below is truncated; the rest of the inside test
// and the definition of `f` are missing.
if(lambda1>=0 && lambda2>=0 && lambda1+lambda2 depthBuffer[index]) {
depthBuffer[index] = f;
colorArray[index] = clr;
}
}

// Step one pixel to the right: (x+1,y).
++index; // update pixel index
lambda1 += dx1; // barycentric coordinates for point (x+1,y)
lambda2 += dx2;
// NOTE(review): truncated condition, as above.
if(lambda1>=0 && lambda2>=0 && lambda1+lambda2 depthBuffer[index]) {
depthBuffer[index] = f;
colorArray[index] = clr;
}
}

// Step one row down: (x+1,y+1).
index+=screen_width;
lambda1 += (v1.x - v3.x)*inv; // same idea as dx1,dx2, but for y increasing by 1
lambda2 += (v2.x - v1.x)*inv; // now at (x+1,y+1)
// NOTE(review): truncated condition, as above.
if(lambda1>=0 && lambda2>=0 && lambda1+lambda2 depthBuffer[index]) {
depthBuffer[index] = f;
colorArray[index] = clr;
}
}

// Step one pixel back to the left: (x,y+1).
--index;
lambda1 -= dx1; // now at (x,y+1)
lambda2 -= dx2;
// NOTE(review): truncated condition, as above.
if(lambda1>=0 && lambda2>=0 && lambda1+lambda2 depthBuffer[index]) {
depthBuffer[index] = f;
colorArray[index] = clr;
}
}
}

[/code]
I tried to make it even faster, so I did something similar for bigger tiles (4x4, ..., 8x8), but the results were very underwhelming. My idea was to check whether any of the tile's corners lies inside the triangle's projection. If not, the tile is skipped and no computations are done for the remaining pixels inside it. Otherwise, I do the computations for all the pixels inside the tile, in the following order:
[img]https://i.stack.imgur.com/SvtH4.png[/img]
Here's the code:
[code]__kernel void draw(__global float * depthBuffer, __global int * colorArray,
vec3 v1, vec3 v2, vec3 v3, int clr,
int screen_width, int screen_height, int minX, int minY, float inv) {
// Kernel: variant of draw in which each work item handles one tileW x tileH
// tile (4x4 here) of the triangle's bounding box, storing f into depthBuffer
// and clr into colorArray for accepted pixels.
//
// Parameters:
//   depthBuffer   - per-pixel buffer indexed by `index`; receives f
//   colorArray    - per-pixel buffer indexed by `index`; receives clr
//   v1, v2, v3    - triangle vertices; only .x and .y are read in the visible code
//   clr           - value written to colorArray for accepted pixels
//   screen_width  - added to `index` to move down one row
//   screen_height - not referenced in the visible code
//   minX, minY    - offsets added to the tile origin
//   inv           - scale factor applied to the barycentric numerators and deltas
//
// NOTE(review): this listing appears to have lost text during extraction.
// The long `if` line below is several statements fused together; `index`,
// `f`, `tx`, `ty` and `level` have no visible declaration; and the braces do
// not balance. Restore the missing text from the original post before compiling.
int tileW = 4, tileH = 4;
// Top-left pixel of this work item's tile.
int x = tileW * (get_global_id(0)) + minX;
int y = tileH * (get_global_id(1)) + minY;

// Barycentric coordinates of the tile's top-left pixel.
float lambda1 = (-v1.x * v3.y + v1.x * y + v1.y * v3.x - v1.y * x - v3.x * y + v3.y * x) * inv;
float lambda2 = (v1.x * v2.y - v1.x * y - v1.y * v2.x + v1.y * x + v2.x * y - v2.y * x) * inv;
// Keep the top-left values; they are needed if the tile overlaps the triangle's projection.
float old1 = lambda1, old2 = lambda2;
float dx1 = (v3.y - v1.y) * inv, dx2 = (v1.y - v2.y) * inv; // change per +1 step in x
float dy1 = (v1.x - v3.x) * inv, dy2 = (v2.x - v1.x) * inv; // change per +1 step in y

// NOTE(review): the next line is garbled. It looks like the remains of the
// tile-corner tests, the loop header and the per-pixel inside/depth test
// fused into one line - confirm against the original post.
if (!(lambda1 >= 0 && lambda2 >= 0 && lambda1 + lambda2 = 0 && lambda2 >= 0 && lambda1 + lambda2 = 0 && lambda2 >= 0 && lambda1 + lambda2 = 0 && lambda2 >= 0 && lambda1 + lambda2 = 0 && lambda2 >= 0 && lambda1 + lambda2  depthBuffer[index]) {
depthBuffer[index] = f;
colorArray[index] = clr;
}
}
// Advance to the next pixel. `level` selects the horizontal direction:
// 0 moves right, non-zero moves left. At the end of a row (right edge when
// moving right, left edge when moving left) step down one row and flip it.
if ((!level && tx == tileW - 1) || (level && tx == 0)) {
++ty;
index += screen_width;
lambda1 += dy1;
lambda2 += dy2;
level ^= 1;
} else if (!level) {
// Moving right along the current row.
++tx;
++index;
lambda1 += dx1;
lambda2 += dx2;
} else {
// Moving left along the current row.
--tx;
--index;
lambda1 -= dx1;
lambda2 -= dx2;
}
}
}
[/code]
The performance was at best slightly slower than the previous approach, and only when I set tile size to 2x2 (which defeats the purpose). For bigger tile sizes it just was getting worse and worse.
I don't understand these results. Fewer pixels that lie outside the triangle are tested, and in the second version the full barycentric-coordinate formula is evaluated for fewer pixels (only for the top-left pixel of each tile).
Can someone explain why this is happening and how to further optimize my code? I'm running this on Ubuntu with the integrated GPU that comes with i5-1240p.
 

Источник: [url]https://stackoverflow.com/questions/78132631/why-did-an-opencl-rendering-optimization-make-my-code-slower[/url]

Ответить

1 сообщение • Страница 1 из 1

Быстрый ответ

Заголовок:

Имя пользователя:

Изменение регистра текста:

Смайлики

Ещё смайлики…

К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми. Можно прикреплять файлы, перетаскивая их в окно сообщения.

Максимально разрешённый размер вложения: 15 МБ.

Имя файла:

Комментарий к файлу:

Имя файла	Комментарий к файлу	Размер	Статус

Вернуться в «C++»

Programmiererforum