I'm trying to optimize my 3D rendering OpenCL code. Currently, I render a 3D triangle by dividing its bounding box into 2x2-pixel tiles and doing the necessary computations for each pixel inside a tile. This turned out to be slightly faster than giving each work item just a single pixel, because once the barycentric coordinates of a point (x,y) are known, they can be updated incrementally to get the coordinates of (x+1,y), and so on. Here's the implementation of this idea:
Код: Выделить всё
// Shorthand for the 3-component float vector used for the triangle's vertices.
typedef float3 vec3;
/*
 * draw: one work item processes one 2x2-pixel tile of the triangle's bounding box.
 *
 * The barycentric coordinates of the tile's top-left pixel are computed with the
 * full formula; the other three pixels are reached by adding or subtracting
 * per-pixel increments, visiting (x,y) -> (x+1,y) -> (x+1,y+1) -> (x,y+1).
 *
 * Parameters:
 *   depthBuffer   - per-pixel buffer that receives f at position index
 *   colorArray    - per-pixel buffer that receives clr at position index
 *   v1, v2, v3    - triangle vertices, see the comment below
 *   clr           - value stored into colorArray
 *   screen_width  - row stride: index advances by this amount to move down one row
 *   screen_height - used only in the initial index computation
 *   minX, minY    - top-left corner of the triangle's bounding box
 *   inv           - precomputed barycentric denominator, see the formula below
 *
 * NOTE(review): this listing appears to have been damaged by HTML stripping.
 * In each of the four `if` lines the text between a `<` and the following `>`
 * seems to be missing, so the upper-bound comparison on lambda1+lambda2, the
 * declaration of `f`, and the opening of the inner depth test are not visible.
 * The code does not compile as shown; recover it from the original post.
 */
// v1,v2,v3 are (x,y,1/z) coordinates of the triangle's projection onto the screen
__kernel void draw(__global float* depthBuffer,__global int* colorArray,
vec3 v1, vec3 v2, vec3 v3, int clr,
int screen_width, int screen_height, int minX, int minY, float inv) {
// do all the computations for a 2x2px tile.
// inv is the precomputed denominator in the barycentric coords formula:
// 1/(v1.x*v2.y - v1.x*v3.y - v1.y*v2.x + v1.y*v3.x + v2.x*v3.y - v2.y*v3.x)
// x,y of top-left corner of tile:
int x = 2*(get_global_id(0))+minX; // minX,minY - top left of triangle bounding box
int y = 2*(get_global_id(1))+minY; // 2 is the tile size
float lambda1 = (-v1.x*v3.y + v1.x*y + v1.y*v3.x - v1.y*x - v3.x*y + v3.y*x)*inv; // barycentric coordinates of the top-left corner of the tile.
float lambda2 = (v1.x*v2.y - v1.x*y - v1.y*v2.x + v1.y*x + v2.x*y - v2.y*x)*inv;
float dx1 = (v3.y - v1.y)*inv, dx2 = (v1.y - v2.y)*inv; // barycentric coordinates of a point increase by this value when x increases by 1.
// Linear index of the current pixel in depthBuffer / colorArray.
// NOTE(review): `pos` is not declared anywhere in this listing; presumably the
// tile coordinates x and y computed above were intended - confirm against the original.
int index = (pos.y + (screen_height / 2)) * screen_width + (pos.x + (screen_width / 2));
// test if top-left pixel is inside triangle's projection
// NOTE(review): condition is truncated after `lambda1+lambda2` (see header note).
if(lambda1>=0 && lambda2>=0 && lambda1+lambda2 depthBuffer[index]) {
depthBuffer[index] = f;
colorArray[index] = clr;
}
}
++index; // update pixel index
lambda1 += dx1; // get barycentric coordinates for point (x+1,y)
lambda2 += dx2;
// Same test and writes for (x+1,y); condition truncated as above.
if(lambda1>=0 && lambda2>=0 && lambda1+lambda2 depthBuffer[index]) {
depthBuffer[index] = f;
colorArray[index] = clr;
}
}
// Move down one row while staying in the right-hand column of the tile.
index+=screen_width;
lambda1 += (v1.x - v3.x)*inv; // same as dx1,dx2 but for changing y.
lambda2 += (v2.x - v1.x)*inv; // (x+1,y+1)
// Same test and writes for (x+1,y+1); condition truncated as above.
if(lambda1>=0 && lambda2>=0 && lambda1+lambda2 depthBuffer[index]) {
depthBuffer[index] = f;
colorArray[index] = clr;
}
}
// Step back to the left-hand column of the tile on the second row.
--index;
lambda1 -= dx1; // (x,y+1)
lambda2 -= dx2;
// Same test and writes for (x,y+1); condition truncated as above.
if(lambda1>=0 && lambda2>=0 && lambda1+lambda2 depthBuffer[index]) {
depthBuffer[index] = f;
colorArray[index] = clr;
}
}
}

Here's the code for the second version, which uses 4x4 tiles:
Код: Выделить всё
/*
 * draw (4x4-tile variant): one work item processes one tileW x tileH tile of the
 * triangle's bounding box. The full barycentric formula is evaluated only for
 * the tile's top-left pixel; the remaining pixels are reached incrementally
 * using the per-pixel increments dx1/dx2 (x direction) and dy1/dy2 (y direction).
 *
 * The parameters match the 2x2-tile kernel: v1,v2,v3 are the projected
 * vertices, inv is the precomputed barycentric denominator, minX/minY is the
 * top-left corner of the bounding box, and screen_width is the row stride
 * used when index moves down one row.
 *
 * NOTE(review): this listing appears to have been damaged by HTML stripping.
 * The long `if` line below looks like several statements fused together: every
 * span from a `<` to the following `>` seems to be missing. As a result the
 * declarations of `index`, `f`, `tx`, `ty` and `level`, the loop header, and
 * whatever tile-level rejection test preceded the loop are not visible. The
 * code does not compile as shown; recover it from the original post.
 */
__kernel void draw(__global float * depthBuffer, __global int * colorArray,
vec3 v1, vec3 v2, vec3 v3, int clr,
int screen_width, int screen_height, int minX, int minY, float inv) {
// Tile dimensions in pixels.
int tileW = 4, tileH = 4;
// Screen coordinates of the tile's top-left pixel.
int x = tileW * (get_global_id(0)) + minX;
int y = tileH * (get_global_id(1)) + minY;
// Barycentric coordinates of the tile's top-left pixel (full formula).
float lambda1 = (-v1.x * v3.y + v1.x * y + v1.y * v3.x - v1.y * x - v3.x * y + v3.y * x) * inv;
float lambda2 = (v1.x * v2.y - v1.x * y - v1.y * v2.x + v1.y * x + v2.x * y - v2.y * x) * inv;
// the coords of top-left pixel will be useful if the tile overlaps the triangle's projection
float old1 = lambda1, old2 = lambda2;
// Change in the barycentric coordinates per one-pixel step in x (dx*) and in y (dy*).
float dx1 = (v3.y - v1.y) * inv, dx2 = (v1.y - v2.y) * inv;
float dy1 = (v1.x - v3.x) * inv, dy2 = (v2.x - v1.x) * inv; // how barycentric coords change
// NOTE(review): garbled line - it appears to contain the remains of five
// inside-triangle tests (the first one negated) plus the start of the per-pixel
// depth test; the text between them is lost. Do not rely on it as written.
if (!(lambda1 >= 0 && lambda2 >= 0 && lambda1 + lambda2 = 0 && lambda2 >= 0 && lambda1 + lambda2 = 0 && lambda2 >= 0 && lambda1 + lambda2 = 0 && lambda2 >= 0 && lambda1 + lambda2 = 0 && lambda2 >= 0 && lambda1 + lambda2 depthBuffer[index]) {
depthBuffer[index] = f;
colorArray[index] = clr;
}
}
// Advance to the next pixel of the tile. Rows are walked in alternating
// directions: level == 0 moves right, level != 0 moves left. At the end of a
// row (last column when moving right, column 0 when moving left) step down
// one row and flip the direction instead of moving horizontally.
if ((!level && tx == tileW - 1) || (level && tx == 0)) {
++ty;
index += screen_width;
lambda1 += dy1;
lambda2 += dy2;
level ^= 1;
} else if (!level) {
// Moving right along the current row.
++tx;
++index;
lambda1 += dx1;
lambda2 += dx2;
} else {
// Moving left along the current row.
--tx;
--index;
lambda1 -= dx1;
lambda2 -= dx2;
}
}
}
I don't understand these results. In the second version of the code, fewer pixels that lie outside the triangle are tested, and the full barycentric-coordinate formula is evaluated for fewer pixels (only for the top-left pixel of each tile).
Can someone explain why this is happening and how to optimize my code further? I'm running this on Ubuntu, on the integrated GPU of an Intel Core i5-1240P.
Источник: https://stackoverflow.com/questions/781 ... ode-slower
Мобильная версия