Итак, у меня есть такая реализация умножения матриц 4x4 на SSE3:
/**
* 4x4 single-precision multiply built from 16 explicit dot products (SSE3 variant).
* The loop is unrolled by hand for performance.
* @attention As opposed to non-SIMD multiplication we're using column-major
*
* Each dot product is computed as one _mm_mul_ps followed by two _mm_hadd_ps.
*
* @param affector not referenced in the visible body; presumably the source of a0..a3 - TODO confirm
* @param affected not referenced in the visible body; presumably the source of b0..b3 - TODO confirm
* @param result   receives 16 floats; result[4 * i + j] is the dot product of a<i> with b<j>
*                 as it stands after the transpose step below
*/
inline void multiply(const float *__restrict affector, const float *__restrict affected, float *__restrict result)
{
// NOTE(review): a0..a3 and b0..b3 are used below but never declared in the visible text.
// Their declarations/loads appear to have been lost when this snippet was extracted
// (everything after a "std::cout" remnant is missing) - restore them from the original source.
//
// The next eight statements transpose the 4x4 block held in b0..b3.
// Illustrative values, assuming b0 = [1, 2, 3, 4], b1 = [5, 6, 7, 8],
// b2 = [9, 10, 11, 12], b3 = [13, 14, 15, 16]:
// need to arrive at-> b0 = [1, 5, 9, 13]
// need to arrive at-> b1 = [2, 6, 10, 14]
// need to arrive at-> b2 = [3, 7, 11, 15]
// need to arrive at-> b3 = [4, 8, 12, 16]
// tmp0 = [1, 5, 2, 6] (low halves of b0 and b1, interleaved)
__m128 tmp0 = _mm_unpacklo_ps(b0, b1);
// tmp1 = [3, 7, 4, 8] (high halves of b0 and b1, interleaved)
__m128 tmp1 = _mm_unpackhi_ps(b0, b1);
// tmp2 = [9, 13, 10, 14] (low halves of b2 and b3, interleaved)
__m128 tmp2 = _mm_unpacklo_ps(b2, b3);
// tmp3 = [11, 15, 12, 16] (high halves of b2 and b3, interleaved)
__m128 tmp3 = _mm_unpackhi_ps(b2, b3);
// b0 = [1, 5, 9, 13] = low half of tmp0 followed by low half of tmp2
b0 = _mm_movelh_ps(tmp0, tmp2);
// b1 = [2, 6, 10, 14] = high half of tmp0 followed by high half of tmp2
b1 = _mm_movehl_ps(tmp2, tmp0);
// b2 = [3, 7, 11, 15] = low half of tmp1 followed by low half of tmp3
b2 = _mm_movelh_ps(tmp1, tmp3);
// b3 = [4, 8, 12, 16] = high half of tmp1 followed by high half of tmp3
b3 = _mm_movehl_ps(tmp3, tmp1);
// Dot product of a0 = [x, y, z, d] with b0 = [1, 5, 9, 13]:
// the element-wise product is [x * 1, y * 5, z * 9, d * 13]
__m128 mul = _mm_mul_ps(a0, b0);
// First horizontal add sums adjacent pairs. Because both operands are `mul`,
// the result is [p0 + p1, p2 + p3, p0 + p1, p2 + p3] (p = the products above)
mul = _mm_hadd_ps(mul, mul);
// Second horizontal add leaves the full sum p0 + p1 + p2 + p3 in every lane
mul = _mm_hadd_ps(mul, mul);
// Extract lane 0 (the dot product) into result[0]
result[0] = _mm_cvtss_f32(mul);
// The same mul / hadd / hadd / extract sequence is repeated for the remaining
// 15 (a<i>, b<j>) pairs.
// a0 . b1
mul = _mm_mul_ps(a0, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[1] = _mm_cvtss_f32(mul);
// a0 . b2
mul = _mm_mul_ps(a0, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[2] = _mm_cvtss_f32(mul);
// a0 . b3
mul = _mm_mul_ps(a0, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[3] = _mm_cvtss_f32(mul);
// a1 . b0
mul = _mm_mul_ps(a1, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[4] = _mm_cvtss_f32(mul);
// a1 . b1
mul = _mm_mul_ps(a1, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[5] = _mm_cvtss_f32(mul);
// a1 . b2
mul = _mm_mul_ps(a1, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[6] = _mm_cvtss_f32(mul);
// a1 . b3
mul = _mm_mul_ps(a1, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[7] = _mm_cvtss_f32(mul);
// a2 . b0
mul = _mm_mul_ps(a2, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[8] = _mm_cvtss_f32(mul);
// a2 . b1
mul = _mm_mul_ps(a2, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[9] = _mm_cvtss_f32(mul);
// a2 . b2
mul = _mm_mul_ps(a2, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[10] = _mm_cvtss_f32(mul);
// a2 . b3
mul = _mm_mul_ps(a2, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[11] = _mm_cvtss_f32(mul);
// a3 . b0
mul = _mm_mul_ps(a3, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[12] = _mm_cvtss_f32(mul);
// a3 . b1
mul = _mm_mul_ps(a3, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[13] = _mm_cvtss_f32(mul);
// a3 . b2
mul = _mm_mul_ps(a3, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[14] = _mm_cvtss_f32(mul);
// a3 . b3
mul = _mm_mul_ps(a3, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[15] = _mm_cvtss_f32(mul);
}
< /code>
Запуск этой функции 1 000 000 раз занимает ~0,04 секунды.
Я рассчитывал, что использование инструкции скалярного произведения (dot product) ускорит вычисления, поскольку мне больше не нужно выполнять три шага:
1. Multiply
2. Do horizontal addition
3. Do another horizontal addition
< /code>
а вместо этого достаточно одного:
1. Single Dot product
< /code>
Вот реализация на SSE4.1:
/**
* Loop is unrolled for performance
* @attention As opposed to non-SIMD multiplication we're using column-major
*/
// SSE4.1 variant: 16 explicit dot products, each computed with a single _mm_dp_ps.
// result[4 * i + j] receives the dot product of a<i> with b<j> as it stands after
// the transpose step below. `affector` and `affected` are not referenced in the
// visible body; presumably they are the sources of a0..a3 and b0..b3 - TODO confirm.
inline void multiply(const float *__restrict affector, const float *__restrict affected, float *__restrict result)
{
// NOTE(review): a0..a3 and b0..b3 are used below but never declared in the visible text.
// Their declarations/loads appear to have been lost when this snippet was extracted
// (everything after a "std::cout" remnant is missing) - restore them from the original source.
//
// The next eight statements transpose the 4x4 block held in b0..b3.
// Illustrative values, assuming b0 = [1, 2, 3, 4], b1 = [5, 6, 7, 8],
// b2 = [9, 10, 11, 12], b3 = [13, 14, 15, 16]:
// need to arrive at-> b0 = [1, 5, 9, 13]
// need to arrive at-> b1 = [2, 6, 10, 14]
// need to arrive at-> b2 = [3, 7, 11, 15]
// need to arrive at-> b3 = [4, 8, 12, 16]
// tmp0 = [1, 5, 2, 6] (low halves of b0 and b1, interleaved)
__m128 tmp0 = _mm_unpacklo_ps(b0, b1);
// tmp1 = [3, 7, 4, 8] (high halves of b0 and b1, interleaved)
__m128 tmp1 = _mm_unpackhi_ps(b0, b1);
// tmp2 = [9, 13, 10, 14] (low halves of b2 and b3, interleaved)
__m128 tmp2 = _mm_unpacklo_ps(b2, b3);
// tmp3 = [11, 15, 12, 16] (high halves of b2 and b3, interleaved)
__m128 tmp3 = _mm_unpackhi_ps(b2, b3);
// b0 = [1, 5, 9, 13] = low half of tmp0 followed by low half of tmp2
b0 = _mm_movelh_ps(tmp0, tmp2);
// b1 = [2, 6, 10, 14] = high half of tmp0 followed by high half of tmp2
b1 = _mm_movehl_ps(tmp2, tmp0);
// b2 = [3, 7, 11, 15] = low half of tmp1 followed by low half of tmp3
b2 = _mm_movelh_ps(tmp1, tmp3);
// b3 = [4, 8, 12, 16] = high half of tmp1 followed by high half of tmp3
b3 = _mm_movehl_ps(tmp3, tmp1);
__m128 mul;
// One dot product per output element. Mask 0xF1: the high nibble (0xF) includes all
// four lanes in the sum of products; the low nibble (0x1) writes that sum to lane 0
// only and zeroes the other lanes. _mm_cvtss_f32 then extracts lane 0.
mul = _mm_dp_ps(a0, b0, 0xF1); // a0 . b0
result[0] = _mm_cvtss_f32(mul); // store lane 0
mul = _mm_dp_ps(a0, b1, 0xF1); // a0 . b1
result[1] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a0, b2, 0xF1); // a0 . b2
result[2] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a0, b3, 0xF1); // a0 . b3
result[3] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a1, b0, 0xF1); // a1 . b0
result[4] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a1, b1, 0xF1); // a1 . b1
result[5] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a1, b2, 0xF1); // a1 . b2
result[6] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a1, b3, 0xF1); // a1 . b3
result[7] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a2, b0, 0xF1); // a2 . b0
result[8] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a2, b1, 0xF1); // a2 . b1
result[9] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a2, b2, 0xF1); // a2 . b2
result[10] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a2, b3, 0xF1); // a2 . b3
result[11] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a3, b0, 0xF1); // a3 . b0
result[12] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a3, b1, 0xF1); // a3 . b1
result[13] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a3, b2, 0xF1); // a3 . b2
result[14] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a3, b3, 0xF1); // a3 . b3
result[15] = _mm_cvtss_f32(mul);
}
< /code>
Результат: ~0,15 секунды!!! Это даже медленнее, чем моя реализация без интринсиков (~0,11-0,12 секунды) и реализация на SSE2 (~0,10-0,9 секунды). Что происходит? Связано ли это с тем, как скалярное произведение (инструкция dpps) реализовано на низком уровне, или дело в чём-то другом?
Подробнее здесь: https://stackoverflow.com/questions/794 ... iplication
SSE4.1 медленнее, чем SSE3 на умножении матрицы 4x4? ⇐ C++
-
- Похожие темы
- Ответы
- Просмотры
- Последнее сообщение