SSE4.1 медленнее, чем SSE3 при умножении матрицы 4x4? (C++)

Программы на C++. Форум разработчиков
Ответить Пред. темаСлед. тема
Anonymous
 SSE4.1 медленнее, чем SSE3 при умножении матрицы 4x4?

Сообщение Anonymous »

Итак, у меня есть вот такая реализация умножения матриц на SSE3:
/**
* Loop is unwrapped for performance
* @attention As opposed to non-SIMD multiplication we're using column-major
*/
inline void multiply(const float *__restrict affector, const float *__restrict affected, float *__restrict result)
{
// std::cout b1 = [2, 6, 10, 14]
// need to arrive at-> b2 = [3, 7, 11, 15]
// need to arrive at-> b3 = [4, 8, 12, 16]

// tmp0 = [1, 5, 2, 6]
__m128 tmp0 = _mm_unpacklo_ps(b0, b1);
// tmp1 = [3, 7, 4, 8]
__m128 tmp1 = _mm_unpackhi_ps(b0, b1);
// tmp2 = [9, 13, 10, 14]
__m128 tmp2 = _mm_unpacklo_ps(b2, b3);
// tmp3 = [11, 15, 12, 16]
__m128 tmp3 = _mm_unpackhi_ps(b2, b3);

// b0 = [1, 5, ....] = move tmp2 low into tmp0 high
b0 = _mm_movelh_ps(tmp0, tmp2);
// b1 = [...., 10, 14] = move tmp0 high into tmp tmp2 low
b1 = _mm_movehl_ps(tmp2, tmp0);
// b2 = [3, 7, ....] = move tmp3 lows into tmp1 highs
b2 = _mm_movelh_ps(tmp1, tmp3);
// b3 = [...., 12, 16] = move tmp1 highs into tmp3 lows
b3 = _mm_movehl_ps(tmp3, tmp1);

// Need to perform dot product [x, y, z, d] * [1, 5, 9, 10]
// This results in [x + 1, y + 5, z + 9, d + 10]
__m128 mul = _mm_mul_ps(a0, b0);
// Perform horizontal addition to sum of all of these values
// This results in [x + 1 + y + 5, z + 9 + d + 10, 0.0, 0.0]
mul = _mm_hadd_ps(mul, mul);
// This results in [x + 1 + y + 5 + z + 9 + d + 10, 0.0, 0.0, 0.0]
mul = _mm_hadd_ps(mul, mul);
// Retrieve the result into result[0]
result[0] = _mm_cvtss_f32(mul);

// Perform the same for the rest of the matrix elements

mul = _mm_mul_ps(a0, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[1] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a0, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[2] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a0, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[3] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a1, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[4] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a1, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[5] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a1, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[6] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a1, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[7] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a2, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[8] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a2, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[9] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a2, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[10] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a2, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[11] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a3, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[12] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a3, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[13] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a3, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[14] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a3, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[15] = _mm_cvtss_f32(mul);
}
< /code>
Запуск этой функции 1 000 000 раз занимает ~0,04 секунды. В SSE4.1 появилась инструкция скалярного (точечного) произведения `_mm_dp_ps`, благодаря которой больше не нужно делать:
1. Multiply
2. Do horizontal addition
3. Do another horizontal addition
< /code>
— вместо этого достаточно:
1. Single Dot product
< /code>
Вот реализация на SSE4.1:
* Loop is unwrapped for performance
* @attention As opposed to non-SIMD multiplication we're using column-major
*/
inline void multiply(const float *__restrict affector, const float *__restrict affected, float *__restrict result)
{
// std::cout b1 = [2, 6, 10, 14]
// need to arrive at-> b2 = [3, 7, 11, 15]
// need to arrive at-> b3 = [4, 8, 12, 16]

// tmp0 = [1, 5, 2, 6]
__m128 tmp0 = _mm_unpacklo_ps(b0, b1);
// tmp1 = [3, 7, 4, 8]
__m128 tmp1 = _mm_unpackhi_ps(b0, b1);
// tmp2 = [9, 13, 10, 14]
__m128 tmp2 = _mm_unpacklo_ps(b2, b3);
// tmp3 = [11, 15, 12, 16]
__m128 tmp3 = _mm_unpackhi_ps(b2, b3);

// b0 = [1, 5, ....] = move tmp2 low into tmp0 high
b0 = _mm_movelh_ps(tmp0, tmp2);
// b1 = [...., 10, 14] = move tmp0 high into tmp tmp2 low
b1 = _mm_movehl_ps(tmp2, tmp0);
// b2 = [3, 7, ....] = move tmp3 lows into tmp1 highs
b2 = _mm_movelh_ps(tmp1, tmp3);
// b3 = [...., 12, 16] = move tmp1 highs into tmp3 lows
b3 = _mm_movehl_ps(tmp3, tmp1);

__m128 mul;

// Perform the matrix multiplication for each element
mul = _mm_dp_ps(a0, b0, 0xF1); // Dot product of a0 and b0, 0xF1 means all four elements
result[0] = _mm_cvtss_f32(mul); // Store result

mul = _mm_dp_ps(a0, b1, 0xF1); // Dot product of a0 and b1
result[1] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a0, b2, 0xF1); // Dot product of a0 and b2
result[2] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a0, b3, 0xF1); // Dot product of a0 and b3
result[3] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a1, b0, 0xF1); // Dot product of a1 and b0
result[4] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a1, b1, 0xF1); // Dot product of a1 and b1
result[5] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a1, b2, 0xF1); // Dot product of a1 and b2
result[6] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a1, b3, 0xF1); // Dot product of a1 and b3
result[7] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a2, b0, 0xF1); // Dot product of a2 and b0
result[8] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a2, b1, 0xF1); // Dot product of a2 and b1
result[9] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a2, b2, 0xF1); // Dot product of a2 and b2
result[10] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a2, b3, 0xF1); // Dot product of a2 and b3
result[11] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a3, b0, 0xF1); // Dot product of a3 and b0
result[12] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a3, b1, 0xF1); // Dot product of a3 and b1
result[13] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a3, b2, 0xF1); // Dot product of a3 and b2
result[14] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a3, b3, 0xF1); // Dot product of a3 and b3
result[15] = _mm_cvtss_f32(mul);
}
< /code>
Результат: ~0,15 секунды!!! Это даже медленнее, чем моя реализация вообще без интринсиков (~0,11–0,12 секунды) и чем версия на SSE2 (~0,10 секунды). Что происходит? Связано ли это с тем, как инструкция скалярного произведения (DPPS) реализована на уровне микроархитектуры, или я делаю что-то не так?

Подробнее здесь: https://stackoverflow.com/questions/794 ... iplication
Реклама
Ответить Пред. темаСлед. тема

Быстрый ответ

Изменение регистра текста: 
Смайлики
:) :( :oops: :roll: :wink: :muza: :clever: :sorry: :angel: :read: *x)
Ещё смайлики…
   
К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми.

Максимально разрешённый размер вложения: 15 МБ.

  • Похожие темы
    Ответы
    Просмотры
    Последнее сообщение

Вернуться в «C++»