SSE4.1 медленнее, чем SSE3 на умножении матрицы 4x4? - Цифровое Кемерово

SSE4.1 медленнее, чем SSE3 на умножении матрицы 4x4? ⇐ C++

Ответить Пред. тема След. тема

1 сообщение • Страница 1 из 1

Anonymous

SSE4.1 медленнее, чем SSE3 на умножении матрицы 4x4?

Цитата

Сообщение Anonymous » 22 фев 2025, 00:01

Итак, у меня есть такая реализация умножения матриц на SSE3:
/**
* Multiplies two 4x4 float matrices with SSE3 intrinsics. Every output element
* is a 4-lane dot product built from one multiply and two horizontal adds.
* The loop is unrolled for performance.
*
* @param affector  presumably the source of the a0..a3 registers (16 floats);
*                  the loads are not visible in this excerpt - TODO confirm
* @param affected  presumably the source of the b0..b3 registers (16 floats);
*                  the loads are not visible in this excerpt - TODO confirm
* @param result    receives 16 floats: result[4 * i + j] = dot(a_i, b_j), where
*                  b_j is taken after the in-register transpose below
* @attention As opposed to non-SIMD multiplication we're using column-major
*/
inline void multiply(const float *__restrict affector, const float *__restrict affected, float *__restrict result)
{
// NOTE(review): the declarations/loads of a0..a3 and b0..b3 are missing from
// this excerpt (text appears to have been lost here); they must be restored
// before this function can compile.
//
// The comments below use the example b0 = [1, 2, 3, 4], b1 = [5, 6, 7, 8],
// b2 = [9, 10, 11, 12], b3 = [13, 14, 15, 16]. b0..b3 are transposed in registers:
// need to arrive at-> b0 = [1, 5, 9, 13]
// need to arrive at-> b1 = [2, 6, 10, 14]
// need to arrive at-> b2 = [3, 7, 11, 15]
// need to arrive at-> b3 = [4, 8, 12, 16]

// Step 1: interleave pairs of registers.
// tmp0 = [1, 5, 2, 6]
__m128 tmp0 = _mm_unpacklo_ps(b0, b1);
// tmp1 = [3, 7, 4, 8]
__m128 tmp1 = _mm_unpackhi_ps(b0, b1);
// tmp2 = [9, 13, 10, 14]
__m128 tmp2 = _mm_unpacklo_ps(b2, b3);
// tmp3 = [11, 15, 12, 16]
__m128 tmp3 = _mm_unpackhi_ps(b2, b3);

// Step 2: combine 64-bit halves to finish the transpose.
// b0 = [1, 5, 9, 13] = low half of tmp0 followed by low half of tmp2
b0 = _mm_movelh_ps(tmp0, tmp2);
// b1 = [2, 6, 10, 14] = high half of tmp0 followed by high half of tmp2
b1 = _mm_movehl_ps(tmp2, tmp0);
// b2 = [3, 7, 11, 15] = low half of tmp1 followed by low half of tmp3
b2 = _mm_movelh_ps(tmp1, tmp3);
// b3 = [4, 8, 12, 16] = high half of tmp1 followed by high half of tmp3
b3 = _mm_movehl_ps(tmp3, tmp1);

// Dot product of a0 = [x, y, z, d] with b0 = [1, 5, 9, 13].
// Lane-wise product: [x * 1, y * 5, z * 9, d * 13]
__m128 mul = _mm_mul_ps(a0, b0);
// First horizontal add of the register with itself:
// [x*1 + y*5, z*9 + d*13, x*1 + y*5, z*9 + d*13]
mul = _mm_hadd_ps(mul, mul);
// Second horizontal add: every lane now holds x*1 + y*5 + z*9 + d*13
mul = _mm_hadd_ps(mul, mul);
// Extract lane 0 into result[0]
result[0] = _mm_cvtss_f32(mul);

// Perform the same for the rest of the matrix elements

// result[1..3]: a0 against b1..b3
mul = _mm_mul_ps(a0, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[1] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a0, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[2] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a0, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[3] = _mm_cvtss_f32(mul);

// result[4..7]: a1 against b0..b3
mul = _mm_mul_ps(a1, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[4] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a1, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[5] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a1, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[6] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a1, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[7] = _mm_cvtss_f32(mul);

// result[8..11]: a2 against b0..b3
mul = _mm_mul_ps(a2, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[8] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a2, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[9] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a2, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[10] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a2, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[11] = _mm_cvtss_f32(mul);

// result[12..15]: a3 against b0..b3
mul = _mm_mul_ps(a3, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[12] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a3, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[13] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a3, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[14] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a3, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[15] = _mm_cvtss_f32(mul);
}
Выполнение этой функции 1 000 000 раз занимает ~0,04 секунды.
Я думал, что использование скалярного произведения (dot product) ускорит работу, так как мне не придётся выполнять:
1. Multiply
2. Do horizontal addition
3. Do another horizontal addition
а вместо этого только:
1. Single Dot product
Вот реализация на SSE4.1:
/**
* Loop is unrolled for performance
* @attention As opposed to non-SIMD multiplication we're using column-major
*/
inline void multiply(const float *__restrict affector, const float *__restrict affected, float *__restrict result)
{
// SSE4.1 variant of the 4x4 float matrix multiplication: every output element
// is produced by a single _mm_dp_ps dot product, so that
// result[4 * i + j] = dot(a_i, b_j), with b_j taken after the transpose below.
//
// NOTE(review): the declarations/loads of a0..a3 and b0..b3 are missing from
// this excerpt (text appears to have been lost here); presumably they are
// loaded from affector and affected - restore them before compiling.
//
// The comments below use the example b0 = [1, 2, 3, 4], b1 = [5, 6, 7, 8],
// b2 = [9, 10, 11, 12], b3 = [13, 14, 15, 16]. b0..b3 are transposed in registers:
// need to arrive at-> b0 = [1, 5, 9, 13]
// need to arrive at-> b1 = [2, 6, 10, 14]
// need to arrive at-> b2 = [3, 7, 11, 15]
// need to arrive at-> b3 = [4, 8, 12, 16]

// Step 1: interleave pairs of registers.
// tmp0 = [1, 5, 2, 6]
__m128 tmp0 = _mm_unpacklo_ps(b0, b1);
// tmp1 = [3, 7, 4, 8]
__m128 tmp1 = _mm_unpackhi_ps(b0, b1);
// tmp2 = [9, 13, 10, 14]
__m128 tmp2 = _mm_unpacklo_ps(b2, b3);
// tmp3 = [11, 15, 12, 16]
__m128 tmp3 = _mm_unpackhi_ps(b2, b3);

// Step 2: combine 64-bit halves to finish the transpose.
// b0 = [1, 5, 9, 13] = low half of tmp0 followed by low half of tmp2
b0 = _mm_movelh_ps(tmp0, tmp2);
// b1 = [2, 6, 10, 14] = high half of tmp0 followed by high half of tmp2
b1 = _mm_movehl_ps(tmp2, tmp0);
// b2 = [3, 7, 11, 15] = low half of tmp1 followed by low half of tmp3
b2 = _mm_movelh_ps(tmp1, tmp3);
// b3 = [4, 8, 12, 16] = high half of tmp1 followed by high half of tmp3
b3 = _mm_movehl_ps(tmp3, tmp1);

__m128 mul;

// Mask 0xF1: the high nibble (0xF) includes all four lanes in the product,
// the low nibble (0x1) writes the sum to lane 0 only (other lanes are zeroed).
// _mm_cvtss_f32 then extracts lane 0.

// result[0..3]: a0 against b0..b3
mul = _mm_dp_ps(a0, b0, 0xF1); // Dot product of a0 and b0 over all four lanes, sum in lane 0
result[0] = _mm_cvtss_f32(mul); // Store result

mul = _mm_dp_ps(a0, b1, 0xF1); // Dot product of a0 and b1
result[1] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a0, b2, 0xF1); // Dot product of a0 and b2
result[2] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a0, b3, 0xF1); // Dot product of a0 and b3
result[3] = _mm_cvtss_f32(mul);

// result[4..7]: a1 against b0..b3
mul = _mm_dp_ps(a1, b0, 0xF1); // Dot product of a1 and b0
result[4] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a1, b1, 0xF1); // Dot product of a1 and b1
result[5] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a1, b2, 0xF1); // Dot product of a1 and b2
result[6] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a1, b3, 0xF1); // Dot product of a1 and b3
result[7] = _mm_cvtss_f32(mul);

// result[8..11]: a2 against b0..b3
mul = _mm_dp_ps(a2, b0, 0xF1); // Dot product of a2 and b0
result[8] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a2, b1, 0xF1); // Dot product of a2 and b1
result[9] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a2, b2, 0xF1); // Dot product of a2 and b2
result[10] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a2, b3, 0xF1); // Dot product of a2 and b3
result[11] = _mm_cvtss_f32(mul);

// result[12..15]: a3 against b0..b3
mul = _mm_dp_ps(a3, b0, 0xF1); // Dot product of a3 and b0
result[12] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a3, b1, 0xF1); // Dot product of a3 and b1
result[13] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a3, b2, 0xF1); // Dot product of a3 and b2
result[14] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a3, b3, 0xF1); // Dot product of a3 and b3
result[15] = _mm_cvtss_f32(mul);
}
Результат: ~0,15 секунды!!! Это даже медленнее, чем моя реализация без интринсиков (~0,11-0,12 секунды) и чем реализация на SSE2 (~0,10-0,9 секунды). Что происходит? Это связано с тем, как скалярное произведение реализовано на низком уровне, или дело в чём-то другом?

Подробнее здесь: https://stackoverflow.com/questions/794 ... iplication

Реклама

1740171712

Anonymous

Итак, у меня есть такая реализация умножения матриц на SSE3:
/**
* Multiplies two 4x4 float matrices with SSE3 intrinsics. Every output element
* is a 4-lane dot product built from one multiply and two horizontal adds.
* The loop is unrolled for performance.
*
* @param affector  presumably the source of the a0..a3 registers (16 floats);
*                  the loads are not visible in this excerpt - TODO confirm
* @param affected  presumably the source of the b0..b3 registers (16 floats);
*                  the loads are not visible in this excerpt - TODO confirm
* @param result    receives 16 floats: result[4 * i + j] = dot(a_i, b_j), where
*                  b_j is taken after the in-register transpose below
* @attention As opposed to non-SIMD multiplication we're using column-major
*/
inline void multiply(const float *__restrict affector, const float *__restrict affected, float *__restrict result)
{
// NOTE(review): the declarations/loads of a0..a3 and b0..b3 are missing from
// this excerpt (text appears to have been lost here); they must be restored
// before this function can compile.
//
// The comments below use the example b0 = [1, 2, 3, 4], b1 = [5, 6, 7, 8],
// b2 = [9, 10, 11, 12], b3 = [13, 14, 15, 16]. b0..b3 are transposed in registers:
// need to arrive at-> b0 = [1, 5, 9, 13]
// need to arrive at-> b1 = [2, 6, 10, 14]
// need to arrive at-> b2 = [3, 7, 11, 15]
// need to arrive at-> b3 = [4, 8, 12, 16]

// Step 1: interleave pairs of registers.
// tmp0 = [1, 5, 2, 6]
__m128 tmp0 = _mm_unpacklo_ps(b0, b1);
// tmp1 = [3, 7, 4, 8]
__m128 tmp1 = _mm_unpackhi_ps(b0, b1);
// tmp2 = [9, 13, 10, 14]
__m128 tmp2 = _mm_unpacklo_ps(b2, b3);
// tmp3 = [11, 15, 12, 16]
__m128 tmp3 = _mm_unpackhi_ps(b2, b3);

// Step 2: combine 64-bit halves to finish the transpose.
// b0 = [1, 5, 9, 13] = low half of tmp0 followed by low half of tmp2
b0 = _mm_movelh_ps(tmp0, tmp2);
// b1 = [2, 6, 10, 14] = high half of tmp0 followed by high half of tmp2
b1 = _mm_movehl_ps(tmp2, tmp0);
// b2 = [3, 7, 11, 15] = low half of tmp1 followed by low half of tmp3
b2 = _mm_movelh_ps(tmp1, tmp3);
// b3 = [4, 8, 12, 16] = high half of tmp1 followed by high half of tmp3
b3 = _mm_movehl_ps(tmp3, tmp1);

// Dot product of a0 = [x, y, z, d] with b0 = [1, 5, 9, 13].
// Lane-wise product: [x * 1, y * 5, z * 9, d * 13]
__m128 mul = _mm_mul_ps(a0, b0);
// First horizontal add of the register with itself:
// [x*1 + y*5, z*9 + d*13, x*1 + y*5, z*9 + d*13]
mul = _mm_hadd_ps(mul, mul);
// Second horizontal add: every lane now holds x*1 + y*5 + z*9 + d*13
mul = _mm_hadd_ps(mul, mul);
// Extract lane 0 into result[0]
result[0] = _mm_cvtss_f32(mul);

// Perform the same for the rest of the matrix elements

// result[1..3]: a0 against b1..b3
mul = _mm_mul_ps(a0, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[1] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a0, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[2] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a0, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[3] = _mm_cvtss_f32(mul);

// result[4..7]: a1 against b0..b3
mul = _mm_mul_ps(a1, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[4] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a1, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[5] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a1, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[6] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a1, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[7] = _mm_cvtss_f32(mul);

// result[8..11]: a2 against b0..b3
mul = _mm_mul_ps(a2, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[8] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a2, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[9] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a2, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[10] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a2, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[11] = _mm_cvtss_f32(mul);

// result[12..15]: a3 against b0..b3
mul = _mm_mul_ps(a3, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[12] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a3, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[13] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a3, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[14] = _mm_cvtss_f32(mul);

mul = _mm_mul_ps(a3, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[15] = _mm_cvtss_f32(mul);
}
Выполнение этой функции 1 000 000 раз занимает ~0,04 секунды.
Я думал, что использование скалярного произведения (dot product) ускорит работу, так как мне не придётся выполнять:
1. Multiply
2. Do horizontal addition
3. Do another horizontal addition
а вместо этого только:
1.  Single Dot product
Вот реализация на SSE4.1:
/**
* Loop is unrolled for performance
* @attention As opposed to non-SIMD multiplication we're using column-major
*/
inline void multiply(const float *__restrict affector, const float *__restrict affected, float *__restrict result)
{
// SSE4.1 variant of the 4x4 float matrix multiplication: every output element
// is produced by a single _mm_dp_ps dot product, so that
// result[4 * i + j] = dot(a_i, b_j), with b_j taken after the transpose below.
//
// NOTE(review): the declarations/loads of a0..a3 and b0..b3 are missing from
// this excerpt (text appears to have been lost here); presumably they are
// loaded from affector and affected - restore them before compiling.
//
// The comments below use the example b0 = [1, 2, 3, 4], b1 = [5, 6, 7, 8],
// b2 = [9, 10, 11, 12], b3 = [13, 14, 15, 16]. b0..b3 are transposed in registers:
// need to arrive at-> b0 = [1, 5, 9, 13]
// need to arrive at-> b1 = [2, 6, 10, 14]
// need to arrive at-> b2 = [3, 7, 11, 15]
// need to arrive at-> b3 = [4, 8, 12, 16]

// Step 1: interleave pairs of registers.
// tmp0 = [1, 5, 2, 6]
__m128 tmp0 = _mm_unpacklo_ps(b0, b1);
// tmp1 = [3, 7, 4, 8]
__m128 tmp1 = _mm_unpackhi_ps(b0, b1);
// tmp2 = [9, 13, 10, 14]
__m128 tmp2 = _mm_unpacklo_ps(b2, b3);
// tmp3 = [11, 15, 12, 16]
__m128 tmp3 = _mm_unpackhi_ps(b2, b3);

// Step 2: combine 64-bit halves to finish the transpose.
// b0 = [1, 5, 9, 13] = low half of tmp0 followed by low half of tmp2
b0 = _mm_movelh_ps(tmp0, tmp2);
// b1 = [2, 6, 10, 14] = high half of tmp0 followed by high half of tmp2
b1 = _mm_movehl_ps(tmp2, tmp0);
// b2 = [3, 7, 11, 15] = low half of tmp1 followed by low half of tmp3
b2 = _mm_movelh_ps(tmp1, tmp3);
// b3 = [4, 8, 12, 16] = high half of tmp1 followed by high half of tmp3
b3 = _mm_movehl_ps(tmp3, tmp1);

__m128 mul;

// Mask 0xF1: the high nibble (0xF) includes all four lanes in the product,
// the low nibble (0x1) writes the sum to lane 0 only (other lanes are zeroed).
// _mm_cvtss_f32 then extracts lane 0.

// result[0..3]: a0 against b0..b3
mul = _mm_dp_ps(a0, b0, 0xF1);  // Dot product of a0 and b0 over all four lanes, sum in lane 0
result[0] = _mm_cvtss_f32(mul); // Store result

mul = _mm_dp_ps(a0, b1, 0xF1); // Dot product of a0 and b1
result[1] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a0, b2, 0xF1); // Dot product of a0 and b2
result[2] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a0, b3, 0xF1); // Dot product of a0 and b3
result[3] = _mm_cvtss_f32(mul);

// result[4..7]: a1 against b0..b3
mul = _mm_dp_ps(a1, b0, 0xF1); // Dot product of a1 and b0
result[4] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a1, b1, 0xF1); // Dot product of a1 and b1
result[5] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a1, b2, 0xF1); // Dot product of a1 and b2
result[6] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a1, b3, 0xF1); // Dot product of a1 and b3
result[7] = _mm_cvtss_f32(mul);

// result[8..11]: a2 against b0..b3
mul = _mm_dp_ps(a2, b0, 0xF1); // Dot product of a2 and b0
result[8] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a2, b1, 0xF1); // Dot product of a2 and b1
result[9] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a2, b2, 0xF1); // Dot product of a2 and b2
result[10] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a2, b3, 0xF1); // Dot product of a2 and b3
result[11] = _mm_cvtss_f32(mul);

// result[12..15]: a3 against b0..b3
mul = _mm_dp_ps(a3, b0, 0xF1); // Dot product of a3 and b0
result[12] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a3, b1, 0xF1); // Dot product of a3 and b1
result[13] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a3, b2, 0xF1); // Dot product of a3 and b2
result[14] = _mm_cvtss_f32(mul);

mul = _mm_dp_ps(a3, b3, 0xF1); // Dot product of a3 and b3
result[15] = _mm_cvtss_f32(mul);
}
Результат: ~0,15 секунды!!! Это даже медленнее, чем моя реализация без интринсиков (~0,11-0,12 секунды) и чем реализация на SSE2 (~0,10-0,9 секунды). Что происходит? Это связано с тем, как скалярное произведение реализовано на низком уровне, или дело в чём-то другом?

Подробнее здесь: [url]https://stackoverflow.com/questions/79458611/sse4-1-slower-than-sse3-on-matrix-4x4-multiplication[/url]

Ответить Пред. тема След. тема

1 сообщение • Страница 1 из 1

Быстрый ответ

Заголовок:

Имя пользователя:

Изменение регистра текста:

Смайлики

Ещё смайлики…

К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми. Можно прикреплять файлы, перетаскивая их в окно сообщения.

Максимально разрешённый размер вложения: 15 МБ.

Имя файла:

Комментарий к файлу:

Имя файла	Комментарий к файлу	Размер	Статус

Похожие темы

Ответы

Просмотры

Последнее сообщение

SSE4.1 медленнее, чем SSE3 на умножении матрицы 4x4?

Последнее сообщение Anonymous « 22 фев 2025, 01:15
Добавлено в форуме C++

Anonymous » 22 фев 2025, 01:15 » в форуме C++

Итак, у меня есть такая реализация умножения матриц на SSE3:
/**
* Loop is unwraped for performance
* @attention As opposed to non-SIMD multiplication we're using column-major
*/
inline void multiply(const float *__restrict affector, const float...

0 Ответы

7 Просмотры

Последнее сообщение Anonymous
22 фев 2025, 01:15
SSE4.1 медленнее, чем SSE3 при умножении матрицы 4x4?

Последнее сообщение Anonymous « 22 фев 2025, 23:52
Добавлено в форуме C++

Anonymous » 22 фев 2025, 23:52 » в форуме C++

Итак, у меня есть такая реализация умножения матриц на SSE3:
/**
* Loop is unwrapped for performance
* @attention As opposed to non-SIMD multiplication we're using column-major
*/
inline void multiply(const float *__restrict affector, const float...

0 Ответы

5 Просмотры

Последнее сообщение Anonymous
22 фев 2025, 23:52
Как сравнить однородные матрицы преобразования 4x4: матрица GT (с точкой поворота) и матрица ICP (с центроидом)

Последнее сообщение Anonymous « 08 окт 2024, 23:46
Добавлено в форуме Python

Anonymous » 08 окт 2024, 23:46 » в форуме Python

Я хочу вычислить ошибку вращения/перевода между двумя матрицами преобразования 4x4 gt_mat и est_mat, используя приведенный ниже код Python:
def get_angular_error(R_gt, R_est):
import math

Get angular error

try:
A = (np.trace(np.dot(R_gt.T,...

0 Ответы

24 Просмотры

Последнее сообщение Anonymous
08 окт 2024, 23:46
Почему сборка разреженной матрицы выполняется медленнее, чем сборка плотной (полной) матрицы?

Последнее сообщение Anonymous « 20 янв 2025, 20:43
Добавлено в форуме Python

Anonymous » 20 янв 2025, 20:43 » в форуме Python

У меня есть две функции (i) getKglobal и (ii) getKglobal_Sp. Однако время для разреженной функции примерно в 5 раз медленнее, чем для полноматричной формулировки. Я понимаю, что добиться производительности от разреженности может быть непросто. Но...

0 Ответы

24 Просмотры

Последнее сообщение Anonymous
20 янв 2025, 20:43
Почему сборка разреженной матрицы выполняется медленнее, чем сборка плотной (полной) матрицы?

Последнее сообщение Anonymous « 20 янв 2025, 22:09
Добавлено в форуме Python

Anonymous » 20 янв 2025, 22:09 » в форуме Python

У меня есть две функции (i) getKglobal и (ii) getKglobal_Sp. Однако время для разреженной функции примерно в 5 раз медленнее, чем для полноматричной формулировки. Я понимаю, что добиться производительности от разреженности может быть непросто. Но...

0 Ответы

18 Просмотры

Последнее сообщение Anonymous
20 янв 2025, 22:09

Вернуться в «C++»

Programmiererforum