C++ Unix, почему блокировка мьютекса с использованием двух потоков на разных ядрах происходит в 5 раз медленнее, чем ког

C++ Unix, почему блокировка мьютекса с использованием двух потоков на разных ядрах происходит в 5 раз медленнее, чем ког ⇐ C++

1 сообщение • Страница 1 из 1

Anonymous

C++ Unix, почему блокировка мьютекса с использованием двух потоков на разных ядрах происходит в 5 раз медленнее, чем ког

Цитата

Сообщение Anonymous » 13 ноя 2025, 14:20

У меня есть приведённая ниже программа для измерения стоимости condition_variable::wait(lock, predicate). Я тестирую два сценария:

Оба потока закреплены за одним ядром.
1 поток закреплен за ядром 0, другой — за ядром 1.

Ubuntu 20.04, g++13: g++ -O3 -std=c++17 main.cpp -o main
=== Ping-Pong Context Switch Latency ===
Iterations: 10000, core = 0, total time = 45566 us, approx 2278.3 ns per context switch
Iterations: 50000, core = 0, total time = 242373 us, approx 2423.73 ns per context switch
Iterations: 100000, core = 0, total time = 505572 us, approx 2527.86 ns per context switch
Iterations: 10000, core = 1, total time = 165995 us, approx 8299.75 ns per context switch
Iterations: 50000, core = 1, total time = 834319 us, approx 8343.19 ns per context switch
Iterations: 100000, core = 1, total time = 1602175 us, approx 8010.88 ns per context switch

Второй случай в 5 раз медленнее первого. Что именно вызывает эту разницу? В основном это связано с перезагрузкой кэша (при каждом выполнении turn ^= 1 строка кэша вытесняется из кэша другого ядра ЦП) или с чем-то ещё?
Не похоже, что затраты связаны с вытеснением кэша, поскольку дополнительные затраты составляют порядка ~6000 нс/итерация, что на 1,5 порядка больше, чем типичная стоимость промаха кэша.
#include
#include

// Minimal stopwatch built on std::chrono::steady_clock.
//
// Call startCounter() to mark the beginning of an interval; the getters then
// report the time elapsed since the most recent startCounter() call.
class MyTimer {
private:
    // Start of the measured interval. Until startCounter() is called this is
    // the default-constructed time_point (the clock's epoch).
    std::chrono::time_point<std::chrono::steady_clock> starter;

public:
    // Records "now" as the start of the interval.
    void startCounter() {
        starter = std::chrono::steady_clock::now();
    }

    // Returns the whole nanoseconds elapsed since startCounter().
    int64_t getCounterNs() {
        const auto elapsed = std::chrono::steady_clock::now() - starter;
        return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
    }

    // Returns the elapsed time in milliseconds, including the fractional part.
    // The division is done in double and then narrowed to float.
    float getCounterMsPrecise() {
        return static_cast<float>(getCounterNs() / 1000000.0);
    }
};

#include
#include
#include
#include
#include
#include
#include

#include <pthread.h> // for pthread_setaffinity_np
#include <sched.h> // for cpu_set_t, CPU_SET, CPU_ZERO

using namespace std;

// ============================================================================
// Helper: pin current thread to a specific CPU core (Linux-only)
// ============================================================================
// Restricts the calling thread's CPU affinity to the single core `cpu_index`
// using pthread_setaffinity_np.
//
// Best effort only: the result code is discarded, so when the call fails the
// thread simply keeps the affinity it had before.
void pin_to_cpu(int cpu_index)
{
    cpu_set_t allowed;
    CPU_ZERO(&allowed);
    CPU_SET(cpu_index, &allowed);

    // The return value is deliberately dropped -- this is demo code.
    (void)pthread_setaffinity_np(pthread_self(), sizeof(allowed), &allowed);
}

long long measure_ping_pong(int iterations, int core = 0)
{
// Shared state between the two threads
struct Shared {
mutex m;
condition_variable cv;
bool turn = false; // false = A's turn, true = B's turn
bool start = false;
} shared;

thread thread1([&] {
pin_to_cpu(0);
unique_lock lk(shared.m);
shared.cv.wait(lk, [&] {
return shared.start == true;
});
lk.unlock();

for (int i = 0; i < iterations; i++) {
lk.lock();
shared.cv.wait(lk, [&] {
return shared.turn == false;
});
shared.turn ^= 1;
lk.unlock();
shared.cv.notify_one();
}
});

thread thread2([&] {
pin_to_cpu(core);
unique_lock lk(shared.m);
shared.cv.wait(lk, [&] {
return shared.start == true;
});
lk.unlock();

for (int i = 0; i < iterations; i++) {
lk.lock();
shared.cv.wait(lk, [&] {
return shared.turn == true;
});
shared.turn ^= 1;
lk.unlock();
shared.cv.notify_one();
}
});

using clock = std::chrono::steady_clock;
auto t0 = clock::now();

unique_lock lk(shared.m);
shared.start = true;
lk.unlock();
shared.cv.notify_all();
thread1.join();
thread2.join();

auto t1 = clock::now();
long long us = std::chrono::duration_cast(t1 - t0).count();
return us;
}

int main()
{
cout Args({50'000, 0})
// ->Args({100'000, 0})
// ->Args({10'000, 1})
// ->Args({50'000, 1})
// ->Args({100'000, 1});

Подробнее здесь: https://stackoverflow.com/questions/798 ... r-than-whe

1763032848

Anonymous

У меня есть приведённая ниже программа для измерения стоимости condition_variable::wait(lock, predicate). Я тестирую два сценария:
[list]
[*]Оба потока закреплены за одним ядром.
[*]1 поток закреплен за ядром 0, другой — за ядром 1.
[/list]
Ubuntu 20.04, g++13: g++ -O3 -std=c++17 main.cpp -o main
=== Ping-Pong Context Switch Latency ===
Iterations: 10000, core = 0, total time = 45566 us, approx 2278.3 ns per context switch
Iterations: 50000, core = 0, total time = 242373 us, approx 2423.73 ns per context switch
Iterations: 100000, core = 0, total time = 505572 us, approx 2527.86 ns per context switch
Iterations: 10000, core = 1, total time = 165995 us, approx 8299.75 ns per context switch
Iterations: 50000, core = 1, total time = 834319 us, approx 8343.19 ns per context switch
Iterations: 100000, core = 1, total time = 1602175 us, approx 8010.88 ns per context switch

Второй случай в 5 раз медленнее первого. Что именно вызывает эту разницу? В основном это связано с перезагрузкой кэша (при каждом выполнении turn ^= 1 строка кэша вытесняется из кэша другого ядра ЦП) или с чем-то ещё?
Не похоже, что затраты связаны с вытеснением кэша, поскольку дополнительные затраты составляют порядка ~6000 нс/итерация, что на 1,5 порядка больше, чем типичная стоимость промаха кэша.
#include 
#include 

// Minimal stopwatch built on std::chrono::steady_clock.
//
// Call startCounter() to mark the beginning of an interval; the getters then
// report the time elapsed since the most recent startCounter() call.
class MyTimer {
private:
    // Start of the measured interval. Until startCounter() is called this is
    // the default-constructed time_point (the clock's epoch).
    std::chrono::time_point<std::chrono::steady_clock> starter;

public:
    // Records "now" as the start of the interval.
    void startCounter() {
        starter = std::chrono::steady_clock::now();
    }

    // Returns the whole nanoseconds elapsed since startCounter().
    int64_t getCounterNs() {
        const auto elapsed = std::chrono::steady_clock::now() - starter;
        return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
    }

    // Returns the elapsed time in milliseconds, including the fractional part.
    // The division is done in double and then narrowed to float.
    float getCounterMsPrecise() {
        return static_cast<float>(getCounterNs() / 1000000.0);
    }
};

#include 
#include 
#include 
#include 
#include 
#include 
#include 

#include <pthread.h>   // for pthread_setaffinity_np
#include <sched.h>     // for cpu_set_t, CPU_SET, CPU_ZERO

using namespace std;

// ============================================================================
// Helper: pin current thread to a specific CPU core (Linux-only)
// ============================================================================
// Restricts the calling thread's CPU affinity to the single core `cpu_index`
// using pthread_setaffinity_np.
//
// Best effort only: the result code is discarded, so when the call fails the
// thread simply keeps the affinity it had before.
void pin_to_cpu(int cpu_index)
{
    cpu_set_t allowed;
    CPU_ZERO(&allowed);
    CPU_SET(cpu_index, &allowed);

    // The return value is deliberately dropped -- this is demo code.
    (void)pthread_setaffinity_np(pthread_self(), sizeof(allowed), &allowed);
}

long long measure_ping_pong(int iterations, int core = 0)
{
// Shared state between the two threads
struct Shared {
mutex m;
condition_variable cv;
bool turn = false; // false = A's turn, true = B's turn
bool start = false;
} shared;

thread thread1([&] {
pin_to_cpu(0);
unique_lock lk(shared.m);
shared.cv.wait(lk, [&] {
return shared.start == true;
});
lk.unlock();

for (int i = 0; i < iterations; i++) {
lk.lock();
shared.cv.wait(lk, [&] {
return shared.turn == false;
});
shared.turn ^= 1;
lk.unlock();
shared.cv.notify_one();
}
});

thread thread2([&] {
pin_to_cpu(core);
unique_lock lk(shared.m);
shared.cv.wait(lk, [&] {
return shared.start == true;
});
lk.unlock();

for (int i = 0; i < iterations; i++) {
lk.lock();
shared.cv.wait(lk, [&] {
return shared.turn == true;
});
shared.turn ^= 1;
lk.unlock();
shared.cv.notify_one();
}
});

using clock = std::chrono::steady_clock;
auto t0 = clock::now();

unique_lock lk(shared.m);
shared.start = true;
lk.unlock();
shared.cv.notify_all();
thread1.join();
thread2.join();

auto t1 = clock::now();
long long us = std::chrono::duration_cast(t1 - t0).count();
return us;
}

int main()
{
cout Args({50'000, 0})
//     ->Args({100'000, 0})
//     ->Args({10'000, 1})
//     ->Args({50'000, 1})
//     ->Args({100'000, 1});
 

Подробнее здесь: [url]https://stackoverflow.com/questions/79818776/c-unix-why-mutex-lock-using-2-threads-on-different-cores-is-5x-slower-than-whe[/url]