I'm benchmarking a few very simple CUDA SGEMM kernels on an NVIDIA Hopper GPU (H800, sm_90) and noticed something I don't fully understand.
I have two kernels that, as far as I can tell, should have essentially the same global memory access pattern, yet the "coalesced" version consistently and clearly outperforms the "second naive" version.
A reproducible example is below.
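First, to make "essentially the same access pattern" concrete: in both sgemm_naive2 and sgemm_global_mem_coalesce, consecutive lanes of a warp should read consecutive elements of B and broadcast a single element of A. The host-side sketch below is my own back-of-the-envelope check, not part of the benchmark; it hard-codes N = 1024 and BLOCKSIZE = 32 to match it, and prints the flat index into B that each lane of the first warp of block (0,0) reads at inner iteration i = 0:

#include <cstdio>

int main() {
  const int N = 1024, BLOCKSIZE = 32, i = 0;
  for (int lane = 0; lane < 32; ++lane) {
    // sgemm_naive2: x = blockIdx.x * blockDim.x + threadIdx.x == lane in
    // block (0,0), and the inner load is B[i * N + x].
    int b_naive2 = i * N + lane;
    // sgemm_global_mem_coalesce: cCol = blockIdx.y * BLOCKSIZE +
    // threadIdx.x % BLOCKSIZE == lane here, and the load is B[i * N + cCol].
    int b_coalesced = i * N + lane % BLOCKSIZE;
    printf("lane %2d: naive2 -> B[%4d], coalesced -> B[%4d]\n",
           lane, b_naive2, b_coalesced);
  }
  return 0;
}

Both kernels come out identical (B[0] through B[31] for the first warp), which is why I expected them to perform the same.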
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cmath>
#include <ctime>
#include <cuda_runtime.h>
#define CEIL_DIV(x, y) (((x) + (y) - 1) / (y))
// ==================== First naive version ====================
__global__ void sgemm_naive1(int M, int N, int K, float alpha,
const float *A, const float *B,
float beta, float *C) {
const uint x = blockIdx.x * blockDim.x + threadIdx.x;
const uint y = blockIdx.y * blockDim.y + threadIdx.y;
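  // Within a warp, threadIdx.x varies fastest, so consecutive lanes get
  // consecutive x values, i.e. consecutive rows of C.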
if (x < M && y < N) {
float tmp = 0.0f;
for (int i = 0; i < K; ++i) {
tmp += A[x * K + i] * B[i * N + y];
}
C[x * N + y] = alpha * tmp + beta * C[x * N + y];
}
}
void run_sgemm_naive1(int M, int N, int K, float alpha,
float *A, float *B, float beta, float *C) {
dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32));
dim3 blockDim(32, 32);
  sgemm_naive1<<<gridDim, blockDim>>>(M, N, K, alpha, A, B, beta, C);
}
// ==================== Second naive version (better performance) ====================
__global__ void sgemm_naive2(int M, int N, int K, float alpha,
const float *A, const float *B,
float beta, float *C) {
const uint x = blockIdx.x * blockDim.x + threadIdx.x;
const uint y = blockIdx.y * blockDim.y + threadIdx.y;
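  // Same launch shape as naive1, but x now indexes columns of C and y rows,
  // so consecutive lanes of a warp step along a row of C.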
if (x < M && y < N) {
float tmp = 0.0f;
for (int i = 0; i < K; ++i) {
tmp += A[y * K + i] * B[i * N + x];
}
C[y * N + x] = alpha * tmp + beta * C[y * N + x];
}
}
void run_sgemm_naive2(int M, int N, int K, float alpha,
float *A, float *B, float beta, float *C) {
dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32));
dim3 blockDim(32, 32);
  sgemm_naive2<<<gridDim, blockDim>>>(M, N, K, alpha, A, B, beta, C);
}
// ==================== Global memory coalesced version ====================
template <const int BLOCKSIZE>
__global__ void sgemm_global_mem_coalesce(int M, int N, int K, float alpha,
const float *A, const float *B,
float beta, float *C) {
const int cRow = blockIdx.x * BLOCKSIZE + (threadIdx.x / BLOCKSIZE);
const int cCol = blockIdx.y * BLOCKSIZE + (threadIdx.x % BLOCKSIZE);
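  // threadIdx.x % BLOCKSIZE varies fastest within a warp, so consecutive
  // lanes get consecutive cCol values, matching naive2's use of x.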
if (cRow < M && cCol < N) {
float tmp = 0.0f;
for (int i = 0; i < K; ++i) {
tmp += A[cRow * K + i] * B[i * N + cCol];
}
C[cRow * N + cCol] = alpha * tmp + beta * C[cRow * N + cCol];
}
}
void run_sgemm_coalesce(int M, int N, int K, float alpha,
float *A, float *B, float beta, float *C) {
dim3 gridDim(CEIL_DIV(M, 32), CEIL_DIV(N, 32));
dim3 blockDim(32 * 32);
  sgemm_global_mem_coalesce<32><<<gridDim, blockDim>>>(M, N, K, alpha, A, B, beta, C);
}
// ==================== Verification function ====================
void verify_results(float *C1, float *C2, int M, int N, const char *label) {
float max_diff = 0.0f;
int diff_count = 0;
float tolerance = 1e-3f;
for (int i = 0; i < M * N; i++) {
    float diff = fabs(C1[i] - C2[i]);
if (diff > max_diff) max_diff = diff;
if (diff > tolerance) diff_count++;
}
printf("%s verification: max difference = %e, mismatch count = %d (tolerance %e)\n",
label, max_diff, diff_count, tolerance);
}
// ==================== Performance benchmark ====================
void benchmark_kernel(void (*kernel_func)(int, int, int, float, float*, float*, float, float*),
int M, int N, int K,
float *d_A, float *d_B, float *d_C,
const char *kernel_name,
int warmup_runs = 10,
int test_runs = 100) {
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float alpha = 1.0f;
float beta = 0.0f;
// Warm-up
for (int i = 0; i < warmup_runs; i++) {
kernel_func(M, N, K, alpha, d_A, d_B, beta, d_C);
}
cudaDeviceSynchronize();
// Timed runs
float total_time = 0.0f;
for (int i = 0; i < test_runs; i++) {
cudaEventRecord(start);
kernel_func(M, N, K, alpha, d_A, d_B, beta, d_C);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
total_time += milliseconds;
}
float avg_time = total_time / test_runs;
float gflops = 2.0 * M * N * K / (avg_time * 1e6); // 2 FLOPs per MAC
printf("Kernel: %s\n", kernel_name);
printf(" Average time: %.3f ms\n", avg_time);
printf(" Performance: %.2f GFLOPS\n", gflops);
printf(" Total runs: %d\n\n", test_runs);
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
// ==================== Main ====================
int main() {
// Matrix dimensions
int M = 1024;
int N = 1024;
int K = 1024;
printf("Matrix multiplication performance test\n");
printf("M = %d, N = %d, K = %d\n", M, N, K);
printf("Total computation: %.2f GFLOPs\n\n", 2.0 * M * N * K / 1e9);
// Allocate host memory
size_t size_A = M * K * sizeof(float);
size_t size_B = K * N * sizeof(float);
size_t size_C = M * N * sizeof(float);
float *h_A = (float*)malloc(size_A);
float *h_B = (float*)malloc(size_B);
float *h_C1 = (float*)malloc(size_C);
float *h_C2 = (float*)malloc(size_C);
float *h_C3 = (float*)malloc(size_C);
// Initialize matrices
srand(time(NULL));
  for (int i = 0; i < M * K; i++) h_A[i] = (float)rand() / RAND_MAX;
  for (int i = 0; i < K * N; i++) h_B[i] = (float)rand() / RAND_MAX;
  for (int i = 0; i < M * N; i++) h_C1[i] = (float)rand() / RAND_MAX;
memcpy(h_C2, h_C1, size_C);
memcpy(h_C3, h_C1, size_C);
// Allocate device memory
float *d_A, *d_B, *d_C1, *d_C2, *d_C3;
cudaMalloc(&d_A, size_A);
cudaMalloc(&d_B, size_B);
cudaMalloc(&d_C1, size_C);
cudaMalloc(&d_C2, size_C);
cudaMalloc(&d_C3, size_C);
// Copy data to device
cudaMemcpy(d_A, h_A, size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size_B, cudaMemcpyHostToDevice);
// GPU info
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
printf("GPU: %s, Compute Capability: %d.%d\n\n",
prop.name, prop.major, prop.minor);
// Run benchmarks
printf("=== Performance Benchmark ===\n");
cudaMemcpy(d_C1, h_C1, size_C, cudaMemcpyHostToDevice);
benchmark_kernel(run_sgemm_naive1, M, N, K, d_A, d_B, d_C1, "Naive Version 1");
cudaMemcpy(d_C2, h_C2, size_C, cudaMemcpyHostToDevice);
benchmark_kernel(run_sgemm_naive2, M, N, K, d_A, d_B, d_C2, "Naive Version 2");
cudaMemcpy(d_C3, h_C3, size_C, cudaMemcpyHostToDevice);
benchmark_kernel(run_sgemm_coalesce, M, N, K, d_A, d_B, d_C3, "Coalesced Version");
// Verify results
printf("=== Result Verification ===\n");
cudaMemcpy(h_C1, d_C1, size_C, cudaMemcpyDeviceToHost);
cudaMemcpy(h_C2, d_C2, size_C, cudaMemcpyDeviceToHost);
cudaMemcpy(h_C3, d_C3, size_C, cudaMemcpyDeviceToHost);
verify_results(h_C1, h_C3, M, N, "Kernel1 vs Kernel3");
verify_results(h_C1, h_C2, M, N, "Kernel1 vs Kernel2");
// Cleanup
free(h_A);
free(h_B);
free(h_C1);
free(h_C2);
free(h_C3);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C1);
cudaFree(d_C2);
cudaFree(d_C3);
printf("\nTest completed!\n");
return 0;
}
Output on my machine:
=== Performance Benchmark ===
Kernel: Naive Version 1
Average time: 4.429 ms
Performance: 484.88 GFLOPS
Total runs: 100
Kernel: Naive Version 2
Average time: 0.461 ms
Performance: 4661.59 GFLOPS
Total runs: 100
Kernel: Coalesced Version
Average time: 0.344 ms
Performance: 6236.03 GFLOPS
Total runs: 100
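As a scale check: 2*M*N*K = 2*1024^3 ≈ 2.15e9 FLOP, and 2.15e9 FLOP / 0.344e-3 s ≈ 6.24e12 FLOP/s ≈ 6236 GFLOPS, so the GFLOPS figures are consistent with the measured times. The roughly 1.3x gap between Naive Version 2 and the Coalesced Version shows up on every run.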
More details here: https://stackoverflow.com/questions/79851249/why-is-my-second-naive-sgemm-kernel-faster-than-the-global-memory-coalesced