Ошибка CUDA 101 с позиционированием cudaMemPrefetchAsync на WSL2

Ошибка CUDA 101 с позиционированием cudaMemPrefetchAsync на WSL2 ⇐ C++

1 сообщение • Страница 1 из 1

Anonymous

Ошибка CUDA 101 с позиционированием cudaMemPrefetchAsync на WSL2

Цитата

Сообщение Anonymous » 29 май 2024, 05:45

Я запускаю эти фрагменты на WSL с использованием CUDA 12.1.
snip1 возвращает ошибку CUDA 101: неверный порядковый номер устройства, а snip2 работает без проблем.
Единственное различие между этими фрагментами — расположение вызовов cudaMemPrefetchAsync.
Действительно ли такое поведение вызвано только расположением вызовов cudaMemPrefetchAsync? Каков механизм этого поведения?
// snip1
#include

#include

// Fetches the CUDA runtime's last recorded error with cudaGetLastError()
// (which reads and clears it). On anything other than cudaSuccess, prints the
// numeric code and its description and terminates with EXIT_FAILURE.
// The __FILE__/__LINE__ in the message expand inside this helper, so they
// identify this function's own location rather than the caller's.
void checkError()
{
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        return;
    }

    std::printf("CUDA error %d:%s at %s:%d\n", status, cudaGetErrorString(status), __FILE__, __LINE__);
    exit(EXIT_FAILURE);
}

// Kernel: each thread of a 2-D launch initialises one curandState.
// The thread's global x/y coordinates are flattened as
//     x * (blockDim.y * gridDim.y) + y
// and the resulting index is used both as the element of `states` to
// initialise and as the second argument of curand_init (third argument 0,
// shared `seed`).
// There is no bounds guard: `states` must provide one element for every
// thread that is launched.
__global__ void init_curand(curandState *states, unsigned long long seed)
{
    const int gx = blockIdx.x * blockDim.x + threadIdx.x;
    const int gy = blockIdx.y * blockDim.y + threadIdx.y;
    // Kept unsigned so the flattening wraps exactly like the one-line form.
    const unsigned int ySpan = gridDim.y * blockDim.y;
    const int slot = gx * ySpan + gy;
    curand_init(seed, slot, 0, &states[slot]);
}

// Reproducer "snip1": the managed buffers are prefetched BEFORE the kernel
// launch, and checkError() runs once, after the launch.
//
// Steps: query the current device and its SM count, allocate three managed
// buffers of M * N bytes and one curandState per launched thread
// ((8 * numberOfSMs * 16)^2 elements), prefetch the managed buffers to the
// current device, launch init_curand, then call checkError().
//
// The return values of the runtime API calls are intentionally left
// unchecked so that the snippet keeps reproducing the reported behaviour.
int main()
{
    int deviceId;
    int numberOfSMs;

    cudaGetDevice(&deviceId);
    cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
    printf("Device ID: %d\tNumber of SMs: %d\n", deviceId, numberOfSMs);

    // 16 x 16 threads per block, (8 * SMs) x (8 * SMs) blocks.
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(8 * numberOfSMs, 8 * numberOfSMs);

    const int M = 10;
    const int N = 20;
    const int bytes = M * N * sizeof(int8_t);
    int8_t *noisy;
    int8_t *ising1;
    int8_t *ising2;
    cudaMallocManaged(&noisy, bytes);
    cudaMallocManaged(&ising1, bytes);
    cudaMallocManaged(&ising2, bytes);
    curandState *states;
    // One curandState for every thread of the launch below.
    cudaMalloc(&states, numBlocks.x * threadsPerBlock.x * numBlocks.y * threadsPerBlock.y *
                            sizeof(curandState));

    // Prefetch placed before the launch (the only difference from snip2).
    // NOTE(review): the post reports that this variant prints error 101
    // (invalid device ordinal). The checkError() call below reads
    // cudaGetLastError(), which is not tied to the kernel launch, so an error
    // recorded by one of these unchecked prefetch calls would be reported
    // there. Presumably the device does not report
    // cudaDevAttrConcurrentManagedAccess under WSL2 - TODO confirm by
    // inspecting the prefetch return values and that attribute.
    cudaMemPrefetchAsync(noisy, bytes, deviceId);
    cudaMemPrefetchAsync(ising1, bytes, deviceId);
    cudaMemPrefetchAsync(ising2, bytes, deviceId);

    // Execution configuration restored: a __global__ function cannot be
    // called without <<<grid, block>>>, and numBlocks/threadsPerBlock are
    // the only launch dimensions declared in this snippet.
    init_curand<<<numBlocks, threadsPerBlock>>>(states, time(NULL));
    checkError();

    return 0;
}

// snip2
#include

#include

// Fetches the CUDA runtime's last recorded error with cudaGetLastError()
// (which reads and clears it). On anything other than cudaSuccess, prints the
// numeric code and its description and terminates with EXIT_FAILURE.
// The __FILE__/__LINE__ in the message expand inside this helper, so they
// identify this function's own location rather than the caller's.
void checkError()
{
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        return;
    }

    std::printf("CUDA error %d:%s at %s:%d\n", status, cudaGetErrorString(status), __FILE__, __LINE__);
    exit(EXIT_FAILURE);
}

// Kernel: each thread of a 2-D launch initialises one curandState.
// The thread's global x/y coordinates are flattened as
//     x * (blockDim.y * gridDim.y) + y
// and the resulting index is used both as the element of `states` to
// initialise and as the second argument of curand_init (third argument 0,
// shared `seed`).
// There is no bounds guard: `states` must provide one element for every
// thread that is launched.
__global__ void init_curand(curandState *states, unsigned long long seed)
{
    const int gx = blockIdx.x * blockDim.x + threadIdx.x;
    const int gy = blockIdx.y * blockDim.y + threadIdx.y;
    // Kept unsigned so the flattening wraps exactly like the one-line form.
    const unsigned int ySpan = gridDim.y * blockDim.y;
    const int slot = gx * ySpan + gy;
    curand_init(seed, slot, 0, &states[slot]);
}

// Reproducer "snip2": the kernel is launched and checkError() is called
// first; the managed buffers are prefetched AFTERWARDS.
//
// Steps: query the current device and its SM count, allocate three managed
// buffers of M * N bytes and one curandState per launched thread
// ((8 * numberOfSMs * 16)^2 elements), launch init_curand, call
// checkError(), then prefetch the managed buffers to the current device.
//
// The return values of the runtime API calls are intentionally left
// unchecked so that the snippet keeps reproducing the reported behaviour.
int main()
{
    int deviceId;
    int numberOfSMs;

    cudaGetDevice(&deviceId);
    cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
    printf("Device ID: %d\tNumber of SMs: %d\n", deviceId, numberOfSMs);

    // 16 x 16 threads per block, (8 * SMs) x (8 * SMs) blocks.
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(8 * numberOfSMs, 8 * numberOfSMs);

    const int M = 10;
    const int N = 20;
    const int bytes = M * N * sizeof(int8_t);
    int8_t *noisy;
    int8_t *ising1;
    int8_t *ising2;
    cudaMallocManaged(&noisy, bytes);
    cudaMallocManaged(&ising1, bytes);
    cudaMallocManaged(&ising2, bytes);

    curandState *states;
    // One curandState for every thread of the launch below.
    cudaMalloc(&states, numBlocks.x * threadsPerBlock.x * numBlocks.y * threadsPerBlock.y *
                            sizeof(curandState));

    // Execution configuration restored: a __global__ function cannot be
    // called without <<<grid, block>>>, and numBlocks/threadsPerBlock are
    // the only launch dimensions declared in this snippet.
    init_curand<<<numBlocks, threadsPerBlock>>>(states, time(NULL));
    checkError();

    // Prefetch placed after the error check (the only difference from snip1).
    // The return values are not inspected and nothing calls
    // cudaGetLastError() after this point, so a failure of these calls is
    // never observed by the program.
    // NOTE(review): that is presumably why this variant appears to work
    // while snip1 reports error 101 - TODO confirm by checking the return
    // values here.
    cudaMemPrefetchAsync(noisy, bytes, deviceId);
    cudaMemPrefetchAsync(ising1, bytes, deviceId);
    cudaMemPrefetchAsync(ising2, bytes, deviceId);

    return 0;
}

Подробнее здесь: https://stackoverflow.com/questions/785 ... ng-on-wsl2

1716950717

Anonymous

Я запускаю эти фрагменты на WSL с использованием CUDA 12.1.
snip1 возвращает ошибку CUDA 101: неверный порядковый номер устройства, а snip2 работает без проблем.
Единственное различие между этими фрагментами — расположение вызовов cudaMemPrefetchAsync.
Действительно ли такое поведение вызвано только расположением вызовов cudaMemPrefetchAsync? Каков механизм этого поведения?
// snip1
#include 

#include 

// Fetches the CUDA runtime's last recorded error with cudaGetLastError()
// (which reads and clears it). On anything other than cudaSuccess, prints the
// numeric code and its description and terminates with EXIT_FAILURE.
// The __FILE__/__LINE__ in the message expand inside this helper, so they
// identify this function's own location rather than the caller's.
void checkError()
{
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        return;
    }

    std::printf("CUDA error %d:%s at %s:%d\n", status, cudaGetErrorString(status), __FILE__, __LINE__);
    exit(EXIT_FAILURE);
}

// Kernel: each thread of a 2-D launch initialises one curandState.
// The thread's global x/y coordinates are flattened as
//     x * (blockDim.y * gridDim.y) + y
// and the resulting index is used both as the element of `states` to
// initialise and as the second argument of curand_init (third argument 0,
// shared `seed`).
// There is no bounds guard: `states` must provide one element for every
// thread that is launched.
__global__ void init_curand(curandState *states, unsigned long long seed)
{
    const int gx = blockIdx.x * blockDim.x + threadIdx.x;
    const int gy = blockIdx.y * blockDim.y + threadIdx.y;
    // Kept unsigned so the flattening wraps exactly like the one-line form.
    const unsigned int ySpan = gridDim.y * blockDim.y;
    const int slot = gx * ySpan + gy;
    curand_init(seed, slot, 0, &states[slot]);
}

// Reproducer "snip1": the managed buffers are prefetched BEFORE the kernel
// launch, and checkError() runs once, after the launch.
//
// Steps: query the current device and its SM count, allocate three managed
// buffers of M * N bytes and one curandState per launched thread
// ((8 * numberOfSMs * 16)^2 elements), prefetch the managed buffers to the
// current device, launch init_curand, then call checkError().
//
// The return values of the runtime API calls are intentionally left
// unchecked so that the snippet keeps reproducing the reported behaviour.
int main()
{
    int deviceId;
    int numberOfSMs;

    cudaGetDevice(&deviceId);
    cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
    printf("Device ID: %d\tNumber of SMs: %d\n", deviceId, numberOfSMs);

    // 16 x 16 threads per block, (8 * SMs) x (8 * SMs) blocks.
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(8 * numberOfSMs, 8 * numberOfSMs);

    const int M = 10;
    const int N = 20;
    const int bytes = M * N * sizeof(int8_t);
    int8_t *noisy;
    int8_t *ising1;
    int8_t *ising2;
    cudaMallocManaged(&noisy, bytes);
    cudaMallocManaged(&ising1, bytes);
    cudaMallocManaged(&ising2, bytes);
    curandState *states;
    // One curandState for every thread of the launch below.
    cudaMalloc(&states, numBlocks.x * threadsPerBlock.x * numBlocks.y * threadsPerBlock.y *
                            sizeof(curandState));

    // Prefetch placed before the launch (the only difference from snip2).
    // NOTE(review): the post reports that this variant prints error 101
    // (invalid device ordinal). The checkError() call below reads
    // cudaGetLastError(), which is not tied to the kernel launch, so an error
    // recorded by one of these unchecked prefetch calls would be reported
    // there. Presumably the device does not report
    // cudaDevAttrConcurrentManagedAccess under WSL2 - TODO confirm by
    // inspecting the prefetch return values and that attribute.
    cudaMemPrefetchAsync(noisy, bytes, deviceId);
    cudaMemPrefetchAsync(ising1, bytes, deviceId);
    cudaMemPrefetchAsync(ising2, bytes, deviceId);

    // Execution configuration restored: a __global__ function cannot be
    // called without <<<grid, block>>>, and numBlocks/threadsPerBlock are
    // the only launch dimensions declared in this snippet.
    init_curand<<<numBlocks, threadsPerBlock>>>(states, time(NULL));
    checkError();

    return 0;
}

// snip2
#include 

#include 

// Fetches the CUDA runtime's last recorded error with cudaGetLastError()
// (which reads and clears it). On anything other than cudaSuccess, prints the
// numeric code and its description and terminates with EXIT_FAILURE.
// The __FILE__/__LINE__ in the message expand inside this helper, so they
// identify this function's own location rather than the caller's.
void checkError()
{
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        return;
    }

    std::printf("CUDA error %d:%s at %s:%d\n", status, cudaGetErrorString(status), __FILE__, __LINE__);
    exit(EXIT_FAILURE);
}

// Kernel: each thread of a 2-D launch initialises one curandState.
// The thread's global x/y coordinates are flattened as
//     x * (blockDim.y * gridDim.y) + y
// and the resulting index is used both as the element of `states` to
// initialise and as the second argument of curand_init (third argument 0,
// shared `seed`).
// There is no bounds guard: `states` must provide one element for every
// thread that is launched.
__global__ void init_curand(curandState *states, unsigned long long seed)
{
    const int gx = blockIdx.x * blockDim.x + threadIdx.x;
    const int gy = blockIdx.y * blockDim.y + threadIdx.y;
    // Kept unsigned so the flattening wraps exactly like the one-line form.
    const unsigned int ySpan = gridDim.y * blockDim.y;
    const int slot = gx * ySpan + gy;
    curand_init(seed, slot, 0, &states[slot]);
}

// Reproducer "snip2": the kernel is launched and checkError() is called
// first; the managed buffers are prefetched AFTERWARDS.
//
// Steps: query the current device and its SM count, allocate three managed
// buffers of M * N bytes and one curandState per launched thread
// ((8 * numberOfSMs * 16)^2 elements), launch init_curand, call
// checkError(), then prefetch the managed buffers to the current device.
//
// The return values of the runtime API calls are intentionally left
// unchecked so that the snippet keeps reproducing the reported behaviour.
int main()
{
    int deviceId;
    int numberOfSMs;

    cudaGetDevice(&deviceId);
    cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
    printf("Device ID: %d\tNumber of SMs: %d\n", deviceId, numberOfSMs);

    // 16 x 16 threads per block, (8 * SMs) x (8 * SMs) blocks.
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks(8 * numberOfSMs, 8 * numberOfSMs);

    const int M = 10;
    const int N = 20;
    const int bytes = M * N * sizeof(int8_t);
    int8_t *noisy;
    int8_t *ising1;
    int8_t *ising2;
    cudaMallocManaged(&noisy, bytes);
    cudaMallocManaged(&ising1, bytes);
    cudaMallocManaged(&ising2, bytes);

    curandState *states;
    // One curandState for every thread of the launch below.
    cudaMalloc(&states, numBlocks.x * threadsPerBlock.x * numBlocks.y * threadsPerBlock.y *
                            sizeof(curandState));

    // Execution configuration restored: a __global__ function cannot be
    // called without <<<grid, block>>>, and numBlocks/threadsPerBlock are
    // the only launch dimensions declared in this snippet.
    init_curand<<<numBlocks, threadsPerBlock>>>(states, time(NULL));
    checkError();

    // Prefetch placed after the error check (the only difference from snip1).
    // The return values are not inspected and nothing calls
    // cudaGetLastError() after this point, so a failure of these calls is
    // never observed by the program.
    // NOTE(review): that is presumably why this variant appears to work
    // while snip1 reports error 101 - TODO confirm by checking the return
    // values here.
    cudaMemPrefetchAsync(noisy, bytes, deviceId);
    cudaMemPrefetchAsync(ising1, bytes, deviceId);
    cudaMemPrefetchAsync(ising2, bytes, deviceId);

    return 0;
}
 

Подробнее здесь: [url]https://stackoverflow.com/questions/78542341/cuda-error-101-with-cudamemprefetchasync-positioning-on-wsl2[/url]