Это правильная реализация полиморфизма с использованием CUDA

Это правильная реализация полиморфизма с использованием CUDA ⇐ C++

1 сообщение • Страница 1 из 1

Anonymous

Это правильная реализация полиморфизма с использованием CUDA

Цитата

Сообщение Anonymous » 20 фев 2025, 00:37

Мне нужно использовать полиморфные классы в моём коде CUDA. Я изучил несколько ответов на SO и Reddit, вёл бесконечные чаты с ChatGPT и Claude и, как мне кажется, пришёл к решению. Далее следует моя интерпретация ответа Роберта Кровеллы (Robert Crovella), из которой убрана библиотека Thrust. В данном контексте Thrust мне не нужен, и я стараюсь сохранить всё как можно более простым. Ключевая идея заключается в том, что я использую комбинацию cudaMallocManaged() и размещающего new (placement new) внутри ядра, чтобы правильно сконструировать объект на устройстве.
Насколько я могу судить, этот код работает так, как задумано, и утечек памяти нет. Я хотел бы, чтобы кто-нибудь более знающий, чем я, подтвердил, что мой подход разумен, и указал на возможные подводные камни, на которые я могу наткнуться.
#include
#include

namespace Shapes {

// Abstract base class for every shape.
// All members are __device__-only, so they can be invoked from device code only.
class Shape {
public:
    // Area of the concrete shape; each derived class supplies the formula.
    __device__ virtual double area() = 0;

    // Virtual so that a derived object can be destroyed through a Shape*.
    // The printf traces destruction order for debugging.
    __device__ virtual ~Shape() { printf("deleting Shape\n"); }
};

// A circle, described by its radius.
class Circle : public Shape {
public:
    __device__ explicit Circle(double radius) : radius(radius) {}

    // Logs before the base-class destructor runs.
    __device__ ~Circle() override { printf("deleting Circle\n"); }

    // pi * r * r, evaluated left to right with the truncated pi literal.
    __device__ double area() override {
        const double piTimesRadius = 3.1415926 * radius;
        return piTimesRadius * radius;
    }

private:
    double radius;
};

// A rectangle, described by its two side lengths.
class Rectangle : public Shape {
public:
    __device__ explicit Rectangle(double a, double b) : a(a), b(b) {}

    // Logs before the base-class destructor runs.
    __device__ ~Rectangle() override { printf("deleting Rectangle\n"); }

    // Product of the two sides.
    __device__ double area() override {
        const double product = a * b;
        return product;
    }

private:
    double a;
    double b;
};

// Kernel that uses placement new to construct a T at the address `shape`.
// Constructing inside a kernel means the constructor (and therefore the
// virtual-table setup) runs in device code.
//
// Template parameters:
//   T    - concrete type to construct; the storage behind `shape` must hold
//          at least sizeof(T) bytes.
//   Args - constructor argument types, passed by value to T's constructor.
//
// Only global thread 0 constructs the object, so it should be launched with
// <<<1, 1>>>; any extra threads do nothing. A null `shape` is ignored.
template <typename T, typename... Args>
__global__ void initialize(Shape *shape, Args... args) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx == 0 && shape != nullptr) {
        new (shape) T(args...);
    }
}

// Kernel that invokes the virtual destructor of *shape from device code.
// Only global thread 0 acts and a null pointer is ignored, so one thread is
// enough. The storage itself is NOT released here.
__global__ void deleteShape(Shape *shape) {
    const int globalThread = blockIdx.x * blockDim.x + threadIdx.x;
    if (globalThread != 0 || shape == nullptr) {
        return;
    }
    shape->~Shape();
}

// Allocates managed memory for a T and launches the `initialize` kernel to
// construct it there.
//
// Template parameters:
//   T    - concrete shape type to build (must be given explicitly).
//   Args - constructor argument types, deduced from `args`.
// Parameters:
//   shape - receives the new object's address, or nullptr if the allocation
//           or the kernel launch failed.
//   args  - forwarded by value to T's constructor on the device.
//
// Note: there is deliberately no cudaDeviceSynchronize() here, so several
// initializations can be queued and awaited together. The caller must
// synchronize before using `shape`.
template <typename T, typename... Args>
void create(Shape * &shape, Args... args) {
    shape = nullptr;
    if (cudaMallocManaged(&shape, sizeof(T)) != cudaSuccess) {
        shape = nullptr;
        return;
    }

    // A single thread is enough: only global thread 0 constructs the object.
    initialize<T><<<1, 1>>>(shape, args...);

    // A rejected launch means no object was constructed, so release the
    // storage rather than hand back a pointer to raw memory.
    if (cudaGetLastError() != cudaSuccess) {
        cudaFree(shape);
        shape = nullptr;
    }
}

// The reverse of create(): runs the destructor kernel on the device, waits for
// it to finish, frees the memory and resets the pointer to nullptr.
// Calling it with a null pointer is a no-op.
void destroy(Shape* &shape) {
    if (shape == nullptr) {
        return;
    }
    // Only global thread 0 of deleteShape acts, so one thread is enough.
    deleteShape<<<1, 1>>>(shape);
    // The destructor must have completed before the storage is released.
    cudaDeviceSynchronize();
    cudaFree(shape);
    shape = nullptr;
}

// Writes the area of shape[i] into areas[i] for every i in [0, count).
// One thread handles one element; threads whose index is >= count do nothing.
__global__ void areaKernel(Shape **shape, int count, double *areas) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= count) {
        return;
    }
    Shape *current = shape[i];
    areas[i] = current->area();  // virtual dispatch in device code
}

// Helper: number of blocks of `threadsPerBlock` threads needed to cover
// `numItems` items (integer ceiling division).
int calculateBlocksPerGrid(int numItems, int threadsPerBlock) {
    const int roundedUp = numItems + threadsPerBlock - 1;
    return roundedUp / threadsPerBlock;
}
}

TEST(CudaAnalyticsExamples, placementNewInit) {

using namespace Shapes;
Shape **shapes;
int count = 3;
cudaMallocManaged(&shapes, count * sizeof(Shape *));

for(int i = 0; i < count; i++) {
if(i % 2 == 0) {
// even indices get circles, odd indices get rectangles
create(shapes, 1.0);
} else {
create(shapes, 1.0, 2.0);
}
}
// wait for all initialization to complete
cudaDeviceSynchronize();

// allocate the area array
double *area;
cudaMallocManaged(&area, count*sizeof(double));

// launch the area kernel
int threadsPerBlock = 32;
int blocks = calculateBlocksPerGrid(count, threadsPerBlock);
areaKernel(shapes, count, area);
cudaDeviceSynchronize();

// print out areas
for(int i = 0; i < count; i++) {
std::cout
Меня беспокоит только то, что приходится вызывать cudaDeviceSynchronize() внутри функции destroy(), чтобы очистка объекта на стороне устройства гарантированно завершилась до того, как я освобожу указатель. Если это станет проблемой производительности, я смогу переработать способ удаления/очистки объектов. Что думаете?

Подробнее здесь: https://stackoverflow.com/questions/794 ... using-cuda

1740001044

Anonymous

 Мне нужно использовать полиморфные классы в моём коде CUDA. Я изучил несколько ответов на SO и Reddit, вёл бесконечные чаты с ChatGPT и Claude и, [b]как мне кажется[/b], пришёл к решению. Далее следует моя интерпретация ответа Роберта Кровеллы (Robert Crovella), из которой убрана библиотека Thrust. В данном контексте Thrust мне не нужен, и я стараюсь сохранить всё как можно более простым. Ключевая идея заключается в том, что я использую комбинацию cudaMallocManaged() и размещающего new (placement new) внутри ядра, чтобы правильно сконструировать объект на устройстве.
 Насколько я могу судить, этот код работает так, как задумано, и утечек памяти нет. Я хотел бы, чтобы кто-нибудь более знающий, чем я, подтвердил, что мой подход разумен, и указал на возможные подводные камни, на которые я могу наткнуться.
#include 
#include 

namespace Shapes {

// Abstract base class for every shape.
// All members are __device__-only, so they can be invoked from device code only.
class Shape {
public:
    // Area of the concrete shape; each derived class supplies the formula.
    __device__ virtual double area() = 0;

    // Virtual so that a derived object can be destroyed through a Shape*.
    // The printf traces destruction order for debugging.
    __device__ virtual ~Shape() { printf("deleting Shape\n"); }
};

// A circle, described by its radius.
class Circle : public Shape {
public:
    __device__ explicit Circle(double radius) : radius(radius) {}

    // Logs before the base-class destructor runs.
    __device__ ~Circle() override { printf("deleting Circle\n"); }

    // pi * r * r, evaluated left to right with the truncated pi literal.
    __device__ double area() override {
        const double piTimesRadius = 3.1415926 * radius;
        return piTimesRadius * radius;
    }

private:
    double radius;
};

// A rectangle, described by its two side lengths.
class Rectangle : public Shape {
public:
    __device__ explicit Rectangle(double a, double b) : a(a), b(b) {}

    // Logs before the base-class destructor runs.
    __device__ ~Rectangle() override { printf("deleting Rectangle\n"); }

    // Product of the two sides.
    __device__ double area() override {
        const double product = a * b;
        return product;
    }

private:
    double a;
    double b;
};

// Kernel that uses placement new to construct a T at the address `shape`.
// Constructing inside a kernel means the constructor (and therefore the
// virtual-table setup) runs in device code.
//
// Template parameters:
//   T    - concrete type to construct; the storage behind `shape` must hold
//          at least sizeof(T) bytes.
//   Args - constructor argument types, passed by value to T's constructor.
//
// Only global thread 0 constructs the object, so it should be launched with
// <<<1, 1>>>; any extra threads do nothing. A null `shape` is ignored.
template <typename T, typename... Args>
__global__ void initialize(Shape *shape, Args... args) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx == 0 && shape != nullptr) {
        new (shape) T(args...);
    }
}

// Kernel that invokes the virtual destructor of *shape from device code.
// Only global thread 0 acts and a null pointer is ignored, so one thread is
// enough. The storage itself is NOT released here.
__global__ void deleteShape(Shape *shape) {
    const int globalThread = blockIdx.x * blockDim.x + threadIdx.x;
    if (globalThread != 0 || shape == nullptr) {
        return;
    }
    shape->~Shape();
}

// Allocates managed memory for a T and launches the `initialize` kernel to
// construct it there.
//
// Template parameters:
//   T    - concrete shape type to build (must be given explicitly).
//   Args - constructor argument types, deduced from `args`.
// Parameters:
//   shape - receives the new object's address, or nullptr if the allocation
//           or the kernel launch failed.
//   args  - forwarded by value to T's constructor on the device.
//
// Note: there is deliberately no cudaDeviceSynchronize() here, so several
// initializations can be queued and awaited together. The caller must
// synchronize before using `shape`.
template <typename T, typename... Args>
void create(Shape * &shape, Args... args) {
    shape = nullptr;
    if (cudaMallocManaged(&shape, sizeof(T)) != cudaSuccess) {
        shape = nullptr;
        return;
    }

    // A single thread is enough: only global thread 0 constructs the object.
    initialize<T><<<1, 1>>>(shape, args...);

    // A rejected launch means no object was constructed, so release the
    // storage rather than hand back a pointer to raw memory.
    if (cudaGetLastError() != cudaSuccess) {
        cudaFree(shape);
        shape = nullptr;
    }
}

// The reverse of create(): runs the destructor kernel on the device, waits for
// it to finish, frees the memory and resets the pointer to nullptr.
// Calling it with a null pointer is a no-op.
void destroy(Shape* &shape) {
    if (shape == nullptr) {
        return;
    }
    // Only global thread 0 of deleteShape acts, so one thread is enough.
    deleteShape<<<1, 1>>>(shape);
    // The destructor must have completed before the storage is released.
    cudaDeviceSynchronize();
    cudaFree(shape);
    shape = nullptr;
}

// Writes the area of shape[i] into areas[i] for every i in [0, count).
// One thread handles one element; threads whose index is >= count do nothing.
__global__ void areaKernel(Shape **shape, int count, double *areas) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= count) {
        return;
    }
    Shape *current = shape[i];
    areas[i] = current->area();  // virtual dispatch in device code
}

// Helper: number of blocks of `threadsPerBlock` threads needed to cover
// `numItems` items (integer ceiling division).
int calculateBlocksPerGrid(int numItems, int threadsPerBlock) {
    const int roundedUp = numItems + threadsPerBlock - 1;
    return roundedUp / threadsPerBlock;
}
}

TEST(CudaAnalyticsExamples, placementNewInit) {

using namespace Shapes;
Shape **shapes;
int count = 3;
cudaMallocManaged(&shapes, count * sizeof(Shape *));

for(int i = 0; i < count; i++) {
if(i % 2 == 0) {
// even indices get circles, odd indices get rectangles
create(shapes[i], 1.0);
} else {
create(shapes[i], 1.0, 2.0);
}
}
// wait for all initialization to complete
cudaDeviceSynchronize();

// allocate the area array
double *area;
cudaMallocManaged(&area, count*sizeof(double));

// launch the area kernel
int threadsPerBlock = 32;
int blocks = calculateBlocksPerGrid(count, threadsPerBlock);
areaKernel(shapes, count, area);
cudaDeviceSynchronize();

// print out areas
for(int i = 0; i < count; i++) {
std::cout 
Меня беспокоит только то, что приходится вызывать cudaDeviceSynchronize() внутри функции destroy(), чтобы очистка объекта на стороне устройства гарантированно завершилась до того, как я освобожу указатель. Если это станет проблемой производительности, я смогу переработать способ удаления/очистки объектов. Что думаете?

Подробнее здесь: [url]https://stackoverflow.com/questions/79452841/is-this-a-correct-implementation-of-polymorphism-using-cuda[/url]