Ядра Ilgpu дает неправильный выход

Ядра Ilgpu дает неправильный выход ⇐ C#

1 сообщение • Страница 1 из 1

Anonymous

Цитата

Сообщение Anonymous » 31 янв 2025, 09:51

Я взял код ядра из образцов Ilgpu для умножения двух матриц в плиткой форме и написал программу для умножения следующих матриц:
a = |1 2 3 4|
|5 6 7 8|

| 9 10|
b = |11 12|
|13 14|
|15 16|

< /code>
Согласно Wolframalpha < /p>
{{1, 2, 3, 4}, {5, 6, 7, 8}} . {{9, 10}, {11, 12}, {13, 14}, {15, 16}}
< /code>
дает следующий вывод: < /p>
130 140
322 348
< /code>
Однако мой исходный код дает следующий вывод: < /p>
Result of matrix multiplication:
31 34
111 122

Где лежат моя ошибка, учитывая, что я использую точно одинаковое ядро и плитка? < /p>
using ILGPU;
using ILGPU.Runtime;
using System;

public static class Program
{
// Define tile size for matrix multiplication (2x2 tiles for simplicity)
// This determines how many elements will be loaded into shared memory at a time.
const int TILE_SIZE = 2;

// Kernel function to perform tiled matrix multiplication on the GPU
static void MatrixMultiplyTiledKernel(
ArrayView2D aView, // Input matrix A in GPU memory
ArrayView2D bView, // Input matrix B in GPU memory
ArrayView2D cView) // Output matrix C in GPU memory
{
// Get the global thread index in the 2D grid
Index2D global = Grid.GlobalIndex.XY;

// Get the thread indices within the thread group (local indices)
int x = Group.IdxX; // Local row index within the tile
int y = Group.IdxY; // Local column index within the tile

// Allocate shared memory for storing tiles of matrix A and matrix B
var aTile = SharedMemory.Allocate2D(
new Index2D(TILE_SIZE, TILE_SIZE),
new Stride2D.DenseX(TILE_SIZE)
);
var bTile = SharedMemory.Allocate2D(
new Index2D(TILE_SIZE, TILE_SIZE),
new Stride2D.DenseX(TILE_SIZE)
);

// Initialize the accumulation variable for the result of C[global.X, global.Y]
var sum = 0;

// Loop over the tiles of A and B matrices
// The loop increments by TILE_SIZE to process one tile at a time
for (var i = 0; i < aView.IntExtent.X; i += TILE_SIZE)
{
// Load the corresponding tile of A into shared memory
if (global.X < aView.IntExtent.X && y + i < aView.IntExtent.Y)
aTile[x, y] = aView[global.X, y + i];
else
aTile[x, y] = 0; // Pad with zeros if out of bounds

// Load the corresponding tile of B into shared memory
if (x + i < bView.IntExtent.X && global.Y < bView.IntExtent.Y)
bTile[x, y] = bView[x + i, global.Y];
else
bTile[x, y] = 0; // Pad with zeros if out of bounds

// Ensure all threads in the thread group have finished loading tiles
Group.Barrier();

// Perform computation on the tiles
for (var k = 0; k < TILE_SIZE; k++)
{
// Multiply elements of A and B tiles and accumulate the result
sum += aTile[new Index2D(x, k)] * bTile[new Index2D(k, y)];
}

// Synchronize threads before loading the next tile
Group.Barrier();
}

// Write the computed result to the output matrix C
if (global.X < cView.IntExtent.X && global.Y < cView.IntExtent.Y)
{
cView[global] = sum; // Store the result in the appropriate position
}
}

static void Main()
{
// Create an ILGPU context (manages devices and resources)
using (var context = Context.CreateDefault())
{
// Select the preferred accelerator (GPU or CPU)
using (var accelerator = context.GetPreferredDevice(preferCPU: true).CreateAccelerator(context))
{
try
{
// Initialize sample input matrices (4x4)
int[,] a = {
{ 1, 2, 3, 4},
{ 5, 6, 7, 8}
};
int[,] b = {
{ 9, 10},
{ 11, 12},
{ 13, 14},
{ 15, 16}
};

int m = a.GetLength(0);
int ka = a.GetLength(1);
int kb = b.GetLength(0);
int n = b.GetLength(1);

int[,] hostResult = new int[m, n];

// Define the group size (number of threads per tile) and number of groups
Index2D groupSize = new Index2D(TILE_SIZE, TILE_SIZE); // Threads per group (block)
Index2D numGroups = new Index2D((m + TILE_SIZE - 1) / TILE_SIZE, (n + TILE_SIZE - 1) / TILE_SIZE); // Total number of thread groups

// Allocate device memory for input matrices A, B, and output matrix C
MemoryBuffer2D aBuffer = accelerator.Allocate2DDenseX(new Index2D(m, ka));
MemoryBuffer2D bBuffer = accelerator.Allocate2DDenseX(new Index2D(kb, n));
MemoryBuffer2D cBuffer = accelerator.Allocate2DDenseX(new Index2D(m, n));

try
{
// Copy input matrices from host (CPU) to device (GPU) memory
aBuffer.CopyFromCPU(a);
bBuffer.CopyFromCPU(b);

// Load and precompile the kernel function
var loadedKernel = accelerator.LoadStreamKernel<
ArrayView2D,
ArrayView2D,
ArrayView2D>(MatrixMultiplyTiledKernel);

// Launch the kernel function on the GPU
// Specify the number of thread groups and threads per group
loadedKernel((numGroups, groupSize), aBuffer, bBuffer, cBuffer);

// Wait for the GPU to complete execution
accelerator.Synchronize();

// Retrieve the result matrix from GPU memory back to host memory
cBuffer.CopyToCPU(hostResult);

// Print the result matrix
Console.WriteLine("Result of matrix multiplication:");
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
Console.Write($"{hostResult[i, j],4} ");
}
Console.WriteLine();
}
}
finally
{
// Dispose of GPU resources to free up memory
aBuffer.Dispose();
bBuffer.Dispose();
cBuffer.Dispose();
}
}
finally
{
// Dispose of the accelerator and context
accelerator.Dispose();
}
}
}

// Wait for user input before closing the console
Console.ReadLine();
}
}

Подробнее здесь: https://stackoverflow.com/questions/794 ... ect-output

1738306304

Anonymous

 Я взял код ядра из образцов Ilgpu для умножения двух матриц в  [b] плиткой форме [/b]  и написал программу для умножения следующих матриц: 
 a = |1  2  3  4|
|5  6  7  8|

| 9  10|
b = |11  12|
|13  14|
|15  16|

< /code>
Согласно Wolframalpha < /p>
{{1, 2, 3, 4}, {5, 6, 7, 8}} .  {{9, 10}, {11, 12}, {13, 14}, {15, 16}}
< /code>
дает следующий вывод: < /p>
    130   140
322   348
< /code>
Однако мой исходный код дает следующий вывод: < /p>
Result of matrix multiplication:
31   34
111  122

Где лежат моя ошибка, учитывая, что я использую точно одинаковое ядро и плитка? < /p>
using ILGPU;
using ILGPU.Runtime;
using System;

public static class Program
{
// Define tile size for matrix multiplication (2x2 tiles for simplicity)
// This determines how many elements will be loaded into shared memory at a time.
const int TILE_SIZE = 2;

// Kernel function to perform tiled matrix multiplication on the GPU
static void MatrixMultiplyTiledKernel(
ArrayView2D aView, // Input matrix A in GPU memory
ArrayView2D bView, // Input matrix B in GPU memory
ArrayView2D cView) // Output matrix C in GPU memory
{
// Get the global thread index in the 2D grid
Index2D global = Grid.GlobalIndex.XY;

// Get the thread indices within the thread group (local indices)
int x = Group.IdxX; // Local row index within the tile
int y = Group.IdxY; // Local column index within the tile

// Allocate shared memory for storing tiles of matrix A and matrix B
var aTile = SharedMemory.Allocate2D(
new Index2D(TILE_SIZE, TILE_SIZE),
new Stride2D.DenseX(TILE_SIZE)
);
var bTile = SharedMemory.Allocate2D(
new Index2D(TILE_SIZE, TILE_SIZE),
new Stride2D.DenseX(TILE_SIZE)
);

// Initialize the accumulation variable for the result of C[global.X, global.Y]
var sum = 0;

// Loop over the tiles of A and B matrices
// The loop increments by TILE_SIZE to process one tile at a time
for (var i = 0; i < aView.IntExtent.X; i += TILE_SIZE)
{
// Load the corresponding tile of A into shared memory
if (global.X < aView.IntExtent.X && y + i < aView.IntExtent.Y)
aTile[x, y] = aView[global.X, y + i];
else
aTile[x, y] = 0; // Pad with zeros if out of bounds

// Load the corresponding tile of B into shared memory
if (x + i < bView.IntExtent.X && global.Y < bView.IntExtent.Y)
bTile[x, y] = bView[x + i, global.Y];
else
bTile[x, y] = 0; // Pad with zeros if out of bounds

// Ensure all threads in the thread group have finished loading tiles
Group.Barrier();

// Perform computation on the tiles
for (var k = 0; k < TILE_SIZE; k++)
{
// Multiply elements of A and B tiles and accumulate the result
sum += aTile[new Index2D(x, k)] * bTile[new Index2D(k, y)];
}

// Synchronize threads before loading the next tile
Group.Barrier();
}

// Write the computed result to the output matrix C
if (global.X < cView.IntExtent.X && global.Y < cView.IntExtent.Y)
{
cView[global] = sum;  // Store the result in the appropriate position
}
}

static void Main()
{
// Create an ILGPU context (manages devices and resources)
using (var context = Context.CreateDefault())
{
// Select the preferred accelerator (GPU or CPU)
using (var accelerator = context.GetPreferredDevice(preferCPU: true).CreateAccelerator(context))
{
try
{
// Initialize sample input matrices (4x4)
int[,] a = {
{ 1, 2, 3, 4},
{ 5, 6, 7, 8}
};
int[,] b = {
{ 9, 10},
{ 11, 12},
{ 13, 14},
{ 15, 16}
};

int m = a.GetLength(0);
int ka = a.GetLength(1);
int kb = b.GetLength(0);
int n = b.GetLength(1);

int[,] hostResult = new int[m, n];

// Define the group size (number of threads per tile) and number of groups
Index2D groupSize = new Index2D(TILE_SIZE, TILE_SIZE); // Threads per group (block)
Index2D numGroups = new Index2D((m + TILE_SIZE - 1) / TILE_SIZE, (n + TILE_SIZE - 1) / TILE_SIZE); // Total number of thread groups

// Allocate device memory for input matrices A, B, and output matrix C
MemoryBuffer2D aBuffer = accelerator.Allocate2DDenseX(new Index2D(m, ka));
MemoryBuffer2D bBuffer = accelerator.Allocate2DDenseX(new Index2D(kb, n));
MemoryBuffer2D cBuffer = accelerator.Allocate2DDenseX(new Index2D(m, n));

try
{
// Copy input matrices from host (CPU) to device (GPU) memory
aBuffer.CopyFromCPU(a);
bBuffer.CopyFromCPU(b);

// Load and precompile the kernel function
var loadedKernel = accelerator.LoadStreamKernel<
ArrayView2D,
ArrayView2D,
ArrayView2D>(MatrixMultiplyTiledKernel);

// Launch the kernel function on the GPU
// Specify the number of thread groups and threads per group
loadedKernel((numGroups, groupSize), aBuffer, bBuffer, cBuffer);

// Wait for the GPU to complete execution
accelerator.Synchronize();

// Retrieve the result matrix from GPU memory back to host memory
cBuffer.CopyToCPU(hostResult);

// Print the result matrix
Console.WriteLine("Result of matrix multiplication:");
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
Console.Write($"{hostResult[i, j],4} ");
}
Console.WriteLine();
}
}
finally
{
// Dispose of GPU resources to free up memory
aBuffer.Dispose();
bBuffer.Dispose();
cBuffer.Dispose();
}
}
finally
{
// Dispose of the accelerator and context
accelerator.Dispose();
}
}
}

// Wait for user input before closing the console
Console.ReadLine();
}
}

 

Подробнее здесь: [url]https://stackoverflow.com/questions/79401972/ilgpu-kernel-giving-incorrect-output[/url]