Я готовлю метод для обработки некоторых комбинаций для решателя Сокобана. Что касается обнаружения тупиков (deadlock discovery), я переношу его на ядра CUDA.
Почти всё уже работает хорошо, кроме исключения, которое я получаю при запуске на CudaAccelerator, предоставленном ILGPU, — оно исчезает, как только я запускаю тот же код на CPUAccelerator.
convertedLayout.View[r, c] = rValue; // 0)
{
newDiscoveries = true;
discoveredDeadlocks = newQueue;
totalMultiPassDeadlocks += newQueue.Count;
UpdateGpuOrderProgress(i, DiscoveryPhase.MultiPass, 0, newQueue.Count);
passNumber++;
_logger.Information("D-Deadlocks: Order {Order} pass {Pass} found {NewDeadlocks} additional deadlocks",
i, passNumber - 1, newQueue.Count);
}
else
{
newDiscoveries = false;
}
}
// Mark multi-pass as completed
UpdateGpuOrderProgress(i, DiscoveryPhase.MultiPass, true);
var allDeadlocksForOrder = new List();
// Collect all deadlocks from consolidatedDeadlocks queue
while (consolidatedDeadlocks.TryDequeue(out var deadlock))
{
allDeadlocksForOrder.Add(deadlock);
}
// Add any remaining deadlocks from discoveredDeadlocks
while (discoveredDeadlocks.TryDequeue(out var deadlock))
{
allDeadlocksForOrder.Add(deadlock);
}
// Store in permanent collection with proper ordering
_deadlocks[i] = allDeadlocksForOrder
.DistinctBy(ps => ps.ToString()) // Remove duplicates based on full string representation
.OrderBy(ps => ps.ToDiamondString()) // Order by diamond string representation
.ToArray();
orderStopwatch.Stop();
var deadlockCount = _deadlocks[i].Length;
_logger.Information("D-Deadlocks: Completed order {Order} in {ElapsedTime:0.000}s. Found {DeadlockCount:N0} deadlocks total (First Pass: {FirstPass}, Multi-pass: {MultiPass})",
i, orderStopwatch.Elapsed.TotalSeconds, deadlockCount,
Math.Max(0, deadlockCount - totalMultiPassDeadlocks), totalMultiPassDeadlocks);
}
await loggingTimer.DisposeAsync();
_basicAlgoCompleted = true;
IsCompleted = true;
IsGenerating = false;
fullRunChrono.Stop();
var totalDeadlocks = _deadlocks.Values.Sum(arr => arr.Length);
var elapsedTime = fullRunChrono.Elapsed.TotalSeconds;
_logger.Information("D-Deadlocks: General map discovery completed. Total deadlocks found: {Total:N0} in {ElapsedTime:0.000} seconds", totalDeadlocks, elapsedTime);
LogAllDeadlocks(_deadlocks);
return (finished: true, deadlocks: GetAllPotentialTiles(Deadlocks));
}
< /code>
Поскольку у меня два разных ядра, работающих последовательно одно за другим, я возвращаю буфер памяти, обработанный в первом ядре (генерация комбинаций), чтобы он был под рукой во втором (собственно обнаружение тупиков).
private Task ConversionWorkAsync(Accelerator accelerator, int[] convertedAdmissibleTiles, int[][] chunks, int conversionRatio, int j, TileContent[,] layout)
{
// EARLY EXIT: Prevent invalid kernel launch for empty chunk
if (chunks[j] == null || chunks[j].Length == 0)
{
_logger.Warning($"ConversionWorkAsync: Skipping empty chunk at index {j}.");
return Task.FromResult(accelerator.Allocate1D(0));
}
// ADDITIONAL GUARD: Check accelerator validity and group size
if (accelerator == null || accelerator.IsDisposed)
{
_logger.Error($"ConversionWorkAsync: Accelerator is null or disposed at chunk index {j}. Skipping kernel launch.");
return Task.FromResult(accelerator.Allocate1D(0));
}
if (accelerator.MaxNumThreadsPerGroup == 0)
{
_logger.Error($"ConversionWorkAsync: Accelerator reports MaxNumThreadsPerGroup == 0 at chunk index {j}. Skipping kernel launch.");
return Task.FromResult(accelerator.Allocate1D(0));
}
// Log buffer sizes
_logger.Debug($"ConversionWorkAsync: chunk[{j}].Length={chunks[j].Length}, conversionRatio={conversionRatio}, layout=({layout.GetLength(0)},{layout.GetLength(1)})");
MemoryBuffer1D chunkOnDevice = accelerator.Allocate1D(chunks[j]);
Index2D dims = new Index2D(layout.GetLength(0), layout.GetLength(1));
var requiredMemory = dims.Size * sizeof(ushort);
if (requiredMemory > accelerator.MemorySize / 10)
{
_logger.Warning("ConversionWork: Large memory allocation requested: {RequiredMB}MB", requiredMemory / (1024 * 1024));
}
MemoryBuffer2D convertedLayout;
try
{
convertedLayout = accelerator.Allocate2D(
dims,
extent => extent.Y,
(extent, leadingDimension) => Stride2D.DenseY.FromExtent(extent)
);
}
catch (Exception ex)
{
_logger.Error(ex, "Failed to allocate GPU memory for layout conversion. Dims: {Dims}", dims);
chunkOnDevice.Dispose();
return Task.FromResult(accelerator.Allocate1D(0));
}
int errorR = -1, errorC = -1;
try
{
for (int r = 0; r < dims.X; r++)
{
for (int c = 0; c < dims.Y; c++)
{
errorR = r;
errorC = c;
if (r >= 0 && r < layout.GetLength(0) && c >= 0 && c < layout.GetLength(1))
{
var rValue = (ushort)layout[r, c];
convertedLayout.View[r, c] = rValue; // = data.Length) break;
int content = data[i + shift];
if (combDataOffset + shift < combData.Length)
combData[combDataOffset + shift] = content;
int contentCol = content % 256;
int contentRow = (content - contentCol) / 256;
int up = ((contentRow - 1) * 256) + contentCol;
if (IsAdmissible(up, admissibleTiles) && localGenerated < multFactor)
{
if (manPositionsOffset + localGenerated < manPositions.Length)
manPositions[manPositionsOffset + localGenerated] = up;
localGenerated++;
}
int down = ((contentRow + 1) * 256) + contentCol;
if (IsAdmissible(down, admissibleTiles) && localGenerated < multFactor)
{
if (manPositionsOffset + localGenerated < manPositions.Length)
manPositions[manPositionsOffset + localGenerated] = down;
localGenerated++;
}
int right = content + 1;
if (IsAdmissible(right, admissibleTiles) && localGenerated < multFactor)
{
if (manPositionsOffset + localGenerated < manPositions.Length)
manPositions[manPositionsOffset + localGenerated] = right;
localGenerated++;
}
int left = content - 1;
if (IsAdmissible(left, admissibleTiles) && localGenerated < multFactor)
{
if (manPositionsOffset + localGenerated < manPositions.Length)
manPositions[manPositionsOffset + localGenerated] = left;
localGenerated++;
}
}
// Defensive: check subview lengths
if (manPositionsOffset + localGenerated > manPositions.Length) return;
if (combDataOffset + order > combData.Length) return;
if (cleanupOffset + localGenerated > cleanup.Length) return;
var threadManPositions = manPositions.SubView(manPositionsOffset, localGenerated);
var threadCombData = combData.SubView(combDataOffset, order);
var threadCleanup = cleanup.SubView(cleanupOffset, localGenerated);
CleanupEquivalentManPositionsGpu(layout, threadCombData, threadManPositions, threadCleanup, visited, queue);
int pStateLen = (1 + order);
int targetIndex = i * (multFactor * pStateLen);
for (int j = 0; j < localGenerated; j++)
{
if (targetIndex + (j * pStateLen) >= output.Length) break;
if (j >= threadCleanup.Length) break;
output[targetIndex + (j * pStateLen)] = threadCleanup[j];
for (int k = 1; k = output.Length) break;
if ((k - 1) >= threadCombData.Length) break;
output[targetIndex + (j * pStateLen) + k] = threadCombData[k - 1];
}
}
}
< /code>
Чтобы ничего не упустить, вот обёртка для ускорителей, предоставляемых ILGPU:
/// <summary>
/// Contains all information related to GPU usage: the ILGPU <see cref="Context"/>,
/// the preferred CUDA and OpenCL accelerators discovered at initialization time,
/// and a CPU accelerator used as a fallback when no GPU is available.
/// </summary>
public class CudaWrapper : IDisposable
{
    /// <summary>
    /// Gets or sets the preferred OpenCL accelerator, or null when none was selected.
    /// </summary>
    public CLAccelerator? ClAccelerator { get; set; }
    /// <summary>
    /// Gets or sets the memory size (in bytes) of the selected OpenCL device.
    /// </summary>
    public long ClMaxMemory { get; set; }
    /// <summary>
    /// Gets or sets the maximum thread count of the selected OpenCL device.
    /// </summary>
    public int ClMaxThreads { get; set; }
    /// <summary>
    /// Gets or sets the context of this instance.
    /// </summary>
    /// <value>
    /// The context.
    /// </value>
    public Context? Context { get; set; }
    /// <summary>
    /// A fallback device in case Cuda may fail.
    /// </summary>
    /// <value>
    /// The cpu accelerator.
    /// </value>
    public CPUAccelerator? CpuAccelerator { get; set; }
    /// <summary>
    /// The GPU.
    /// </summary>
    /// <value>
    /// The gpu accelerator.
    /// </value>
    public CudaAccelerator? GpuAccelerator { get; set; }
    /// <summary>
    /// Gets or sets the memory size (in bytes) of the selected CUDA device.
    /// </summary>
    public long GpuMaxMemory { get; set; }
    /// <summary>
    /// Gets or sets the maximum thread count of the selected CUDA device.
    /// </summary>
    public int GpuMaxThreads { get; set; }

    /// <summary>
    /// Initializes a new instance of the <see cref="CudaWrapper"/> class.
    /// </summary>
    /// <param name="logger">The logger used for device discovery diagnostics.</param>
    public CudaWrapper(ILogger logger)
    {
        _logger = logger.ForContext();
    }

    public void Dispose()
    {
        // Do not change this code. Put cleanup code in 'Dispose(bool disposing)' method
        Dispose(disposing: true);
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Enumerates all CUDA and OpenCL devices, keeps the most capable accelerator of
    /// each kind, and creates a CPU fallback accelerator. Any accelerator that is
    /// created but not selected is disposed immediately to avoid leaking native
    /// GPU contexts (the original code leaked every non-selected accelerator).
    /// </summary>
    /// <returns>A completed task; initialization is synchronous.</returns>
    public Task Initialize()
    {
        try
        {
            Context = Context.Create(builder => builder.AllAccelerators());
            var devices = Context.GetCudaDevices();
            if (devices.Count == 0)
            {
                _logger.Warning("No Cuda capable GPU found in the system. Failing over to CPU model accelerator");
            }
            GpuMaxThreads = 0;
            GpuMaxMemory = 0;
            if (devices.Count > 0)
            {
                // Log the header once, not once per device.
                _logger.Information("Listing GPUs found:");
            }
            for (var index = 0; index < devices.Count; index++)
            {
                var device = devices[index];
                device.PrintInformation(new TextWriterLogger(_logger));
                var gpuAccelerator = device.CreateCudaAccelerator(Context);
                var mem = gpuAccelerator.MemorySize;
                var th = gpuAccelerator.MaxNumThreads;
                if (mem > GpuMaxMemory || th > GpuMaxThreads)
                {
                    GpuMaxThreads = th;
                    GpuMaxMemory = mem;
                    // Dispose the previously preferred accelerator before replacing it.
                    var previous = GpuAccelerator;
                    GpuAccelerator = gpuAccelerator;
                    previous?.Dispose();
                    _logger.Information("Set {Device} as preferred Cuda device", GpuAccelerator.Name);
                }
                else
                {
                    // Not selected: release the native CUDA context right away.
                    gpuAccelerator.Dispose();
                }
            }
            // now set the intel graphic card
            var clDevices = Context.GetCLDevices();
            if (clDevices.Count > 0)
            {
                _logger.Information("Listing OpenCl GPUs found:");
            }
            for (var index = 0; index < clDevices.Count; index++)
            {
                var device = clDevices[index];
                device.PrintInformation(new TextWriterLogger(_logger));
                var clAccelerator = device.CreateCLAccelerator(Context);
                var mem = clAccelerator.MemorySize;
                var th = clAccelerator.MaxNumThreads;
                // BUG FIX: the original compared against GpuMaxMemory/GpuMaxThreads
                // (copy-paste from the CUDA loop), so no OpenCL device was ever
                // selected whenever a capable CUDA GPU existed. Compare against the
                // OpenCL baselines instead.
                if (mem > ClMaxMemory || th > ClMaxThreads)
                {
                    ClMaxThreads = th;
                    ClMaxMemory = mem;
                    var previous = ClAccelerator;
                    ClAccelerator = clAccelerator;
                    previous?.Dispose();
                    _logger.Information("Set {Device} as preferred OpenCL device", ClAccelerator.Name);
                }
                else
                {
                    clAccelerator.Dispose();
                }
            }
            //setting the CPU fallback
            CpuAccelerator = Context.GetCPUDevice(0).CreateCPUAccelerator(Context);
            _logger.Information("Set {Device} as preferred fallback device", CpuAccelerator.Name);
            CpuAccelerator.PrintInformation();
        }
        catch (Exception ex)
        {
            _logger.Warning(ex, "GPU: Failed to initialize GPU acceleration, trying CPU acceleration");
        }
        return Task.CompletedTask;
    }

    private void Dispose(bool disposing)
    {
        if (!_disposedValue)
        {
            if (disposing)
            {
                // dispose managed state (managed objects); accelerators must be
                // released before the Context that owns them.
                CpuAccelerator?.Dispose();
                GpuAccelerator?.Dispose();
                ClAccelerator?.Dispose();
                Context?.Dispose();
            }
            // free unmanaged resources (unmanaged objects) and override finalizer set large
            // fields to null
            _disposedValue = true;
        }
    }

    private readonly ILogger _logger;
    private bool _disposedValue;
}
< /code>
И, наконец, исключение, которое выбрасывается приведённым выше кодом:
Fatal error. System.AccessViolationException: Attempted to read or write protected memory. This is often an indication that other memory is corrupt.
< /code>
at sokolib.domain unkingledge.dynamicdeadlocks.conversionworkasync (ilgpu.runtime.ccelerator, int32 [], int32 [] [], int32, int32, sokolib.core.tilecontent [,]) Sokolib.domainknowledge.dynamicdeadlocks+ d__131+AsyncStateMachineBox
1 [[system.valuetuple`2 [[[System.boolean, System.private.corelib, версия = 9.0.0.0, культура = нейтральная, publickeytoken = 7cec85d7bea7798e], [System .__ Canon, System.private.corelib, версия = 9.0.0. PublickeyToken = 7cec85d7bea7798e]], system.private.corelib, версия = 9.0.0.0, культура = нейтральная, publickeytoken = 7cec85d7bea7798e], [Система .__ Канон, System.private.corelib, версия = 9.0.0, культура = нейтральная PublickeyToken = 7cec85d7bea7798e]]. ExecutionContextCallback (System.Object) < /p>
Я ввёл переменную rValue, чтобы разделить обращения к двум массивам и точно понять, какое из них вызывает проблему. Похоже, что она возникает именно в присваивании convertedLayout.View[r, c].
Есть идеи? — Алекс
Я готовлю метод для обработки некоторых комбинаций для решателя Сокобана. Что касается Discovery Discovery, я преобразую его для запуска на ядрах cuda. много уже кажется хорошим, за исключением исключения, которое я получаю при запуска на Cudaaccelerator, предоставленном Ilgpu, который исчезает, как только я тестирую код на Cpuaccelerator.[code]convertedLayout.View[r, c] = rValue; // 0) { newDiscoveries = true; discoveredDeadlocks = newQueue; totalMultiPassDeadlocks += newQueue.Count; UpdateGpuOrderProgress(i, DiscoveryPhase.MultiPass, 0, newQueue.Count); passNumber++;
_logger.Information("D-Deadlocks: Order {Order} pass {Pass} found {NewDeadlocks} additional deadlocks", i, passNumber - 1, newQueue.Count); } else { newDiscoveries = false; } }
// Mark multi-pass as completed UpdateGpuOrderProgress(i, DiscoveryPhase.MultiPass, true);
var allDeadlocksForOrder = new List();
// Collect all deadlocks from consolidatedDeadlocks queue while (consolidatedDeadlocks.TryDequeue(out var deadlock)) { allDeadlocksForOrder.Add(deadlock); }
// Add any remaining deadlocks from discoveredDeadlocks while (discoveredDeadlocks.TryDequeue(out var deadlock)) { allDeadlocksForOrder.Add(deadlock); }
// Store in permanent collection with proper ordering _deadlocks[i] = allDeadlocksForOrder .DistinctBy(ps => ps.ToString()) // Remove duplicates based on full string representation .OrderBy(ps => ps.ToDiamondString()) // Order by diamond string representation .ToArray();
orderStopwatch.Stop(); var deadlockCount = _deadlocks[i].Length; _logger.Information("D-Deadlocks: Completed order {Order} in {ElapsedTime:0.000}s. Found {DeadlockCount:N0} deadlocks total (First Pass: {FirstPass}, Multi-pass: {MultiPass})", i, orderStopwatch.Elapsed.TotalSeconds, deadlockCount, Math.Max(0, deadlockCount - totalMultiPassDeadlocks), totalMultiPassDeadlocks); }
fullRunChrono.Stop(); var totalDeadlocks = _deadlocks.Values.Sum(arr => arr.Length); var elapsedTime = fullRunChrono.Elapsed.TotalSeconds; _logger.Information("D-Deadlocks: General map discovery completed. Total deadlocks found: {Total:N0} in {ElapsedTime:0.000} seconds", totalDeadlocks, elapsedTime);
LogAllDeadlocks(_deadlocks);
return (finished: true, deadlocks: GetAllPotentialTiles(Deadlocks)); } < /code> Насколько у меня есть два разных ядра, работающих последовательно один за другим, я возвращаю буферу для памяти, обработанную в первом ядре (комбинированное генерация), чтобы быть готовым под рукой во втором (истинная часть обнаружения Decolck) < /p> private Task ConversionWorkAsync(Accelerator accelerator, int[] convertedAdmissibleTiles, int[][] chunks, int conversionRatio, int j, TileContent[,] layout) { // EARLY EXIT: Prevent invalid kernel launch for empty chunk if (chunks[j] == null || chunks[j].Length == 0) { _logger.Warning($"ConversionWorkAsync: Skipping empty chunk at index {j}."); return Task.FromResult(accelerator.Allocate1D(0)); }
// ADDITIONAL GUARD: Check accelerator validity and group size if (accelerator == null || accelerator.IsDisposed) { _logger.Error($"ConversionWorkAsync: Accelerator is null or disposed at chunk index {j}. Skipping kernel launch."); return Task.FromResult(accelerator.Allocate1D(0)); } if (accelerator.MaxNumThreadsPerGroup == 0) { _logger.Error($"ConversionWorkAsync: Accelerator reports MaxNumThreadsPerGroup == 0 at chunk index {j}. Skipping kernel launch."); return Task.FromResult(accelerator.Allocate1D(0)); }
MemoryBuffer1D chunkOnDevice = accelerator.Allocate1D(chunks[j]); Index2D dims = new Index2D(layout.GetLength(0), layout.GetLength(1));
var requiredMemory = dims.Size * sizeof(ushort); if (requiredMemory > accelerator.MemorySize / 10) { _logger.Warning("ConversionWork: Large memory allocation requested: {RequiredMB}MB", requiredMemory / (1024 * 1024)); } MemoryBuffer2D convertedLayout; try { convertedLayout = accelerator.Allocate2D( dims, extent => extent.Y, (extent, leadingDimension) => Stride2D.DenseY.FromExtent(extent) ); } catch (Exception ex) { _logger.Error(ex, "Failed to allocate GPU memory for layout conversion. Dims: {Dims}", dims); chunkOnDevice.Dispose(); return Task.FromResult(accelerator.Allocate1D(0)); } int errorR = -1, errorC = -1; try { for (int r = 0; r < dims.X; r++) { for (int c = 0; c < dims.Y; c++) { errorR = r; errorC = c; if (r >= 0 && r < layout.GetLength(0) && c >= 0 && c < layout.GetLength(1)) { var rValue = (ushort)layout[r, c]; convertedLayout.View[r, c] = rValue; // = data.Length) break; int content = data[i + shift]; if (combDataOffset + shift < combData.Length) combData[combDataOffset + shift] = content; int contentCol = content % 256; int contentRow = (content - contentCol) / 256;
int up = ((contentRow - 1) * 256) + contentCol; if (IsAdmissible(up, admissibleTiles) && localGenerated < multFactor) { if (manPositionsOffset + localGenerated < manPositions.Length) manPositions[manPositionsOffset + localGenerated] = up; localGenerated++; } int down = ((contentRow + 1) * 256) + contentCol; if (IsAdmissible(down, admissibleTiles) && localGenerated < multFactor) { if (manPositionsOffset + localGenerated < manPositions.Length) manPositions[manPositionsOffset + localGenerated] = down; localGenerated++; } int right = content + 1; if (IsAdmissible(right, admissibleTiles) && localGenerated < multFactor) { if (manPositionsOffset + localGenerated < manPositions.Length) manPositions[manPositionsOffset + localGenerated] = right; localGenerated++; } int left = content - 1; if (IsAdmissible(left, admissibleTiles) && localGenerated < multFactor) { if (manPositionsOffset + localGenerated < manPositions.Length) manPositions[manPositionsOffset + localGenerated] = left; localGenerated++; } }
// Defensive: check subview lengths if (manPositionsOffset + localGenerated > manPositions.Length) return; if (combDataOffset + order > combData.Length) return; if (cleanupOffset + localGenerated > cleanup.Length) return;
var threadManPositions = manPositions.SubView(manPositionsOffset, localGenerated); var threadCombData = combData.SubView(combDataOffset, order); var threadCleanup = cleanup.SubView(cleanupOffset, localGenerated);
int pStateLen = (1 + order); int targetIndex = i * (multFactor * pStateLen); for (int j = 0; j < localGenerated; j++) { if (targetIndex + (j * pStateLen) >= output.Length) break; if (j >= threadCleanup.Length) break; output[targetIndex + (j * pStateLen)] = threadCleanup[j]; for (int k = 1; k = output.Length) break; if ((k - 1) >= threadCombData.Length) break; output[targetIndex + (j * pStateLen) + k] = threadCombData[k - 1]; } } } < /code> Чтобы быть уверенным в том, чтобы все включить все, это обертка для ускорителей, предоставленных Ilgpu: < /p> /// /// Contains all information related to GPU usage /// public class CudaWrapper : IDisposable { public CLAccelerator? ClAccelerator { get; set; }
public long ClMaxMemory { get; set; }
public int ClMaxThreads { get; set; }
/// /// Gets or sets the context of this instance /// /// /// The context. /// public Context? Context { get; set; }
/// /// A fallback device in case Cuda may fail. /// /// /// The cpu accelerator. /// public CPUAccelerator? CpuAccelerator { get; set; }
/// /// The GPU /// /// /// The gpu accelerator. /// public CudaAccelerator? GpuAccelerator { get; set; }
public long GpuMaxMemory { get; set; }
public int GpuMaxThreads { get; set; }
public CudaWrapper(ILogger logger) { _logger = logger.ForContext(); }
public void Dispose() { // Do not change this code. Put cleanup code in 'Dispose(bool disposing)' method
public Task Initialize() { try { Context = Context.Create(builder => builder.AllAccelerators()); var devices = Context.GetCudaDevices(); if (devices.Count == 0) { _logger.Warning("No Cuda capable GPU found in the system. Failing over to CPU model accelerator"); } GpuMaxThreads = 0; GpuMaxMemory = 0; for (var index = 0; index < devices.Count; index++) { _logger.Information("Listing GPUs found:"); var device = devices[index]; device.PrintInformation(new TextWriterLogger(_logger)); var gpuAccelerator = device.CreateCudaAccelerator(Context); var mem = gpuAccelerator.MemorySize; var th = gpuAccelerator.MaxNumThreads; if (mem > GpuMaxMemory || th > GpuMaxThreads) { GpuMaxThreads = th; GpuMaxMemory = mem; GpuAccelerator = gpuAccelerator; _logger.Information("Set {Device} as preferred Cuda device", GpuAccelerator.Name); } }
// now set the intel graphic card var clDevices = Context.GetCLDevices(); for (var index = 0; index < clDevices.Count; index++) { _logger.Information("Listing OpenCl GPUs found:"); var device = clDevices[index]; device.PrintInformation(new TextWriterLogger(_logger)); var clAccelerator = device.CreateCLAccelerator(Context); var mem = clAccelerator.MemorySize; var th = clAccelerator.MaxNumThreads; if (mem > GpuMaxMemory || th > GpuMaxThreads) { ClMaxThreads = th; ClMaxMemory = mem; ClAccelerator = clAccelerator; _logger.Information("Set {Device} as preferred OpenCL device", ClAccelerator.Name); } }
//setting the CPU fallback CpuAccelerator = Context.GetCPUDevice(0).CreateCPUAccelerator(Context); _logger.Information("Set {Device} as preferred fallback device", CpuAccelerator.Name); CpuAccelerator.PrintInformation(); } catch (Exception ex) { _logger.Warning(ex, "GPU: Failed to initialize GPU acceleration, trying CPU acceleration"); }
return Task.CompletedTask; }
private void Dispose(bool disposing) { if (!_disposedValue) { if (disposing) { CpuAccelerator?.Dispose(); GpuAccelerator?.Dispose(); ClAccelerator?.Dispose(); // dispose managed state (managed objects) Context?.Dispose(); }
// free unmanaged resources (unmanaged objects) and override finalizer set large // fields to null _disposedValue = true; } }
private readonly ILogger _logger; private bool _disposedValue; } < /code> и, наконец, исключение, выброшенное выше: < /p> Fatal error. System.AccessViolationException: Attempted to read or write protected memory. This is often an indication that other memory is corrupt. < /code> at sokolib.domain unkingledge.dynamicdeadlocks.conversionworkasync (ilgpu.runtime.ccelerator, int32 [], int32 [] [], int32, int32, sokolib.core.tilecontent [,]) Sokolib.domainknowledge.dynamicdeadlocks+ d__131+AsyncStateMachineBox[/code] 1 [[system.valuetuple`2 [[[System.boolean, System.private.corelib, версия = 9.0.0.0, культура = нейтральная, publickeytoken = 7cec85d7bea7798e], [System .__ Canon, System.private.corelib, версия = 9.0.0. PublickeyToken = 7cec85d7bea7798e]], system.private.corelib, версия = 9.0.0.0, культура = нейтральная, publickeytoken = 7cec85d7bea7798e], [Система .__ Канон, System.private.corelib, версия = 9.0.0, культура = нейтральная PublickeyToken = 7cec85d7bea7798e]]. ExecutionContextCallback (System.Object) < /p> Я создал переменную rValue, чтобы отделить оценку двух массива, чтобы быть уверенным, какая из них создает проблему. Похоже, что он находится в назначении ConvertedLayout. < /P> Любые идеи? /> Алекс < /p>