Example scenario: I have the following page:
[*]Line 1
[*]Line 2
[*]Table ( with just header x and y )
[*]Line 3
[*]Line 4
В выводе я получаю следующее:
[*]Содержимое строки 1
[*]Содержимое строки 2
[*]Содержимое заголовка таблицы (x и y) (в виде текста — его быть не должно)
[*]Содержимое строки таблицы (x и y) (в виде текста — его быть не должно)
[*]Таблица (в виде таблицы — она и должна выводиться, с тегами, которые я добавил в коде)
[*]Строка 4
[*]Строка 5
Вот код:
Код: Выделить всё
/// <summary>
/// POST endpoint ("read-pdf-iron") that OCRs the PDF at <c>request.PdfFilePath</c> and returns its
/// content as one string in which, per page, text blocks and detected tables are written out in turn.
/// </summary>
/// <param name="request">Request body; only <c>PdfFilePath</c> is read here.</param>
/// <returns>
/// A <c>StandardResponse</c>: <c>Status = false</c> with a message when the path is blank or an
/// exception is caught; otherwise <c>Status = true</c> with the assembled text in <c>Data</c>.
/// </returns>
// NOTE(review): the declared return type is the non-generic Task although every path returns a
// StandardResponse; the generic argument looks like it was lost when the snippet was extracted — confirm.
[HttpPost("read-pdf-iron")]
public async Task ReadPdf([FromBody] PdfFilePathRequest request)
{
try
{
// Reject a null, empty or whitespace-only path up front.
if (string.IsNullOrWhiteSpace(request.PdfFilePath))
{
return new StandardResponse
{
Status = false,
Message = "PDF file path cannot be empty."
};
}
// Initialize IronTesseract with more precise configuration
var ocrTesseract = new IronTesseract();
// Configure OCR with more granular settings
ocrTesseract.Configuration.ReadDataTables = true;
ocrTesseract.Configuration.WhiteListCharacters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 .,()-:;'\"";
// Create OCR input
using var ocrInput = new IronOcr.OcrInput();
// Load PDF from file path
ocrInput.LoadPdf(request.PdfFilePath);
// Perform OCR
// This result supplies the page count and the per-page plain text used below.
var ocrResult = await ocrTesseract.ReadAsync(ocrInput);
// Advanced document reading
// Second pass over the same input; only its Tables collection is used below.
var advancedResult = ocrTesseract.ReadDocumentAdvanced(ocrInput);
// StringBuilder to store the extracted content
StringBuilder sb = new StringBuilder();
// Counter for tracking tables and other elements
// Both counters accumulate across all pages and are reported in the success message.
int tableCount = 0;
int textBlockCount = 0;
// Process all pages
for (int pageIndex = 0; pageIndex < ocrResult.PageCount; pageIndex++)
{
// NOTE(review): this literal spans two source lines and contains nothing but the line break;
// it presumably held a page-opening tag that was stripped during extraction — confirm.
sb.AppendLine($"
");
// Extract tables for the page
// Selects tables whose Page equals pageIndex + 1.
var pageTables = advancedResult.Tables.Where(t => t.Page == pageIndex + 1).ToList();
// Extract full page text
var pageText = ocrResult.Pages[pageIndex].Text;
// Split text into lines
var textLines = pageText.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
// Separate text before, between, and after tables
// NOTE(review): List is used without a type argument; both lists hold strings below, so this was
// presumably List<string> before extraction — confirm.
var textBlocks = new List();
var currentTextBlock = new List();
foreach (var line in textLines)
{
// Check if the line is part of any table
// The test is "some cell's text contains the whole trimmed line" (case-insensitive, ordinal).
// A line made of several cells' text joined together (e.g. a full header or data row) is not
// contained in any single cell, so it is NOT recognised as a table line and stays in the text.
bool isTableLine = pageTables.Any(table =>
table.CellInfos.Any(cell =>
cell.CellText.Contains(line.Trim(), StringComparison.OrdinalIgnoreCase)));
if (!isTableLine)
{
currentTextBlock.Add(line);
}
else
{
// A table line is dropped from the text and closes the block collected so far.
// If the current text block is not empty, add it to the text blocks list
if (currentTextBlock.Any())
{
textBlocks.Add(string.Join("\n", currentTextBlock).Trim());
currentTextBlock.Clear();
}
}
}
// Add the last text block if it exists
if (currentTextBlock.Any())
{
textBlocks.Add(string.Join("\n", currentTextBlock).Trim());
}
// Write text blocks and tables in the correct order
// Each iteration emits at most one text block followed by at most one table, so the output
// strictly alternates text/table by index; no position information from the page is consulted.
int tableIndex = 0;
int textBlockIndex = 0;
while (tableIndex < pageTables.Count || textBlockIndex < textBlocks.Count)
{
// Write text blocks before the first table or between tables
if (textBlockIndex < textBlocks.Count)
{
textBlockCount++;
sb.AppendLine($"{textBlocks[textBlockIndex]}");
textBlockIndex++;
}
// Write tables
if (tableIndex < pageTables.Count)
{
var table = pageTables[tableIndex];
tableCount++;
// NOTE(review): the empty literals appended before and after the cells presumably carried
// table open/close tags that were stripped during extraction — confirm.
sb.Append($"");
// Every cell is written in CellInfos order, trimmed and followed by '|'; no row breaks are emitted.
foreach (var cell in table.CellInfos)
{
sb.Append(cell.CellText.Trim());
sb.Append('|');
}
sb.Append("");
sb.AppendLine();
tableIndex++;
}
}
// NOTE(review): empty literal, presumably a stripped page-closing tag — confirm.
sb.AppendLine("");
}
return new StandardResponse
{
Status = true,
Message = $"PDF content read successfully. Found {tableCount} tables and {textBlockCount} text blocks.",
Data = sb.ToString()
};
}
catch (Exception ex)
{
// Any exception is logged and reported to the caller via its message instead of being rethrown.
_logger.LogError(ex, "Error reading PDF file");
return new StandardResponse
{
Status = false,
Message = ex.Message
};
}
}
Вот вывод:
G.
Employment: For each employer during last 5 years, please state:
Name of Employer City Start and End Date Occupation
Name of Employer|City|Start and End Date
of Employment|Occupation|RKS Construction|Pasadena|2005 to Present|Carpenter|||||||||||||||||||||||||
RKS Construction Pasadena 2005 to Present Carpenter
H. Other Claims(Litigation
1. Have you ever been a party to a lawsuit other than in the present lawsuit, seeking
civilmonetary damages YES L NO
If YES, identify the following as to each:
Caption Case
Caption Case
No.:|Date Filed|CityState
of Court|Nature of
Action|Outcome|Your Lawyer's
Name Address|NIA|N|NA|NA|NA|A|||||||||||||
Your Lawyers
Name Address
NJA
NJA
NJA
NJA NJA
NJA
Как видно из вывода, строка заголовков таблицы (Name of Employer, City, Start and End Date, Occupation) также попала в текстовый блок. Я не могу найти подход, который позволил бы с этим справиться.
Подробнее здесь: https://stackoverflow.com/questions/794 ... -page-text