Извлечь текст Bangla из изображения

Извлечь текст Bangla из изображения ⇐ C#

1 сообщение • Страница 1 из 1

Anonymous

Цитата

Сообщение Anonymous » 20 фев 2026, 20:16

// Batch-OCR every *.jpg in a user-selected folder and export the voter cards
// recognised on each page to a UTF-8 CSV file written into that same folder.
// Relies on GetFirstLine, GetValue, EscapeCsv and rtbStatus, which are defined
// elsewhere in the form.
var ocr = new IronTesseract();
ocr.Language = OcrLanguage.Bengali;

// Optimization for high-density forms (18 boxes)
ocr.Configuration.ReadBarCodes = false;
ocr.Configuration.PageSegmentationMode = TesseractPageSegmentationMode.Auto;

using (FolderBrowserDialog fbd = new FolderBrowserDialog()) {
    if (fbd.ShowDialog() == DialogResult.OK) {
        string inputFolder = fbd.SelectedPath;
        string outputFile = Path.Combine(inputFolder, "VoterData_18Box_Export.csv");

        try {
            string[] files = Directory.GetFiles(inputFolder, "*.jpg");
            // Directory.GetFiles does not guarantee any ordering; sort so the
            // CSV rows come out in a reproducible, file-name order.
            Array.Sort(files, StringComparer.OrdinalIgnoreCase);
            int totalRecords = 0;

            using (var writer = new StreamWriter(outputFile, false, Encoding.UTF8)) {
                // Writing CSV Header
                writer.WriteLine("নাম,ভোটার নং,পিতা,মাতা,পেশা,ঠিকানা");

                foreach (string filePath in files) {
                    using (var input = new OcrInput(filePath)) {
                        // Image optimization for dense text
                        input.DeNoise();
                        input.Contrast();

                        var result = ocr.Read(input);
                        // NOTE(review): the text is consumed in whatever order the OCR
                        // engine emits it. If boxes arrive out of row order, consider
                        // ordering the recognised regions by their page coordinates
                        // (top, then left) before splitting - confirm the available
                        // members against the IronOCR result documentation.
                        string fullText = result.Text;

                        // SPLIT LOGIC: 18 boxes usually means 18 "নাম:" labels.
                        // Split on the label (colon or visarga variant) at the start of
                        // the text or after whitespace to isolate each person's box.
                        var blocks = Regex.Split(fullText, @"(?:\s+|^)নাম[:ঃ]", RegexOptions.Multiline)
                            .Where(b => !string.IsNullOrWhiteSpace(b))
                            .ToList();

                        foreach (string block in blocks) {
                            // Skip noise: a real card carries at least one of these labels.
                            if (!block.Contains("ভোটার নং:") && !block.Contains("পিতা:")) {
                                continue;
                            }

                            // 1. Name: the block starts right after the "নাম:" label, so the
                            //    name is expected on its first line (per GetFirstLine).
                            string name = GetFirstLine(block);

                            // 2. Remaining fields are looked up by their labels.
                            string voterId = GetValue(block, "ভোটার নং:");
                            string father = GetValue(block, "পিতা:");
                            string mother = GetValue(block, "মাতা:");
                            string job = GetValue(block, "পেশা:");
                            string address = GetValue(block, "ঠিকানা:");

                            // 3. Write to CSV ("$" must directly precede the quote).
                            string line = $"{EscapeCsv(name)},{EscapeCsv(voterId)},{EscapeCsv(father)},{EscapeCsv(mother)},{EscapeCsv(job)},{EscapeCsv(address)}";
                            writer.WriteLine(line);
                            totalRecords++;
                        }
                        // Flush after every image so finished pages survive a later failure.
                        writer.Flush();
                    }
                    // totalRecords is cumulative across all files processed so far.
                    rtbStatus.AppendText($"Processed: {Path.GetFileName(filePath)} (Found {totalRecords} total)\n");
                }
            }
            MessageBox.Show($"সম্পন্ন হয়েছে!\nমোট ছবি: {files.Length}\nমোট ভোটার: {totalRecords}");
        }
        catch (Exception ex) {
            MessageBox.Show("Error: " + ex.Message);
        }
    }
}

** На моём изображении всего 18 блоков: 6 строк по 3 столбца.

** Мне нужно читать каждую строку слева направо.

** OCR читает блоки в каждой строке слева направо, но иногда перескакивает сверху вниз, не дочитав строку. Пример: прочитав последний блок строки № 3, OCR должен перейти к строке № 4 и читать её слева направо, но вместо этого он сразу переходит к последнему блоку строки № 4, поэтому первые два блока этой строки пропускаются.

Подробнее здесь: https://stackoverflow.com/questions/798 ... from-image

1771607782

Anonymous

[code]// Batch-OCR every *.jpg in a user-selected folder and export the voter cards
// recognised on each page to a UTF-8 CSV file written into that same folder.
// Relies on GetFirstLine, GetValue, EscapeCsv and rtbStatus, which are defined
// elsewhere in the form.
var ocr = new IronTesseract();
ocr.Language = OcrLanguage.Bengali;

// Optimization for high-density forms (18 boxes)
ocr.Configuration.ReadBarCodes = false;
ocr.Configuration.PageSegmentationMode = TesseractPageSegmentationMode.Auto;

using (FolderBrowserDialog fbd = new FolderBrowserDialog()) {
    if (fbd.ShowDialog() == DialogResult.OK) {
        string inputFolder = fbd.SelectedPath;
        string outputFile = Path.Combine(inputFolder, "VoterData_18Box_Export.csv");

        try {
            string[] files = Directory.GetFiles(inputFolder, "*.jpg");
            // Directory.GetFiles does not guarantee any ordering; sort so the
            // CSV rows come out in a reproducible, file-name order.
            Array.Sort(files, StringComparer.OrdinalIgnoreCase);
            int totalRecords = 0;

            using (var writer = new StreamWriter(outputFile, false, Encoding.UTF8)) {
                // Writing CSV Header
                writer.WriteLine("নাম,ভোটার নং,পিতা,মাতা,পেশা,ঠিকানা");

                foreach (string filePath in files) {
                    using (var input = new OcrInput(filePath)) {
                        // Image optimization for dense text
                        input.DeNoise();
                        input.Contrast();

                        var result = ocr.Read(input);
                        // NOTE(review): the text is consumed in whatever order the OCR
                        // engine emits it. If boxes arrive out of row order, consider
                        // ordering the recognised regions by their page coordinates
                        // (top, then left) before splitting - confirm the available
                        // members against the IronOCR result documentation.
                        string fullText = result.Text;

                        // SPLIT LOGIC: 18 boxes usually means 18 "নাম:" labels.
                        // Split on the label (colon or visarga variant) at the start of
                        // the text or after whitespace to isolate each person's box.
                        var blocks = Regex.Split(fullText, @"(?:\s+|^)নাম[:ঃ]", RegexOptions.Multiline)
                            .Where(b => !string.IsNullOrWhiteSpace(b))
                            .ToList();

                        foreach (string block in blocks) {
                            // Skip noise: a real card carries at least one of these labels.
                            if (!block.Contains("ভোটার নং:") && !block.Contains("পিতা:")) {
                                continue;
                            }

                            // 1. Name: the block starts right after the "নাম:" label, so the
                            //    name is expected on its first line (per GetFirstLine).
                            string name = GetFirstLine(block);

                            // 2. Remaining fields are looked up by their labels.
                            string voterId = GetValue(block, "ভোটার নং:");
                            string father = GetValue(block, "পিতা:");
                            string mother = GetValue(block, "মাতা:");
                            string job = GetValue(block, "পেশা:");
                            string address = GetValue(block, "ঠিকানা:");

                            // 3. Write to CSV ("$" must directly precede the quote).
                            string line = $"{EscapeCsv(name)},{EscapeCsv(voterId)},{EscapeCsv(father)},{EscapeCsv(mother)},{EscapeCsv(job)},{EscapeCsv(address)}";
                            writer.WriteLine(line);
                            totalRecords++;
                        }
                        // Flush after every image so finished pages survive a later failure.
                        writer.Flush();
                    }
                    // totalRecords is cumulative across all files processed so far.
                    rtbStatus.AppendText($"Processed: {Path.GetFileName(filePath)} (Found {totalRecords} total)\n");
                }
            }
            MessageBox.Show($"সম্পন্ন হয়েছে!\nমোট ছবি: {files.Length}\nমোট ভোটার: {totalRecords}");
        }
        catch (Exception ex) {
            MessageBox.Show("Error: " + ex.Message);
        }
    }
}
[/code]
** На моём изображении всего 18 блоков: 6 строк по 3 столбца.

** Мне нужно читать каждую строку слева направо.

** OCR читает блоки в каждой строке слева направо, но иногда перескакивает сверху вниз, не дочитав строку. [b]Пример[/b]: прочитав последний блок строки № 3, OCR должен перейти к строке № 4 и читать её слева направо, но вместо этого он сразу переходит к последнему блоку строки № 4, поэтому первые два блока этой строки пропускаются.

Подробнее здесь: [url]https://stackoverflow.com/questions/79892683/extract-bangla-text-from-image[/url]