Как получить правильную ограничивающую рамку списка и таблицы с помощью pdfbox из PDF-документа с тегами

Как получить правильную ограничивающую рамку списка и таблицы с помощью pdfbox из PDF-документа с тегами ⇐ JAVA

1 сообщение • Страница 1 из 1

Anonymous

Как получить правильную ограничивающую рамку списка и таблицы с помощью pdfbox из PDF-документа с тегами

Цитата

Сообщение Anonymous » 26 июн 2024, 07:35

Я пытаюсь сохранить тег из документа PDF с тегами. Я попробовал один подход, но мне нужно

получить правильную ограничивающую рамку таблицы и списка, используя следующий код. пожалуйста, помогите кому-нибудь в этом.

Код: Выделить всё

    public void Process(TaggedJsonPart taggedJsonPart) throws IOException {
Map
> markedContents = new HashMap();

for (PDPage page : document.getPages()) {
PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
extractor.processPage(page);
Map theseMarkedContents = new HashMap();
markedContents.put(page, theseMarkedContents);
for (PDMarkedContent markedContent : extractor.getMarkedContents()) {
addToMap(theseMarkedContents, markedContent);
}
}

PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
Map visualizations = new HashMap();

showStructure(document, root, markedContents, visualizations, 0, null, null);

}

Код: Выделить всё

Map showStructure(PDDocument document, PDStructureNode node,
Map markedContents, Map visualizations,
Integer _depth, TaggedPdfContent pg, List pages) throws IOException {

Map boxes = null;
String structType = null;
PDPage page = null;
//        page = null;
if (node instanceof PDStructureElement) {
PDStructureElement element = (PDStructureElement) node;
structType = element.getStructureType();
page = element.getPage();
}
Map theseMarkedContents = markedContents.get(page);

int indexHere = index;
if ("Document".equalsIgnoreCase(structType) || "Sect".equalsIgnoreCase(structType)
|| "Part".equalsIgnoreCase(structType)) {
index = 0;
} else {
indexHere = index++;
}

for (Object object : node.getKids()) {
if (object instanceof COSArray) {
for (COSBase base : (COSArray) object) {
if (base instanceof COSDictionary) {
boxes = union(boxes, showStructure(document, PDStructureNode.create((COSDictionary) base),
markedContents, visualizations, null, pg, pages));

} else if (base instanceof COSNumber) {
boxes = union(boxes, page, showContent(((COSNumber) base).intValue(), theseMarkedContents));
} else {
// System.out.printf("?%s\n", base);
}
}
} else if (object instanceof PDStructureNode) {

if (object instanceof PDStructureElement) {
// isHavingPdStrEle =true;
_depth++;
page = ((PDStructureElement) object).getPage();
if (((PDStructureElement) object).getStandardStructureType().equals("Note")) {

Map  nboxes = union(null, showStructure(document, (PDStructureNode) object,
markedContents, visualizations, _depth, pg, pages));
this.insetNodeEleInfo(nboxes, indexHere, structType, new ArrayList(), null);
continue;
} else if (((PDStructureElement) object).getStandardStructureType().equals("Figure")) {

}
}
boxes = union(boxes, showStructure(document, (PDStructureNode) object, markedContents, visualizations,
_depth, pg, pages));
_depth--;

} else if (object instanceof Integer) {
if (page == null) {
page = ((PDStructureElement) node).getPage();
}
boxes = union(boxes, page, showContent((Integer) object, theseMarkedContents));

} else if (object instanceof PDMarkedContentReference) {
PDPage mcr_page = ((PDMarkedContentReference) object).getPage();
boxes = union(boxes, mcr_page, showContent((Integer) ((PDMarkedContentReference) object).getMCID(),
markedContents.get(mcr_page)));
} else {
System.out.printf("?%s\n", object);
}

}
if (structType != null && !structType.equalsIgnoreCase("Sect") && !structType.equalsIgnoreCase("Part")
&& !structType.equalsIgnoreCase("Document")) {

if (structType.equals("Figure")) {
// insetFigureNodeEleInfo(((PDStructureElement)node),indexHere);
} else {
List _childIds = this._ChildrenObjIdDepthWiseMap.get(_depth);
if (_childIds == null) {
_childIds = new ArrayList();

}
String ObjIDWithEleType = this.insetNodeEleInfo(boxes, indexHere, structType, _childIds, null);
_childIds = new ArrayList();
this._ChildrenObjIdDepthWiseMap.put(_depth, new ArrayList());
List _prevchildIds = this._ChildrenObjIdDepthWiseMap.get(_depth - 1);
if (_prevchildIds == null) {
_prevchildIds = new ArrayList();
this._ChildrenObjIdDepthWiseMap.put(_depth - 1, _prevchildIds);

}
_prevchildIds.add(ObjIDWithEleType);
}

}

return boxes;
}

Код: Выделить всё

Map union(Map map, PDPage page, Rectangle2D rectangle) {
if (map == null)
map = new HashMap();
map.put(page, union(map.get(page), rectangle));
return map;
}

Код: Выделить всё

Rectangle2D showContent(int mcid, Map theseMarkedContents) throws IOException {
Rectangle2D box = null;
PDMarkedContent markedContent = theseMarkedContents != null ? theseMarkedContents.get(mcid) : null;
List contents = markedContent != null ? markedContent.getContents() : Collections.emptyList();
StringBuilder textContent = new StringBuilder();
for (Object object : contents) {
if (object instanceof TextPosition) {
TextPosition textPosition = (TextPosition) object;
textContent.append(textPosition.getUnicode());
int[] codes = textPosition.getCharacterCodes();
if (codes.length != 1) {
System.out.printf("", codes.length);
} else {
box = union(box,
calculateGlyphBounds(textPosition.getTextMatrix(), textPosition.getFont(), codes[0])
.getBounds2D());
}
} else if (object instanceof PDMarkedContent) {
PDMarkedContent thisMarkedContent = (PDMarkedContent) object;
box = union(box, showContent(thisMarkedContent.getMCID(), theseMarkedContents));
} else {
textContent.append("?"  + object);
}
}

return box;
}

Код: Выделить всё

private Shape calculateGlyphBounds(Matrix textRenderingMatrix, PDFont font, int code) throws IOException {
GeneralPath path = null;
AffineTransform at = textRenderingMatrix.createAffineTransform();
at.concatenate(font.getFontMatrix().createAffineTransform());
if (font instanceof PDType3Font) {
// It is difficult to calculate the real individual glyph bounds for type 3
// fonts
// because these are not vector fonts, the content stream could contain almost
// anything
// that is found in page content streams.
PDType3Font t3Font = (PDType3Font) font;
PDType3CharProc charProc = t3Font.getCharProc(code);
if (charProc != null) {
BoundingBox fontBBox = t3Font.getBoundingBox();
PDRectangle glyphBBox = charProc.getGlyphBBox();
if (glyphBBox != null) {
// PDFBOX-3850: glyph bbox could be larger than the font bbox
glyphBBox.setLowerLeftX(Math.max(fontBBox.getLowerLeftX(), glyphBBox.getLowerLeftX()));
glyphBBox.setLowerLeftY(Math.max(fontBBox.getLowerLeftY(), glyphBBox.getLowerLeftY()));
glyphBBox.setUpperRightX(Math.min(fontBBox.getUpperRightX(), glyphBBox.getUpperRightX()));
glyphBBox.setUpperRightY(Math.min(fontBBox.getUpperRightY(), glyphBBox.getUpperRightY()));
path = glyphBBox.toGeneralPath();
}
}
} else if (font instanceof PDVectorFont) {
PDVectorFont vectorFont = (PDVectorFont) font;
path = vectorFont.getPath(code);

if (font instanceof PDTrueTypeFont) {
PDTrueTypeFont ttFont = (PDTrueTypeFont) font;
int unitsPerEm = ttFont.getTrueTypeFont().getHeader().getUnitsPerEm();
at.scale(1000d / unitsPerEm, 1000d / unitsPerEm);
}
if (font instanceof PDType0Font) {
PDType0Font t0font = (PDType0Font) font;
if (t0font.getDescendantFont() instanceof PDCIDFontType2) {
int unitsPerEm = ((PDCIDFontType2) t0font.getDescendantFont()).getTrueTypeFont().getHeader()
.getUnitsPerEm();
at.scale(1000d / unitsPerEm, 1000d / unitsPerEm);
}
}
} else if (font instanceof PDSimpleFont) {
PDSimpleFont simpleFont = (PDSimpleFont) font;

// these two lines do not always work, e.g. for the TT fonts in file 032431.pdf
// which is why PDVectorFont is tried first.
String name = simpleFont.getEncoding().getName(code);
path = simpleFont.getPath(name);
} else {
// shouldn't happen, please open issue in JIRA
System.out.println("Unknown font class: " + font.getClass());
}
if (path == null) {
return null;
}

return at.createTransformedShape(path.getBounds2D());
}

Итак, я использовал приведенный выше код, чтобы сохранить тег из PDF-файла с тегами и сохранить его как JSon. Но ограничивающая рамка таблицы и списка не становится правильной. Ограничение абзацев и заголовков корректируется согласно приведенной выше логике. поэтому, пожалуйста, помогите. Заранее спасибо.

Подробнее здесь: https://stackoverflow.com/questions/786 ... gged-pdf-d

1719376559

Anonymous

Я пытаюсь сохранить тег из документа PDF с тегами. Я попробовал один подход, но мне нужно

получить правильную ограничивающую рамку таблицы и списка, используя следующий код.  пожалуйста, помогите кому-нибудь в этом.
[code]    public void Process(TaggedJsonPart taggedJsonPart) throws IOException {
Map
> markedContents = new HashMap();

for (PDPage page : document.getPages()) {
PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
extractor.processPage(page);
Map theseMarkedContents = new HashMap();
markedContents.put(page, theseMarkedContents);
for (PDMarkedContent markedContent : extractor.getMarkedContents()) {
addToMap(theseMarkedContents, markedContent);
}
}

PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
Map visualizations = new HashMap();

showStructure(document, root, markedContents, visualizations, 0, null, null);

}
[/code]
[code]Map showStructure(PDDocument document, PDStructureNode node,
Map markedContents, Map visualizations,
Integer _depth, TaggedPdfContent pg, List pages) throws IOException {

Map boxes = null;
String structType = null;
PDPage page = null;
//        page = null;
if (node instanceof PDStructureElement) {
PDStructureElement element = (PDStructureElement) node;
structType = element.getStructureType();
page = element.getPage();
}
Map theseMarkedContents = markedContents.get(page);

int indexHere = index;
if ("Document".equalsIgnoreCase(structType) || "Sect".equalsIgnoreCase(structType)
|| "Part".equalsIgnoreCase(structType)) {
index = 0;
} else {
indexHere = index++;
}

for (Object object : node.getKids()) {
if (object instanceof COSArray) {
for (COSBase base : (COSArray) object) {
if (base instanceof COSDictionary) {
boxes = union(boxes, showStructure(document, PDStructureNode.create((COSDictionary) base),
markedContents, visualizations, null, pg, pages));

} else if (base instanceof COSNumber) {
boxes = union(boxes, page, showContent(((COSNumber) base).intValue(), theseMarkedContents));
} else {
// System.out.printf("?%s\n", base);
}
}
} else if (object instanceof PDStructureNode) {

if (object instanceof PDStructureElement) {
// isHavingPdStrEle =true;
_depth++;
page = ((PDStructureElement) object).getPage();
if (((PDStructureElement) object).getStandardStructureType().equals("Note")) {

Map  nboxes = union(null, showStructure(document, (PDStructureNode) object,
markedContents, visualizations, _depth, pg, pages));
this.insetNodeEleInfo(nboxes, indexHere, structType, new ArrayList(), null);
continue;
} else if (((PDStructureElement) object).getStandardStructureType().equals("Figure")) {

}
}
boxes = union(boxes, showStructure(document, (PDStructureNode) object, markedContents, visualizations,
_depth, pg, pages));
_depth--;

} else if (object instanceof Integer) {
if (page == null) {
page = ((PDStructureElement) node).getPage();
}
boxes = union(boxes, page, showContent((Integer) object, theseMarkedContents));

} else if (object instanceof PDMarkedContentReference) {
PDPage mcr_page = ((PDMarkedContentReference) object).getPage();
boxes = union(boxes, mcr_page, showContent((Integer) ((PDMarkedContentReference) object).getMCID(),
markedContents.get(mcr_page)));
} else {
System.out.printf("?%s\n", object);
}

}
if (structType != null && !structType.equalsIgnoreCase("Sect") && !structType.equalsIgnoreCase("Part")
&& !structType.equalsIgnoreCase("Document")) {

if (structType.equals("Figure")) {
// insetFigureNodeEleInfo(((PDStructureElement)node),indexHere);
} else {
List _childIds = this._ChildrenObjIdDepthWiseMap.get(_depth);
if (_childIds == null) {
_childIds = new ArrayList();

}
String ObjIDWithEleType = this.insetNodeEleInfo(boxes, indexHere, structType, _childIds, null);
_childIds = new ArrayList();
this._ChildrenObjIdDepthWiseMap.put(_depth, new ArrayList());
List _prevchildIds = this._ChildrenObjIdDepthWiseMap.get(_depth - 1);
if (_prevchildIds == null) {
_prevchildIds = new ArrayList();
this._ChildrenObjIdDepthWiseMap.put(_depth - 1, _prevchildIds);

}
_prevchildIds.add(ObjIDWithEleType);
}

}

return boxes;
}
[/code]
[code]Map union(Map map, PDPage page, Rectangle2D rectangle) {
if (map == null)
map = new HashMap();
map.put(page, union(map.get(page), rectangle));
return map;
}
[/code]
[code]Rectangle2D showContent(int mcid, Map theseMarkedContents) throws IOException {
Rectangle2D box = null;
PDMarkedContent markedContent = theseMarkedContents != null ? theseMarkedContents.get(mcid) : null;
List contents = markedContent != null ? markedContent.getContents() : Collections.emptyList();
StringBuilder textContent = new StringBuilder();
for (Object object : contents) {
if (object instanceof TextPosition) {
TextPosition textPosition = (TextPosition) object;
textContent.append(textPosition.getUnicode());
int[] codes = textPosition.getCharacterCodes();
if (codes.length != 1) {
System.out.printf("", codes.length);
} else {
box = union(box,
calculateGlyphBounds(textPosition.getTextMatrix(), textPosition.getFont(), codes[0])
.getBounds2D());
}
} else if (object instanceof PDMarkedContent) {
PDMarkedContent thisMarkedContent = (PDMarkedContent) object;
box = union(box, showContent(thisMarkedContent.getMCID(), theseMarkedContents));
} else {
textContent.append("?"  + object);
}
}

return box;
}
[/code]
[code]private Shape calculateGlyphBounds(Matrix textRenderingMatrix, PDFont font, int code) throws IOException {
GeneralPath path = null;
AffineTransform at = textRenderingMatrix.createAffineTransform();
at.concatenate(font.getFontMatrix().createAffineTransform());
if (font instanceof PDType3Font) {
// It is difficult to calculate the real individual glyph bounds for type 3
// fonts
// because these are not vector fonts, the content stream could contain almost
// anything
// that is found in page content streams.
PDType3Font t3Font = (PDType3Font) font;
PDType3CharProc charProc = t3Font.getCharProc(code);
if (charProc != null) {
BoundingBox fontBBox = t3Font.getBoundingBox();
PDRectangle glyphBBox = charProc.getGlyphBBox();
if (glyphBBox != null) {
// PDFBOX-3850: glyph bbox could be larger than the font bbox
glyphBBox.setLowerLeftX(Math.max(fontBBox.getLowerLeftX(), glyphBBox.getLowerLeftX()));
glyphBBox.setLowerLeftY(Math.max(fontBBox.getLowerLeftY(), glyphBBox.getLowerLeftY()));
glyphBBox.setUpperRightX(Math.min(fontBBox.getUpperRightX(), glyphBBox.getUpperRightX()));
glyphBBox.setUpperRightY(Math.min(fontBBox.getUpperRightY(), glyphBBox.getUpperRightY()));
path = glyphBBox.toGeneralPath();
}
}
} else if (font instanceof PDVectorFont) {
PDVectorFont vectorFont = (PDVectorFont) font;
path = vectorFont.getPath(code);

if (font instanceof PDTrueTypeFont) {
PDTrueTypeFont ttFont = (PDTrueTypeFont) font;
int unitsPerEm = ttFont.getTrueTypeFont().getHeader().getUnitsPerEm();
at.scale(1000d / unitsPerEm, 1000d / unitsPerEm);
}
if (font instanceof PDType0Font) {
PDType0Font t0font = (PDType0Font) font;
if (t0font.getDescendantFont() instanceof PDCIDFontType2) {
int unitsPerEm = ((PDCIDFontType2) t0font.getDescendantFont()).getTrueTypeFont().getHeader()
.getUnitsPerEm();
at.scale(1000d / unitsPerEm, 1000d / unitsPerEm);
}
}
} else if (font instanceof PDSimpleFont) {
PDSimpleFont simpleFont = (PDSimpleFont) font;

// these two lines do not always work, e.g. for the TT fonts in file 032431.pdf
// which is why PDVectorFont is tried first.
String name = simpleFont.getEncoding().getName(code);
path = simpleFont.getPath(name);
} else {
// shouldn't happen, please open issue in JIRA
System.out.println("Unknown font class: " + font.getClass());
}
if (path == null) {
return null;
}

return at.createTransformedShape(path.getBounds2D());
}
[/code]
Итак, я использовал приведенный выше код, чтобы сохранить тег из PDF-файла с тегами и сохранить его как JSon. Но ограничивающая рамка таблицы и списка не становится правильной. Ограничение абзацев и заголовков корректируется согласно приведенной выше логике. поэтому, пожалуйста, помогите.  Заранее спасибо.
 

Подробнее здесь: [url]https://stackoverflow.com/questions/78670122/how-to-get-correct-bounding-box-of-list-and-table-using-pdfbox-from-tagged-pdf-d[/url]

Ответить Пред. тема След. тема

1 сообщение • Страница 1 из 1

Быстрый ответ

Заголовок:

Имя пользователя:

Изменение регистра текста:

Смайлики

Ещё смайлики…

К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми. Можно прикреплять файлы, перетаскивая их в окно сообщения.

Максимально разрешённый размер вложения: 15 МБ.

Имя файла:

Комментарий к файлу:

Имя файла	Комментарий к файлу	Размер	Статус

Похожие темы

Ответы

Просмотры

Последнее сообщение

Как получить правильную ограничивающую рамку списка и таблицы с помощью pdfbox из PDF-документа с тегами

Последнее сообщение Anonymous « 26 июн 2024, 11:13
Добавлено в форуме JAVA

Anonymous » 26 июн 2024, 11:13 » в форуме JAVA

Я пытаюсь сохранить тег из документа PDF с тегами. Я попробовал один подход, но мне нужно

получить правильную ограничивающую рамку таблицы и списка, используя следующий код. пожалуйста, помогите кому-нибудь в этом.
public void...

0 Ответы

29 Просмотры

Последнее сообщение Anonymous
26 июн 2024, 11:13
PDFBox получает ограничивающую рамку текстовых результатов неправильного размера

Последнее сообщение Anonymous « 23 июл 2024, 13:03
Добавлено в форуме JAVA

Anonymous » 23 июл 2024, 13:03 » в форуме JAVA

Я хочу использовать PDFBox, чтобы получить тексты из PDF-файла с их ограничивающими рамками. Мне удалось собрать здесь код, который делает почти это, но, как вы можете видеть, полученные мной ограничивающие рамки (нарисованные синим цветом) имеют...

0 Ответы

9 Просмотры

Последнее сообщение Anonymous
23 июл 2024, 13:03
Как удалить изображения с тегами из PDF-файла с помощью PDFBox в Android Java

Последнее сообщение Anonymous « 01 июл 2024, 18:27
Добавлено в форуме JAVA

Anonymous » 01 июл 2024, 18:27 » в форуме JAVA

Я разрабатываю приложение для Android, в котором мне нужно добавлять и удалять изображения из PDF-документа. Для этой цели я использую библиотеку PDFBox. Я успешно добавил изображения в PDF-файл с определенным тегом, но пытаюсь удалить их на основе...

0 Ответы

12 Просмотры

Последнее сообщение Anonymous
01 июл 2024, 18:27
Как удалить изображения с тегами из PDF-файла с помощью PDFBox в Android Java

Последнее сообщение Anonymous « 01 июл 2024, 18:27
Добавлено в форуме Android

Anonymous » 01 июл 2024, 18:27 » в форуме Android

Я разрабатываю приложение для Android, в котором мне нужно добавлять и удалять изображения из PDF-документа. Для этой цели я использую библиотеку PDFBox. Я успешно добавил изображения в PDF-файл с определенным тегом, но пытаюсь удалить их на основе...

0 Ответы

9 Просмотры

Последнее сообщение Anonymous
01 июл 2024, 18:27
Как добавить тег аннотации в PDF с тегами с помощью PDFBox

Последнее сообщение Anonymous « 15 окт 2024, 11:59
Добавлено в форуме JAVA

Anonymous » 15 окт 2024, 11:59 » в форуме JAVA

У меня есть PDF-файл с тегами (соответствует PDF-UA) с простой структурой:

Мне нужно добавить аннотацию:

Sample comment

во вложенном теге Annot в теге P. Вот так (я сделал вручную в Adobe Acrobat):

Я могу импортировать XML-файл...

0 Ответы

13 Просмотры

Последнее сообщение Anonymous
15 окт 2024, 11:59

Вернуться в «JAVA»