Как я могу извлечь пронумерованные списки, пули и выравнивание контента, если файл docx с помощью Python Docx

Как я могу извлечь пронумерованные списки, пули и выравнивание контента, если файл docx с помощью Python Docx ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Как я могу извлечь пронумерованные списки, пули и выравнивание контента, если файл docx с помощью Python Docx

Цитата

Сообщение Anonymous » 20 фев 2025, 01:18

Я могу извлечь содержимое файла DOCX с помощью Python's Python-Docx, и я генерирую HTML-файл из извлеченного контента. Тем не менее, я изо всех сил пытаюсь убедиться, что содержимое правильно выровнено, и у меня также возникают трудности с извлечением пронумерованных списков, пуль и стилей правильно. Мой текущий код функционирует хорошо, и я вставлю его ниже. Пожалуйста, помогите мне выяснить, как правильно извлечь пронумерованные списки, пули и применять стили при создании HTML. Хотя я могу извлечь стили, они не применяются правильно в сгенерированном HTML. Мои основные проблемы заключаются в обработке пронумерованных списков и пуль и выравнивания текста < /p>
код: < /p>

Код: Выделить всё

from docx.oxml.ns import qn
import json
import re
from docx import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P

def extract_elements(docx_file):
"""Extract all paragraphs and tables in the order they appear."""
document = Document(docx_file)
elements = []

for child in document.element.body:
if isinstance(child, CT_P):  # If it's a paragraph
paragraph_data = _get_paragraph(child, document)
if paragraph_data:
elements.append(paragraph_data)

elif isinstance(child, CT_Tbl):  # If it's a table
table_data = _get_table(child, document)
if table_data:
elements.append({"type": "table", "data": table_data})

return elements

def _get_paragraph(element, document):
"""Extract paragraph text, bullet/numbering style, and format."""
try:
# Find the matching paragraph in the document by element reference
for paragraph in document.paragraphs:
if paragraph._element == element:
text = paragraph.text.strip()
alignment = _get_alignment(paragraph)  # Get paragraph alignment
numbered_list = _get_numbered_list(paragraph)  # Detect numbered list

return {
"type": "paragraph",
"text": text,
"style": paragraph.style.name if paragraph.style else "",
"alignment": alignment,
"numberedlist": numbered_list,
}

except Exception as e:
print(f"Error in _get_paragraph: {e}")
return None

def _get_alignment(paragraph):
"""Get alignment of the paragraph."""
alignment_map = {0: "left", 1: "center", 2: "right", 3: "justify"}
return alignment_map.get(paragraph.alignment, "left")  # Default to left alignment

def _get_numbered_list(paragraph):
"""Detect whether the paragraph has a bullet/numbered style and retrieve its details."""
try:
# Access the XML element directly
p = paragraph._element

# Print the entire XML of the paragraph for debugging
# print(p.xml)
xml_content = p.xml

# Append the XML content to the file
with open("D://newxml.xml", "a", encoding="utf-8") as xml_file:
# Optional: Add a separator between paragraphs for clarity
xml_file.write(f"\n\n{xml_content}\n")

# Accessing the paragraph properties
pPr = p.find(qn("w:pPr"))

if pPr is not None:
# Try to find numPr in the paragraph properties
numPr = pPr.find(qn("w:numPr"))
print(f"numPr found: {numPr is not None}")  # Debugging output

if numPr is not None:
numId = numPr.find(qn("w:numId"))
ilvl = numPr.find(qn("w:ilvl"))

if numId is not None and ilvl is not None:
numId_val = numId.get(qn("w:val"))
ilvl_val = ilvl.get(qn("w:val"))
return _get_number_format(paragraph.part, numId_val, ilvl_val)
else:
print("numPr is None, check paragraph properties.")
else:
print("pPr is None, check paragraph element.")

return None
except Exception as e:
print(f"Error in _get_bullet_style:  {e}")
return None

def _get_number_format(doc_part, numId, ilvl):
"""Retrieve the numbering format from the DOCX numbering part."""
try:
numbering_part = doc_part.numbering_part  # Access numbering definitions
xpath = f".//w:num[@w:numId='{numId}']//w:abstractNumId"
abstract_num = numbering_part.element.xpath(xpath)

if abstract_num:
abstract_num_id = abstract_num[0].get(qn("w:val"))
level_xpath = f".//w:abstractNum[@w:abstractNumId='{abstract_num_id}']//w:lvl[@w:ilvl='{ilvl}']//w:numFmt"
num_fmt = numbering_part.element.xpath(level_xpath)

if num_fmt:
return {
"numId": numId,
"level": int(ilvl),
"format": num_fmt[0].get(qn("w:val")),
}
return None
except Exception as e:
print(f"Error in _get_number_format: {e}")

def _get_table(element, document):
"""Extract table data ensuring correct structure."""
for table in document.tables:
if table._element == element:
return _extract_table_content(table)
return None

def _extract_table_content(table):
"""Ensure correct table row and column consistency."""
rows = []
max_cols = max(len(row.cells) for row in table.rows)  # Get the max columns
for row in table.rows:
row_data = []
for cell in row.cells:
cell_text = cell.text.strip()

# Avoid appending duplicate content in the same row
if cell_text not in row_data:
row_data.append(cell_text)

# Ensure row has consistent number of columns
while len(row_data) < max_cols:
row_data.append("")  # Pad missing cells with empty strings

rows.append(row_data)

return rows

def save_elements_to_json(elements, file_path):
"""Save extracted elements to a JSON file."""
with open(file_path, "w", encoding="utf-8") as f:
json.dump(elements, f, ensure_ascii=False, indent=4)

def save_elements_to_html(elements, file_path):
"""Convert extracted elements to properly aligned HTML."""
html_content = ""
in_list = False  # Track whether we are in a list
in_toc_table = False  # Track if we're inside a TOC table
list_type = ""  # Keep track of the list type (decimal, lowerLetter, upperLetter)

for element in elements:
if element["type"] == "paragraph":
numbered_list = element.get("numberedlist")  # Safely get 'numbered_list'

# Handle TOC entries
if element["style"] == "toc 1":
if not in_toc_table:
# Start a new TOC table
html_content += ''
html_content += (
"ClauseHeadingPage"
)
in_toc_table = True  # Mark that we are inside the TOC table

# Split the text by tabs and add as a row in the table
row_data = element["text"].split("\t")
html_content += ""
for cell in row_data:
html_content += (
f'{cell}'
)
html_content += ""
else:
# Close the TOC table if we encounter non-TOC content
if in_toc_table:
html_content += ""
in_toc_table = False

# Handle numbered/bullet list
if numbered_list:  # Check if the paragraph belongs to a list
if not in_list:
# Set the list type based on the numbering format
list_format = numbered_list["format"]
if list_format == "decimal":
list_type = "1"  # Decimal list
elif list_format == "lowerLetter":
list_type = "a"  # Lowercase letters
elif list_format == "upperLetter":
list_type = "A"  # Uppercase letters
else:
list_type = ""  # Default case

html_content += f'[list]'  # Start a new ordered list
in_list = True

# Indent the list item based on its level
indent = numbered_list["level"] * 20
html_content += (
f'[*]'
f'{element["text"]}'
)
else:  # Handle non-list paragraphs
if in_list:
html_content += "[/list]"  # Close the list if we were in one
in_list = False

# Render the paragraph with alignment
alignment = element.get(
"alignment", "left"
)  # Default to 'left' if missing
style = element.get(
"style", "Normal"
)  # Default style if not available
html_content += (
f'
'
f'{element["text"]}
'
)

elif element["type"] == "table":
# Close any open list before starting a new table
if in_list:
html_content += ""
in_list = False

# Render table content
html_content += (
''
)
for row in element["data"]:
html_content += ""
for cell in row:
html_content += (
f'{cell}'
)
html_content += ""
html_content += ""

# Close any remaining open list or table
if in_list:
html_content += ""
if in_toc_table:
html_content += ""

html_content += ""

with open(file_path, "w", encoding="utf-8") as f:
f.write(html_content)

if __name__ == "__main__":

docx_file = "/Services Agreement.docx"
elements = extract_elements(docx_file)
save_elements_to_json(elements, "D://extracted_content.json")
save_elements_to_html(elements, "D://extracted_content.html")

Я ожидаю должным образом извлечь нумерации, пуль и стили

Подробнее здесь: https://stackoverflow.com/questions/791 ... ocx-file-u

1740003528

Anonymous

 Я могу извлечь содержимое файла DOCX с помощью Python's Python-Docx, и я генерирую HTML-файл из извлеченного контента. Тем не менее, я изо всех сил пытаюсь убедиться, что содержимое правильно выровнено, и у меня также возникают трудности с извлечением пронумерованных списков, пуль и стилей правильно. Мой текущий код функционирует хорошо, и я вставлю его ниже. Пожалуйста, помогите мне выяснить, как правильно извлечь пронумерованные списки, пули и применять стили при создании HTML. Хотя я могу извлечь стили, они не применяются правильно в сгенерированном HTML.  Мои основные проблемы заключаются в обработке пронумерованных списков и пуль и выравнивания текста < /p>
код: < /p>
[code]
from docx.oxml.ns import qn
import json
import re
from docx import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P

def extract_elements(docx_file):
"""Extract all paragraphs and tables in the order they appear."""
document = Document(docx_file)
elements = []

for child in document.element.body:
if isinstance(child, CT_P):  # If it's a paragraph
paragraph_data = _get_paragraph(child, document)
if paragraph_data:
elements.append(paragraph_data)

elif isinstance(child, CT_Tbl):  # If it's a table
table_data = _get_table(child, document)
if table_data:
elements.append({"type": "table", "data": table_data})

return elements

def _get_paragraph(element, document):
"""Extract paragraph text, bullet/numbering style, and format."""
try:
# Find the matching paragraph in the document by element reference
for paragraph in document.paragraphs:
if paragraph._element == element:
text = paragraph.text.strip()
alignment = _get_alignment(paragraph)  # Get paragraph alignment
numbered_list = _get_numbered_list(paragraph)  # Detect numbered list

return {
"type": "paragraph",
"text": text,
"style": paragraph.style.name if paragraph.style else "",
"alignment": alignment,
"numberedlist": numbered_list,
}

except Exception as e:
print(f"Error in _get_paragraph: {e}")
return None

def _get_alignment(paragraph):
"""Get alignment of the paragraph."""
alignment_map = {0: "left", 1: "center", 2: "right", 3: "justify"}
return alignment_map.get(paragraph.alignment, "left")  # Default to left alignment

def _get_numbered_list(paragraph):
"""Detect whether the paragraph has a bullet/numbered style and retrieve its details."""
try:
# Access the XML element directly
p = paragraph._element

# Print the entire XML of the paragraph for debugging
# print(p.xml)
xml_content = p.xml

# Append the XML content to the file
with open("D://newxml.xml", "a", encoding="utf-8") as xml_file:
# Optional: Add a separator between paragraphs for clarity
xml_file.write(f"\n\n{xml_content}\n")

# Accessing the paragraph properties
pPr = p.find(qn("w:pPr"))

if pPr is not None:
# Try to find numPr in the paragraph properties
numPr = pPr.find(qn("w:numPr"))
print(f"numPr found: {numPr is not None}")  # Debugging output

if numPr is not None:
numId = numPr.find(qn("w:numId"))
ilvl = numPr.find(qn("w:ilvl"))

if numId is not None and ilvl is not None:
numId_val = numId.get(qn("w:val"))
ilvl_val = ilvl.get(qn("w:val"))
return _get_number_format(paragraph.part, numId_val, ilvl_val)
else:
print("numPr is None, check paragraph properties.")
else:
print("pPr is None, check paragraph element.")

return None
except Exception as e:
print(f"Error in _get_bullet_style:  {e}")
return None

def _get_number_format(doc_part, numId, ilvl):
"""Retrieve the numbering format from the DOCX numbering part."""
try:
numbering_part = doc_part.numbering_part  # Access numbering definitions
xpath = f".//w:num[@w:numId='{numId}']//w:abstractNumId"
abstract_num = numbering_part.element.xpath(xpath)

if abstract_num:
abstract_num_id = abstract_num[0].get(qn("w:val"))
level_xpath = f".//w:abstractNum[@w:abstractNumId='{abstract_num_id}']//w:lvl[@w:ilvl='{ilvl}']//w:numFmt"
num_fmt = numbering_part.element.xpath(level_xpath)

if num_fmt:
return {
"numId": numId,
"level": int(ilvl),
"format": num_fmt[0].get(qn("w:val")),
}
return None
except Exception as e:
print(f"Error in _get_number_format: {e}")

def _get_table(element, document):
"""Extract table data ensuring correct structure."""
for table in document.tables:
if table._element == element:
return _extract_table_content(table)
return None

def _extract_table_content(table):
"""Ensure correct table row and column consistency."""
rows = []
max_cols = max(len(row.cells) for row in table.rows)  # Get the max columns
for row in table.rows:
row_data = []
for cell in row.cells:
cell_text = cell.text.strip()

# Avoid appending duplicate content in the same row
if cell_text not in row_data:
row_data.append(cell_text)

# Ensure row has consistent number of columns
while len(row_data) < max_cols:
row_data.append("")  # Pad missing cells with empty strings

rows.append(row_data)

return rows

def save_elements_to_json(elements, file_path):
"""Save extracted elements to a JSON file."""
with open(file_path, "w", encoding="utf-8") as f:
json.dump(elements, f, ensure_ascii=False, indent=4)

def save_elements_to_html(elements, file_path):
"""Convert extracted elements to properly aligned HTML."""
html_content = ""
in_list = False  # Track whether we are in a list
in_toc_table = False  # Track if we're inside a TOC table
list_type = ""  # Keep track of the list type (decimal, lowerLetter, upperLetter)

for element in elements:
if element["type"] == "paragraph":
numbered_list = element.get("numberedlist")  # Safely get 'numbered_list'

# Handle TOC entries
if element["style"] == "toc 1":
if not in_toc_table:
# Start a new TOC table
html_content += ''
html_content += (
"ClauseHeadingPage"
)
in_toc_table = True  # Mark that we are inside the TOC table

# Split the text by tabs and add as a row in the table
row_data = element["text"].split("\t")
html_content += ""
for cell in row_data:
html_content += (
f'{cell}'
)
html_content += ""
else:
# Close the TOC table if we encounter non-TOC content
if in_toc_table:
html_content += ""
in_toc_table = False

# Handle numbered/bullet list
if numbered_list:  # Check if the paragraph belongs to a list
if not in_list:
# Set the list type based on the numbering format
list_format = numbered_list["format"]
if list_format == "decimal":
list_type = "1"  # Decimal list
elif list_format == "lowerLetter":
list_type = "a"  # Lowercase letters
elif list_format == "upperLetter":
list_type = "A"  # Uppercase letters
else:
list_type = ""  # Default case

html_content += f'[list]'  # Start a new ordered list
in_list = True

# Indent the list item based on its level
indent = numbered_list["level"] * 20
html_content += (
f'[*]'
f'{element["text"]}'
)
else:  # Handle non-list paragraphs
if in_list:
html_content += "[/list]"  # Close the list if we were in one
in_list = False

# Render the paragraph with alignment
alignment = element.get(
"alignment", "left"
)  # Default to 'left' if missing
style = element.get(
"style", "Normal"
)  # Default style if not available
html_content += (
f'
'
f'{element["text"]}
'
)

elif element["type"] == "table":
# Close any open list before starting a new table
if in_list:
html_content += ""
in_list = False

# Render table content
html_content += (
''
)
for row in element["data"]:
html_content += ""
for cell in row:
html_content += (
f'{cell}'
)
html_content += ""
html_content += ""

# Close any remaining open list or table
if in_list:
html_content += ""
if in_toc_table:
html_content += ""

html_content += ""

with open(file_path, "w", encoding="utf-8") as f:
f.write(html_content)

if __name__ == "__main__":

docx_file = "/Services Agreement.docx"
elements = extract_elements(docx_file)
save_elements_to_json(elements, "D://extracted_content.json")
save_elements_to_html(elements, "D://extracted_content.html")

[/code]
Я ожидаю должным образом извлечь нумерации, пуль и стили  

Подробнее здесь: [url]https://stackoverflow.com/questions/79109719/how-can-i-extract-numbered-lists-bullets-and-alignment-of-content-if-docx-file-u[/url]

Ответить Пред. тема След. тема

1 сообщение • Страница 1 из 1

Быстрый ответ

Заголовок:

Имя пользователя:

Изменение регистра текста:

Смайлики

Ещё смайлики…

К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми. Можно прикреплять файлы, перетаскивая их в окно сообщения.

Максимально разрешённый размер вложения: 15 МБ.

Имя файла:

Комментарий к файлу:

Имя файла	Комментарий к файлу	Размер	Статус

Похожие темы

Ответы

Просмотры

Последнее сообщение

Как преобразовать строку с новой линией и вкладками в вложенные пули в Word Docx с помощью Python-Docx [дублировать]

Последнее сообщение Anonymous « 25 фев 2025, 10:50
Добавлено в форуме Python

Anonymous » 25 фев 2025, 10:50 » в форуме Python

Преобразовать сгенерированный текст Markdown LLM в подходящую пулю /суб-буллет. Чтобы преобразовать сгенерированный LLM многобуллет и суббаллет в пулю слов. Нижняя суббаллет. Руководство по применению в стиле блоков (квадратная пуля или целая цирле)...

0 Ответы

16 Просмотры

Последнее сообщение Anonymous
25 фев 2025, 10:50
Как я могу использовать для цикла для итерации через пронумерованные переменные?

Последнее сообщение Anonymous « 29 янв 2025, 08:25
Добавлено в форуме Python

Anonymous » 29 янв 2025, 08:25 » в форуме Python

У меня есть 20 переменных, которые называются line1 , line2 , line3 , ..., до line20 .
Есть ли способ в Python использовать для цикл, чтобы ссылаться на каждую из этих переменных в последовательности, то есть итерации через них? Что -то для этого:...

0 Ответы

11 Просмотры

Последнее сообщение Anonymous
29 янв 2025, 08:25
Почему только ровные пронумерованные порты выбираются из диапазона эфемерных портов

Последнее сообщение Anonymous « 09 июл 2025, 12:12
Добавлено в форуме JAVA

Anonymous » 09 июл 2025, 12:12 » в форуме JAVA

У меня есть сервер linux , на котором работает приложение java . Это приложение превращает много исходящих TCP -соединений к конкретному розетку. Когда я получаю tcpdump , я вижу, что только даже пронумерованные номера портов принимаются в качестве...

0 Ответы

2 Просмотры

Последнее сообщение Anonymous
09 июл 2025, 12:12
Как я могу заставить списки работать внутри функций, когда ошибка вызывает списки, не подлежащие подписке [закрыто]

Последнее сообщение Anonymous « 08 дек 2023, 10:15
Добавлено в форуме Python

Anonymous » 08 дек 2023, 10:15 » в форуме Python

Когда я запускаю свою программу, она раздражается:
Значение
= позиция + int(hazards ) ~~~~~~~^^^ TypeError: объект 'int' не подлежит подписке Как бы я сделал, чтобы список опасностей работал внутри функции trap?

def main(): # Настраивать длина...

0 Ответы

70 Просмотры

Последнее сообщение Anonymous
08 дек 2023, 10:15
Невозможно пройти проверку, потому что по какой -то причине списки, которые заполняют раскрывающиеся списки, являются не

Последнее сообщение Anonymous « 03 июл 2025, 18:55
Добавлено в форуме C#

Anonymous » 03 июл 2025, 18:55 » в форуме C#

liledingcontroller.cs:
public IActionResult Create()
{
CreateListingViewModel model = new()
{
AllCities = new SelectList(_addressRepo.AllCities()),
AllItems = new SelectList(_itemRepo.FindAll(), Id , Name )
};
return View(model);
}

public...

0 Ответы

11 Просмотры

Последнее сообщение Anonymous
03 июл 2025, 18:55

Вернуться в «Python»