Я извлекаю содержимое файла DOCX с помощью библиотеки python-docx и генерирую из извлечённого содержимого HTML-файл. Однако мне не удаётся добиться правильного выравнивания содержимого, а также корректно извлечь нумерованные списки, маркированные списки и стили. Мой текущий код в целом работает, я привожу его ниже. Пожалуйста, помогите разобраться, как правильно извлекать нумерованные и маркированные списки и применять стили при создании HTML. Стили я извлечь могу, но в сгенерированном HTML они применяются неправильно. Основные проблемы — обработка нумерованных и маркированных списков и выравнивание текста.
Код:
import html
import json
import re

from docx import Document
from docx.oxml.ns import qn
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
def extract_elements(docx_file):
    """Collect the body's paragraphs and tables in document order.

    Args:
        docx_file: Path or file-like object accepted by ``Document``.

    Returns:
        A list of dicts: the paragraph dicts produced by ``_get_paragraph``
        and ``{"type": "table", "data": rows}`` dicts for tables. Items for
        which the helper returned a falsy value are left out.
    """
    document = Document(docx_file)
    collected = []
    for node in document.element.body:
        if isinstance(node, CT_P):
            item = _get_paragraph(node, document)
        elif isinstance(node, CT_Tbl):
            table_rows = _get_table(node, document)
            item = {"type": "table", "data": table_rows} if table_rows else None
        else:
            # Any other body child is ignored.
            continue
        if item:
            collected.append(item)
    return collected
def _get_paragraph(element, document):
    """Describe one body paragraph as a JSON-serialisable dict.

    Args:
        element: The ``w:p`` XML element taken from the document body.
        document: The python-docx ``Document`` that owns ``element``.

    Returns:
        A dict with the keys ``type``, ``text``, ``style``, ``alignment`` and
        ``numberedlist``, or ``None`` when extraction raised an error (the
        error is printed, not propagated).
    """
    # Imported here so the block is self-contained.
    from docx.text.paragraph import Paragraph

    try:
        # Wrap the element directly. The previous implementation searched
        # document.paragraphs (a list rebuilt on every access) once per body
        # element, which made extraction quadratic in the paragraph count.
        paragraph = Paragraph(element, document)
        return {
            "type": "paragraph",
            "text": paragraph.text.strip(),
            "style": paragraph.style.name if paragraph.style else "",
            "alignment": _get_alignment(paragraph),
            "numberedlist": _get_numbered_list(paragraph),
        }
    except Exception as e:
        # Best effort: one bad paragraph must not abort the whole export.
        print(f"Error in _get_paragraph: {e}")
        return None
def _get_alignment(paragraph):
    """Return the paragraph's effective alignment as a CSS keyword.

    A paragraph without directly applied alignment reports ``None``; in that
    case the value is looked up on the paragraph's style and then on each
    ``base_style`` in turn, so alignment defined by a style is not lost.

    Args:
        paragraph: Object exposing ``alignment`` and, optionally, ``style``
            (a style exposes ``paragraph_format.alignment`` and
            ``base_style``).

    Returns:
        ``"left"``, ``"center"``, ``"right"`` or ``"justify"``. Values that
        are not in the map, or that are never defined, fall back to
        ``"left"``.
    """
    alignment_map = {0: "left", 1: "center", 2: "right", 3: "justify"}

    alignment = paragraph.alignment
    style = getattr(paragraph, "style", None)
    # Walk the style inheritance chain until some level defines alignment.
    while alignment is None and style is not None:
        paragraph_format = getattr(style, "paragraph_format", None)
        alignment = getattr(paragraph_format, "alignment", None)
        style = getattr(style, "base_style", None)

    return alignment_map.get(alignment, "left")
def _find_num_pr(element):
    """Return the ``w:numPr`` under ``element``'s ``w:pPr``, or ``None``."""
    p_pr = element.find(qn("w:pPr"))
    if p_pr is None:
        return None
    return p_pr.find(qn("w:numPr"))


def _get_numbered_list(paragraph):
    """Detect bullet/numbering on a paragraph and describe its format.

    The numbering properties are read from the paragraph itself and, when
    absent there, from the paragraph's style and its base styles.

    Args:
        paragraph: A python-docx ``Paragraph``.

    Returns:
        The dict produced by ``_get_number_format`` (``numId``, ``level``,
        ``format``), or ``None`` when the paragraph is not a list item or an
        error occurred (the error is printed, not propagated).
    """
    try:
        num_pr = _find_num_pr(paragraph._element)

        # Fall back to the style chain when the paragraph has no numPr.
        style = paragraph.style
        while num_pr is None and style is not None:
            num_pr = _find_num_pr(style.element)
            style = style.base_style

        if num_pr is None:
            return None

        num_id = num_pr.find(qn("w:numId"))
        if num_id is None:
            return None
        num_id_val = num_id.get(qn("w:val"))
        # NOTE(review): numId 0 is understood to mean "numbering removed";
        # confirm against the WordprocessingML reference.
        if num_id_val == "0":
            return None

        # A missing ilvl is treated as the top level instead of discarding
        # the paragraph's numbering altogether.
        ilvl = num_pr.find(qn("w:ilvl"))
        ilvl_val = "0" if ilvl is None else ilvl.get(qn("w:val"))

        return _get_number_format(paragraph.part, num_id_val, ilvl_val)
    except Exception as e:
        print(f"Error in _get_numbered_list: {e}")
        return None
def _get_number_format(doc_part, numId, ilvl):
    """Look up the number format of one list level in the numbering part.

    Args:
        doc_part: Part exposing ``numbering_part``.
        numId: Value of the paragraph's ``w:numId``.
        ilvl: Value of the paragraph's ``w:ilvl``.

    Returns:
        ``{"numId": ..., "level": int(ilvl), "format": ...}`` when both the
        abstract numbering reference and the level's ``w:numFmt`` are found,
        otherwise ``None``. Errors are printed and also yield ``None``.
    """
    try:
        numbering_part = doc_part.numbering_part

        # Step 1: w:num -> the abstract numbering definition it points to.
        abstract_refs = numbering_part.element.xpath(
            f".//w:num[@w:numId='{numId}']//w:abstractNumId"
        )
        if not abstract_refs:
            return None
        abstract_id = abstract_refs[0].get(qn("w:val"))

        # Step 2: the numFmt of the requested level in that definition.
        formats = numbering_part.element.xpath(
            f".//w:abstractNum[@w:abstractNumId='{abstract_id}']//w:lvl[@w:ilvl='{ilvl}']//w:numFmt"
        )
        if not formats:
            return None

        return {
            "numId": numId,
            "level": int(ilvl),
            "format": formats[0].get(qn("w:val")),
        }
    except Exception as e:
        print(f"Error in _get_number_format: {e}")
        return None
def _get_table(element, document):
    """Return the extracted rows of the document table wrapping ``element``.

    Args:
        element: A ``w:tbl`` XML element.
        document: Document whose ``tables`` are searched for ``element``.

    Returns:
        The result of ``_extract_table_content`` for the first matching
        table, or ``None`` when no table in ``document.tables`` matches.
    """
    matching_table = next(
        (candidate for candidate in document.tables if candidate._element == element),
        None,
    )
    if matching_table is None:
        return None
    return _extract_table_content(matching_table)
def _extract_table_content(table):
    """Return the table as a list of equally long rows of stripped cell text.

    A cell object that occurs more than once in a row is emitted only once.
    Repeats are recognised by object identity rather than by text, so
    distinct cells that happen to hold the same text (including several
    blank cells) keep their positions.

    Args:
        table: Object exposing ``rows``; each row exposes ``cells`` and each
            cell exposes ``text``.

    Returns:
        A list of rows, each a list of strings padded on the right with
        ``""`` up to the widest row. A table without rows yields ``[]``.
    """
    rows = []
    # default=0 keeps a table without rows from raising ValueError.
    max_cols = max((len(row.cells) for row in table.rows), default=0)
    for row in table.rows:
        row_data = []
        seen = set()
        for cell in row.cells:
            # NOTE(review): assumes python-docx repeats the same cell (same
            # underlying element) for each column of a merged cell - confirm.
            identity = id(getattr(cell, "_element", cell))
            if identity in seen:
                continue
            seen.add(identity)
            row_data.append(cell.text.strip())
        # Pad short rows so every row has the same number of columns.
        row_data.extend([""] * (max_cols - len(row_data)))
        rows.append(row_data)
    return rows
def save_elements_to_json(elements, file_path):
    """Write ``elements`` to ``file_path`` as indented UTF-8 JSON.

    Args:
        elements: JSON-serialisable data to store.
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, "w", encoding="utf-8") as json_file:
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        json.dump(
            elements,
            json_file,
            indent=4,
            ensure_ascii=False,
        )
# HTML <ol type="..."> values for the DOCX number formats that have one.
_OL_TYPES = {
    "decimal": "1",
    "lowerLetter": "a",
    "upperLetter": "A",
    "lowerRoman": "i",
    "upperRoman": "I",
}


def _list_tags(list_format):
    """Return the (opening, closing) HTML tags for a DOCX number format."""
    if list_format == "bullet":
        return "<ul>", "</ul>"
    ol_type = _OL_TYPES.get(list_format)
    if ol_type is None:
        # Unknown formats still render as an ordered list, without a type.
        return "<ol>", "</ol>"
    return f'<ol type="{ol_type}">', "</ol>"


def _close_lists(parts, open_lists):
    """Close every open list (and its last item), innermost first."""
    while open_lists:
        closing_tag, _ = open_lists.pop()
        parts.append("</li>" + closing_tag)


def _append_list_item(parts, open_lists, numbering, text):
    """Emit one list item, opening or closing nested lists as needed."""
    # Go at most one level deeper than what is currently open so the
    # generated markup stays valid when the document skips levels.
    depth = max(1, min(numbering["level"] + 1, len(open_lists) + 1))
    key = (numbering.get("numId"), numbering["format"])

    while len(open_lists) > depth:
        closing_tag, _ = open_lists.pop()
        parts.append("</li>" + closing_tag)
    # A different list (or format) at the same depth starts a new list.
    if len(open_lists) == depth and open_lists[-1][1] != key:
        closing_tag, _ = open_lists.pop()
        parts.append("</li>" + closing_tag)

    if len(open_lists) == depth:
        parts.append("</li>")  # Sibling item: close the previous one.
    else:
        opening_tag, closing_tag = _list_tags(numbering["format"])
        parts.append(opening_tag)
        open_lists.append((closing_tag, key))
    # The <li> stays open so that a nested list can be placed inside it.
    parts.append(f"<li>{html.escape(text)}")


def _render_paragraph(element):
    """Render a non-list paragraph as <p>, or <h1>-<h6> for heading styles."""
    style = element.get("style", "Normal") or "Normal"
    alignment = element.get("alignment", "left") or "left"
    heading = re.fullmatch(r"Heading ([1-6])", style)
    tag = f"h{heading.group(1)}" if heading else "p"
    # "Heading 1" -> "heading-1": a class name must not contain spaces.
    css_class = html.escape("-".join(style.lower().split()))
    return (
        f'<{tag} class="{css_class}" '
        f'style="text-align: {html.escape(alignment)};">'
        f'{html.escape(element["text"])}</{tag}>'
    )


def _render_table(rows):
    """Render extracted table rows as a bordered HTML table."""
    body = "".join(
        "<tr>"
        + "".join(f"<td>{html.escape(str(cell))}</td>" for cell in row)
        + "</tr>"
        for row in rows
    )
    return f'<table border="1">{body}</table>'


def save_elements_to_html(elements, file_path):
    """Render extracted elements as an HTML document and write it to disk.

    Paragraphs styled ``toc 1`` become rows of a Clause/Heading/Page table
    (cells are split on tabs). Paragraphs carrying ``numberedlist`` data
    become items of nested ``<ol>``/``<ul>`` lists. Other paragraphs become
    ``<p>`` or heading tags with their alignment applied, and table elements
    become ``<table>``. All text is HTML-escaped.

    Args:
        elements: Iterable of dicts with ``type`` equal to ``"paragraph"``
            (keys ``text``, ``style``, ``alignment``, ``numberedlist``) or
            ``"table"`` (key ``data``: a list of rows of cell text).
        file_path: Destination path; an existing file is overwritten.
    """
    parts = ['<!DOCTYPE html><html><head><meta charset="utf-8"></head><body>']
    open_lists = []  # Stack of (closing tag, (numId, format)) per open list.
    in_toc_table = False

    for element in elements:
        kind = element["type"]
        is_toc = kind == "paragraph" and element.get("style") == "toc 1"

        # Any non-TOC element ends the TOC table.
        if in_toc_table and not is_toc:
            parts.append("</table>")
            in_toc_table = False

        if is_toc:
            _close_lists(parts, open_lists)
            if not in_toc_table:
                parts.append(
                    '<table class="toc" border="1">'
                    "<tr><th>Clause</th><th>Heading</th><th>Page</th></tr>"
                )
                in_toc_table = True
            cells = "".join(
                f"<td>{html.escape(cell)}</td>"
                for cell in element["text"].split("\t")
            )
            parts.append(f"<tr>{cells}</tr>")
        elif kind == "paragraph":
            numbering = element.get("numberedlist")
            if numbering:
                _append_list_item(parts, open_lists, numbering, element["text"])
            else:
                _close_lists(parts, open_lists)
                parts.append(_render_paragraph(element))
        elif kind == "table":
            _close_lists(parts, open_lists)
            parts.append(_render_table(element["data"]))

    # Close whatever is still open at the end of the document.
    _close_lists(parts, open_lists)
    if in_toc_table:
        parts.append("</table>")
    parts.append("</body></html>")

    with open(file_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))
if __name__ == "__main__":
    # Extract once, then write the same elements in both output formats.
    extracted = extract_elements("/Services Agreement.docx")
    save_elements_to_json(extracted, "D://extracted_content.json")
    save_elements_to_html(extracted, "D://extracted_content.html")
Я ожидаю, что нумерация, маркеры списков и стили будут извлекаться правильно.
Я могу извлечь содержимое файла DOCX с помощью Python's Python-Docx, и я генерирую HTML-файл из извлеченного контента. Тем не менее, я изо всех сил пытаюсь убедиться, что содержимое правильно выровнено, и у меня также возникают трудности с извлечением пронумерованных списков, пуль и стилей правильно. Мой текущий код функционирует хорошо, и я вставлю его ниже. Пожалуйста, помогите мне выяснить, как правильно извлечь пронумерованные списки, пули и применять стили при создании HTML. Хотя я могу извлечь стили, они не применяются правильно в сгенерированном HTML. Мои основные проблемы заключаются в обработке пронумерованных списков и пуль и выравнивания текста < /p> код: < /p> [code] from docx.oxml.ns import qn import json import re from docx import Document from docx.oxml.table import CT_Tbl from docx.oxml.text.paragraph import CT_P
def extract_elements(docx_file): """Extract all paragraphs and tables in the order they appear.""" document = Document(docx_file) elements = []
for child in document.element.body: if isinstance(child, CT_P): # If it's a paragraph paragraph_data = _get_paragraph(child, document) if paragraph_data: elements.append(paragraph_data)
elif isinstance(child, CT_Tbl): # If it's a table table_data = _get_table(child, document) if table_data: elements.append({"type": "table", "data": table_data})
return elements
def _get_paragraph(element, document): """Extract paragraph text, bullet/numbering style, and format.""" try: # Find the matching paragraph in the document by element reference for paragraph in document.paragraphs: if paragraph._element == element: text = paragraph.text.strip() alignment = _get_alignment(paragraph) # Get paragraph alignment numbered_list = _get_numbered_list(paragraph) # Detect numbered list
except Exception as e: print(f"Error in _get_paragraph: {e}") return None
def _get_alignment(paragraph): """Get alignment of the paragraph.""" alignment_map = {0: "left", 1: "center", 2: "right", 3: "justify"} return alignment_map.get(paragraph.alignment, "left") # Default to left alignment
def _get_numbered_list(paragraph): """Detect whether the paragraph has a bullet/numbered style and retrieve its details.""" try: # Access the XML element directly p = paragraph._element
# Print the entire XML of the paragraph for debugging # print(p.xml) xml_content = p.xml
# Append the XML content to the file with open("D://newxml.xml", "a", encoding="utf-8") as xml_file: # Optional: Add a separator between paragraphs for clarity xml_file.write(f"\n\n{xml_content}\n")
# Accessing the paragraph properties pPr = p.find(qn("w:pPr"))
if pPr is not None: # Try to find numPr in the paragraph properties numPr = pPr.find(qn("w:numPr")) print(f"numPr found: {numPr is not None}") # Debugging output
if numPr is not None: numId = numPr.find(qn("w:numId")) ilvl = numPr.find(qn("w:ilvl"))
if numId is not None and ilvl is not None: numId_val = numId.get(qn("w:val")) ilvl_val = ilvl.get(qn("w:val")) return _get_number_format(paragraph.part, numId_val, ilvl_val) else: print("numPr is None, check paragraph properties.") else: print("pPr is None, check paragraph element.")
return None except Exception as e: print(f"Error in _get_bullet_style: {e}") return None
def _get_number_format(doc_part, numId, ilvl): """Retrieve the numbering format from the DOCX numbering part.""" try: numbering_part = doc_part.numbering_part # Access numbering definitions xpath = f".//w:num[@w:numId='{numId}']//w:abstractNumId" abstract_num = numbering_part.element.xpath(xpath)
if num_fmt: return { "numId": numId, "level": int(ilvl), "format": num_fmt[0].get(qn("w:val")), } return None except Exception as e: print(f"Error in _get_number_format: {e}")
def _get_table(element, document): """Extract table data ensuring correct structure.""" for table in document.tables: if table._element == element: return _extract_table_content(table) return None
def _extract_table_content(table): """Ensure correct table row and column consistency.""" rows = [] max_cols = max(len(row.cells) for row in table.rows) # Get the max columns for row in table.rows: row_data = [] for cell in row.cells: cell_text = cell.text.strip()
# Avoid appending duplicate content in the same row if cell_text not in row_data: row_data.append(cell_text)
# Ensure row has consistent number of columns while len(row_data) < max_cols: row_data.append("") # Pad missing cells with empty strings
rows.append(row_data)
return rows
def save_elements_to_json(elements, file_path): """Save extracted elements to a JSON file.""" with open(file_path, "w", encoding="utf-8") as f: json.dump(elements, f, ensure_ascii=False, indent=4)
def save_elements_to_html(elements, file_path): """Convert extracted elements to properly aligned HTML.""" html_content = "" in_list = False # Track whether we are in a list in_toc_table = False # Track if we're inside a TOC table list_type = "" # Keep track of the list type (decimal, lowerLetter, upperLetter)
for element in elements: if element["type"] == "paragraph": numbered_list = element.get("numberedlist") # Safely get 'numbered_list'
# Handle TOC entries if element["style"] == "toc 1": if not in_toc_table: # Start a new TOC table html_content += '' html_content += ( "ClauseHeadingPage" ) in_toc_table = True # Mark that we are inside the TOC table
# Split the text by tabs and add as a row in the table row_data = element["text"].split("\t") html_content += "" for cell in row_data: html_content += ( f'{cell}' ) html_content += "" else: # Close the TOC table if we encounter non-TOC content if in_toc_table: html_content += "" in_toc_table = False
# Handle numbered/bullet list if numbered_list: # Check if the paragraph belongs to a list if not in_list: # Set the list type based on the numbering format list_format = numbered_list["format"] if list_format == "decimal": list_type = "1" # Decimal list elif list_format == "lowerLetter": list_type = "a" # Lowercase letters elif list_format == "upperLetter": list_type = "A" # Uppercase letters else: list_type = "" # Default case
html_content += f'[list]' # Start a new ordered list in_list = True
# Indent the list item based on its level indent = numbered_list["level"] * 20 html_content += ( f'[*]' f'{element["text"]}' ) else: # Handle non-list paragraphs if in_list: html_content += "[/list]" # Close the list if we were in one in_list = False
# Render the paragraph with alignment alignment = element.get( "alignment", "left" ) # Default to 'left' if missing style = element.get( "style", "Normal" ) # Default style if not available html_content += ( f' ' f'{element["text"]} ' )
elif element["type"] == "table": # Close any open list before starting a new table if in_list: html_content += "" in_list = False
# Render table content html_content += ( '' ) for row in element["data"]: html_content += "" for cell in row: html_content += ( f'{cell}' ) html_content += "" html_content += ""
# Close any remaining open list or table if in_list: html_content += "" if in_toc_table: html_content += ""
html_content += ""
with open(file_path, "w", encoding="utf-8") as f: f.write(html_content)
Преобразовать сгенерированный текст Markdown LLM в подходящую пулю /суб-буллет. Чтобы преобразовать сгенерированный LLM многобуллет и суббаллет в пулю слов. Нижняя суббаллет. Руководство по применению в стиле блоков (квадратная пуля или целая цирле)...
У меня есть 20 переменных с именами line1, line2, line3, ..., line20.
Есть ли в Python способ с помощью цикла for обращаться к каждой из этих переменных по очереди, то есть перебирать их? Что-то вроде этого:...
У меня есть сервер Linux, на котором работает Java-приложение. Это приложение создаёт много исходящих TCP-соединений к определённому сокету. Когда я снимаю tcpdump, я вижу, что только чётные номера портов используются в качестве...
Когда я запускаю свою программу, она выдаёт ошибку:
Значение
= позиция + int(hazards ) ~~~~~~~^^^ TypeError: объект 'int' не поддерживает индексацию ('int' object is not subscriptable). Как сделать так, чтобы список hazards работал внутри функции trap?
liledingcontroller.cs:
// Builds the view model for the Create view, filling both select lists
// from the repositories, and returns the view bound to that model.
public IActionResult Create()
{
    var cities = new SelectList(_addressRepo.AllCities());
    // NOTE(review): Id and Name may have lost their quotes when this snippet
    // was extracted ("Id", "Name") - confirm against the real source.
    var items = new SelectList(_itemRepo.FindAll(), Id, Name);

    var model = new CreateListingViewModel
    {
        AllCities = cities,
        AllItems = items
    };

    return View(model);
}