Использование Python для преобразования Markdown в JSON

Использование Python для преобразования Markdown в JSON ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Использование Python для преобразования Markdown в JSON

Цитата

Сообщение Anonymous » 13 янв 2025, 13:21

Во-первых, я не программист. Мне нужен конвейер преобразования Markdown в JSON, потому что я готовлю документы для тонкой настройки (fine-tuning) LLM. Я думал, что смогу использовать GPT-4o, чтобы он написал его за меня, но пока безрезультатно — после четырёх дней подряд, когда я просил GPT генерировать новый код и тестировал его локально в VS Code. Написание сценария извлечения кажется довольно простой задачей, но ничего не получается, поэтому я решил обратиться за помощью сюда. Мне нужно обработать много документов, и сделать это вручную невозможно. У меня есть чёткие требования, сценарий, который работал приемлемо, но не оптимально, тестовый Markdown-файл, ожидаемый вывод в формате JSON и журналы. Вот код на Python, а затем ожидаемый результат в формате JSON и тестовый Markdown-файл.
PYTHON
import json
import re

def markdown_to_json(file_path):
with open(file_path, 'r', encoding='utf-8') as file:
lines = file.readlines()

hierarchy = []
stack = []

log_entries = []

def log(message):
print(message)
log_entries.append(message)

def create_section(title, level):
return {"heading_0" + str(level): title, "content": [], "subsections": []}

def add_list_item(content, parent_list):
item = {"point": content, "sub_points": [], "paragraph": ""}
parent_list.append(item)
return item

previous_list_item = None
in_list = False

for line in lines:
line = line.rstrip()

# Match headers
header_match = re.match(r'^(#{1,6})\s+(.*)', line)
if header_match:
level = len(header_match.group(1))
title = header_match.group(2)

# Create a new section
new_section = create_section(title, level)

# Adjust stack for heading levels
while stack and stack[-1].get("level", 0) >= level:
stack.pop()

if stack:
# Add as a sibling if same level
if stack[-1]["level"] == level - 1:
stack[-1]["section"]["subsections"].append(new_section)
else:
# Handle incorrect nesting
while stack and stack[-1]["level"] >= level:
stack.pop()
if stack:
stack[-1]["section"]["subsections"].append(new_section)
else:
hierarchy.append(new_section)
else:
hierarchy.append(new_section)

stack.append({"level": level, "section": new_section})
previous_list_item = None # Reset the previous list item
in_list = False # Reset list tracking

log(f"[LOG] Added Header: {{'level': {level}, 'title': '{title}'}}")
continue

# Match unordered list items
list_match = re.match(r'^(\s*)[-]\s+(.*)', line)
if list_match:
indent_level = len(list_match.group(1))
content = list_match.group(2)

while stack and "list" in stack[-1] and stack[-1]["indent"] >= indent_level:
popped = stack.pop()
log(f"[LOG] Popping Stack: {popped}")

if stack and "list" in stack[-1] and stack[-1]["indent"] < indent_level:
parent_list = stack[-1]["list"][-1]["sub_points"]
new_item = add_list_item(content, parent_list)
stack.append({"list": parent_list, "indent": indent_level, "section": stack[-1].get("section"), "level": stack[-1].get("level", 0)})
previous_list_item = new_item
log(f"[LOG] Added Sublist Item: {{'content': '{content}', 'indent_level': {indent_level}}}")
else:
if stack:
parent_section = stack[-1]["section"]
new_list = []
parent_section["content"].append({"list_type": "unordered", "items": new_list})
new_item = add_list_item(content, new_list)
stack.append({"list": new_list, "indent": indent_level, "section": parent_section, "level": stack[-1].get("level", 0)})
previous_list_item = new_item
log(f"[LOG] Added List Item: {{'content': '{content}', 'indent_level': {indent_level}}}")
in_list = True
continue

# Detect continuation of a list item
continuation_match = re.match(r'^(\s+)(.*)', line)
if in_list and continuation_match:
continuation_indent = len(continuation_match.group(1))
continuation_content = continuation_match.group(2)

if stack:
current_indent = stack[-1].get("indent", 0)
if continuation_indent

Подробнее здесь: https://stackoverflow.com/questions/79351843/using-python-to-convert-markdown-to-json

1736763696

Anonymous

Во-первых, я не программист. Мне нужен конвейер преобразования Markdown в JSON, потому что я готовлю документы для тонкой настройки (fine-tuning) LLM. Я думал, что смогу использовать GPT-4o, чтобы он написал его за меня, но пока безрезультатно — после четырёх дней подряд, когда я просил GPT генерировать новый код и тестировал его локально в VS Code. Написание сценария извлечения кажется довольно простой задачей, но ничего не получается, поэтому я решил обратиться за помощью сюда. Мне нужно обработать много документов, и сделать это вручную невозможно. У меня есть чёткие требования, сценарий, который работал приемлемо, но не оптимально, тестовый Markdown-файл, ожидаемый вывод в формате JSON и журналы. Вот код на Python, а затем ожидаемый результат в формате JSON и тестовый Markdown-файл.
PYTHON
import json
import re

def markdown_to_json(file_path):
with open(file_path, 'r', encoding='utf-8') as file:
lines = file.readlines()

hierarchy = []
stack = []

log_entries = []

def log(message):
print(message)
log_entries.append(message)

def create_section(title, level):
return {"heading_0" + str(level): title, "content": [], "subsections": []}

def add_list_item(content, parent_list):
item = {"point": content, "sub_points": [], "paragraph": ""}
parent_list.append(item)
return item

previous_list_item = None
in_list = False

for line in lines:
line = line.rstrip()

# Match headers
header_match = re.match(r'^(#{1,6})\s+(.*)', line)
if header_match:
level = len(header_match.group(1))
title = header_match.group(2)

# Create a new section
new_section = create_section(title, level)

# Adjust stack for heading levels
while stack and stack[-1].get("level", 0) >= level:
stack.pop()

if stack:
# Add as a sibling if same level
if stack[-1]["level"] == level - 1:
stack[-1]["section"]["subsections"].append(new_section)
else:
# Handle incorrect nesting
while stack and stack[-1]["level"] >= level:
stack.pop()
if stack:
stack[-1]["section"]["subsections"].append(new_section)
else:
hierarchy.append(new_section)
else:
hierarchy.append(new_section)

stack.append({"level": level, "section": new_section})
previous_list_item = None  # Reset the previous list item
in_list = False  # Reset list tracking

log(f"[LOG] Added Header: {{'level': {level}, 'title': '{title}'}}")
continue

# Match unordered list items
list_match = re.match(r'^(\s*)[-]\s+(.*)', line)
if list_match:
indent_level = len(list_match.group(1))
content = list_match.group(2)

while stack and "list" in stack[-1] and stack[-1]["indent"] >= indent_level:
popped = stack.pop()
log(f"[LOG] Popping Stack: {popped}")

if stack and "list" in stack[-1] and stack[-1]["indent"] <  indent_level:
parent_list = stack[-1]["list"][-1]["sub_points"]
new_item = add_list_item(content, parent_list)
stack.append({"list": parent_list, "indent": indent_level, "section": stack[-1].get("section"), "level": stack[-1].get("level", 0)})
previous_list_item = new_item
log(f"[LOG] Added Sublist Item: {{'content': '{content}', 'indent_level': {indent_level}}}")
else:
if stack:
parent_section = stack[-1]["section"]
new_list = []
parent_section["content"].append({"list_type": "unordered", "items": new_list})
new_item = add_list_item(content, new_list)
stack.append({"list": new_list, "indent": indent_level, "section": parent_section, "level": stack[-1].get("level", 0)})
previous_list_item = new_item
log(f"[LOG] Added List Item: {{'content': '{content}', 'indent_level': {indent_level}}}")
in_list = True
continue

# Detect continuation of a list item
continuation_match = re.match(r'^(\s+)(.*)', line)
if in_list and continuation_match:
continuation_indent = len(continuation_match.group(1))
continuation_content = continuation_match.group(2)

if stack:
current_indent = stack[-1].get("indent", 0)
if continuation_indent 

Подробнее здесь: [url]https://stackoverflow.com/questions/79351843/using-python-to-convert-markdown-to-json[/url]