Рекомендации по диете с помощью анализатора отчетов по крови (агент AI)

Рекомендации по диете с помощью анализатора отчетов по крови (агент AI) ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Цитата

Сообщение Anonymous » 11 фев 2026, 22:55

Я пытаюсь создать агент для этого конкретного сценария: «Пользователь должен загрузить отчет о крови на платформу, и на основе отчета агент должен рекомендовать выбор продуктов питания, а затем заказать их на Swiggy/Zomato».
Я пытаюсь заставить агента сначала анализировать отчет о крови, конвертировать подробные данные в формат JSON, а затем передавать подробности в LLM (только те тесты, которые выходят за пределы референсного диапазона). Но я не могу правильно разобрать весь PDF-файл. Ниже приведен код:
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import re
import json

def extract_text(file_path: str) -> str:
    """Return the text content of a PDF or image report.

    PDFs are read with ``pdfplumber`` first; if no page yields any text,
    every page is rendered to an image and run through Tesseract OCR.
    ``.png`` / ``.jpg`` / ``.jpeg`` files go straight to OCR.

    Args:
        file_path: Path to the report; the extension (case-insensitive)
            selects the extraction strategy.

    Returns:
        The extracted text, or ``""`` when the extension is not supported.
    """
    suffix_source = file_path.lower()
    text = ""

    if suffix_source.endswith(".pdf"):
        with pdfplumber.open(file_path) as pdf:
            page_texts = (page.extract_text() for page in pdf.pages)
            # Pages without a text layer yield None/"" and are skipped.
            text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)

        # No text layer anywhere in the document -> fall back to OCR.
        if not text.strip():
            text += "".join(
                pytesseract.image_to_string(img)
                for img in convert_from_path(file_path)
            )

    elif suffix_source.endswith((".png", ".jpg", ".jpeg")):
        # Context manager closes the underlying file handle after OCR.
        with Image.open(file_path) as image:
            text = pytesseract.image_to_string(image)

    return text

def clean_text(raw_text: str) -> str:
    """Strip report boilerplate from extracted text.

    The substitutions run in a fixed order: repeated page headers, page
    counters, doctor signature blocks, the electronic-authentication
    line, hyphenated letter-only tokens, and finally blank lines.

    Args:
        raw_text: Text as returned by the extraction step.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # (pattern, flags) pairs, applied in order; each match is deleted.
    removals = (
        # Repeated header; DOTALL because it spans several lines.
        (r"Scan QR code to check.*?LABORATORY TEST REPORT", re.DOTALL),
        # Page counters.
        (r"Page \d+ of \d+", 0),
        # Doctor signature blocks (single-line only: no DOTALL here).
        (r"Dr\..*?MD Path", 0),
        # Authentication line, up to the end of that line.
        (r"This is an Electronically Authenticated Report.*", 0),
        # Hyphenated letter-only tokens, meant to drop reversed OCR words.
        # NOTE(review): this also removes legitimate hyphenated names
        # (e.g. "Non-HDL"); confirm that is acceptable for the reports used.
        (r"\b[a-zA-Z]+-[a-zA-Z]+\b", 0),
    )
    for pattern, flags in removals:
        raw_text = re.sub(pattern, "", raw_text, flags=flags)

    # Collapse runs of blank / whitespace-only lines into one newline.
    raw_text = re.sub(r"\n\s*\n", "\n", raw_text)

    return raw_text.strip()

def extract_patient_info(text: str) -> dict:
    """Pull the patient name and sex/age field out of report text.

    Each field is taken from the first ``Label : value`` occurrence; the
    value runs to the end of that line.

    Args:
        text: Report text to search.

    Returns:
        A dict with keys ``"name"`` and ``"sex_age"``; a value is ``None``
        when its label is not found.
    """
    labels = {
        "name": r"Name\s*:\s*(.+)",
        "sex_age": r"Sex/Age\s*:\s*(.+)",
    }

    patient_info = {}
    for key, pattern in labels.items():
        found = re.search(pattern, text)
        patient_info[key] = found.group(1).strip() if found else None

    return patient_info

def extract_biomarkers(text: str) -> list:
    """Parse biomarker rows out of report text, one line at a time.

    A line is accepted when it starts with a test name followed by an
    optional ``H``/``L`` flag and a numeric value; unit and reference
    range are optional.

    Args:
        text: Cleaned report text with one test result per line.

    Returns:
        A list of dicts with keys ``"name"``, ``"value"`` (float),
        ``"unit"``, ``"reference_range"`` and ``"flag"``. ``"unit"`` and
        ``"reference_range"`` are ``None`` when absent; ``"flag"`` is
        ``""`` when the line carries no flag.
    """
    # The group names had been stripped from the pattern (leaving the
    # invalid "(?P[...") and are restored here from the group() calls below.
    # NOTE(review): reference ranges written with a comparator such as
    # "< 200" are not captured by the range group - confirm whether the
    # original pattern handled them.
    pattern = re.compile(
        r"^(?P<name>[A-Za-z0-9\(\)\-\s\/]+?)\s+"
        r"(?P<flag>[HL]?)\s*"
        r"(?P<value>\d+\.?\d*)\s*"
        r"(?P<unit>[a-zA-Z\/%\-\+\(\)]+)?\s*"
        r"(?P<range>\d+\.?\d*\s*-\s*\d+\.?\d*|\s*\d+\.?\d*)?"
    )

    biomarkers = []
    for line in text.split("\n"):
        match = pattern.search(line)
        if not match:
            continue
        biomarkers.append(
            {
                "name": match.group("name").strip(),
                "value": float(match.group("value")),
                "unit": match.group("unit"),
                "reference_range": match.group("range"),
                "flag": match.group("flag"),
            }
        )

    return biomarkers

def compute_status(biomarkers):

for marker in biomarkers:

if marker["flag"] == "H":
marker["status"] = "high"
continue

if marker["flag"] == "L":
marker["status"] = "low"
continue

ref = marker["reference_range"]

if not ref:
marker["status"] = "unknown"
continue

try:
value = marker["value"]

if "-" in ref:
low, high = map(float, re.findall(r"\d+\.?\d*", ref))
if value < low:
marker["status"] = "low"
elif value > high:
marker["status"] = "high"
else:
marker["status"] = "normal"

elif "" in ref:
limit = float(re.findall(r"\d+\.?\d*", ref)[0])
marker["status"] = "low" if value

Подробнее здесь: https://stackoverflow.com/questions/79887680/diet-recommendation-via-blood-report-analyzer-ai-agent

1770839757

Anonymous

Я пытаюсь создать агент для этого конкретного сценария: «Пользователь должен загрузить отчет о крови на платформу, и на основе отчета агент должен рекомендовать выбор продуктов питания, а затем заказать их на Swiggy/Zomato».
Я пытаюсь заставить агента сначала анализировать отчет о крови, конвертировать подробные данные в формат JSON, а затем передавать подробности в LLM (только те тесты, которые выходят за пределы референсного диапазона). Но я не могу правильно разобрать весь PDF-файл. Ниже приведен код:
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import re
import json

def extract_text(file_path: str) -> str:
    """Return the text content of a PDF or image report.

    PDFs are read with ``pdfplumber`` first; if no page yields any text,
    every page is rendered to an image and run through Tesseract OCR.
    ``.png`` / ``.jpg`` / ``.jpeg`` files go straight to OCR.

    Args:
        file_path: Path to the report; the extension (case-insensitive)
            selects the extraction strategy.

    Returns:
        The extracted text, or ``""`` when the extension is not supported.
    """
    suffix_source = file_path.lower()
    text = ""

    if suffix_source.endswith(".pdf"):
        with pdfplumber.open(file_path) as pdf:
            page_texts = (page.extract_text() for page in pdf.pages)
            # Pages without a text layer yield None/"" and are skipped.
            text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)

        # No text layer anywhere in the document -> fall back to OCR.
        if not text.strip():
            text += "".join(
                pytesseract.image_to_string(img)
                for img in convert_from_path(file_path)
            )

    elif suffix_source.endswith((".png", ".jpg", ".jpeg")):
        # Context manager closes the underlying file handle after OCR.
        with Image.open(file_path) as image:
            text = pytesseract.image_to_string(image)

    return text

def clean_text(raw_text: str) -> str:
    """Strip report boilerplate from extracted text.

    The substitutions run in a fixed order: repeated page headers, page
    counters, doctor signature blocks, the electronic-authentication
    line, hyphenated letter-only tokens, and finally blank lines.

    Args:
        raw_text: Text as returned by the extraction step.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # (pattern, flags) pairs, applied in order; each match is deleted.
    removals = (
        # Repeated header; DOTALL because it spans several lines.
        (r"Scan QR code to check.*?LABORATORY TEST REPORT", re.DOTALL),
        # Page counters.
        (r"Page \d+ of \d+", 0),
        # Doctor signature blocks (single-line only: no DOTALL here).
        (r"Dr\..*?MD Path", 0),
        # Authentication line, up to the end of that line.
        (r"This is an Electronically Authenticated Report.*", 0),
        # Hyphenated letter-only tokens, meant to drop reversed OCR words.
        # NOTE(review): this also removes legitimate hyphenated names
        # (e.g. "Non-HDL"); confirm that is acceptable for the reports used.
        (r"\b[a-zA-Z]+-[a-zA-Z]+\b", 0),
    )
    for pattern, flags in removals:
        raw_text = re.sub(pattern, "", raw_text, flags=flags)

    # Collapse runs of blank / whitespace-only lines into one newline.
    raw_text = re.sub(r"\n\s*\n", "\n", raw_text)

    return raw_text.strip()

def extract_patient_info(text: str) -> dict:
    """Pull the patient name and sex/age field out of report text.

    Each field is taken from the first ``Label : value`` occurrence; the
    value runs to the end of that line.

    Args:
        text: Report text to search.

    Returns:
        A dict with keys ``"name"`` and ``"sex_age"``; a value is ``None``
        when its label is not found.
    """
    labels = {
        "name": r"Name\s*:\s*(.+)",
        "sex_age": r"Sex/Age\s*:\s*(.+)",
    }

    patient_info = {}
    for key, pattern in labels.items():
        found = re.search(pattern, text)
        patient_info[key] = found.group(1).strip() if found else None

    return patient_info

def extract_biomarkers(text: str) -> list:
    """Parse biomarker rows out of report text, one line at a time.

    A line is accepted when it starts with a test name followed by an
    optional ``H``/``L`` flag and a numeric value; unit and reference
    range are optional.

    Args:
        text: Cleaned report text with one test result per line.

    Returns:
        A list of dicts with keys ``"name"``, ``"value"`` (float),
        ``"unit"``, ``"reference_range"`` and ``"flag"``. ``"unit"`` and
        ``"reference_range"`` are ``None`` when absent; ``"flag"`` is
        ``""`` when the line carries no flag.
    """
    # The group names had been stripped from the pattern (leaving the
    # invalid "(?P[...") and are restored here from the group() calls below.
    # NOTE(review): reference ranges written with a comparator such as
    # "< 200" are not captured by the range group - confirm whether the
    # original pattern handled them.
    pattern = re.compile(
        r"^(?P<name>[A-Za-z0-9\(\)\-\s\/]+?)\s+"
        r"(?P<flag>[HL]?)\s*"
        r"(?P<value>\d+\.?\d*)\s*"
        r"(?P<unit>[a-zA-Z\/%\-\+\(\)]+)?\s*"
        r"(?P<range>\d+\.?\d*\s*-\s*\d+\.?\d*|\s*\d+\.?\d*)?"
    )

    biomarkers = []
    for line in text.split("\n"):
        match = pattern.search(line)
        if not match:
            continue
        biomarkers.append(
            {
                "name": match.group("name").strip(),
                "value": float(match.group("value")),
                "unit": match.group("unit"),
                "reference_range": match.group("range"),
                "flag": match.group("flag"),
            }
        )

    return biomarkers

def compute_status(biomarkers):

for marker in biomarkers:

if marker["flag"] == "H":
marker["status"] = "high"
continue

if marker["flag"] == "L":
marker["status"] = "low"
continue

ref = marker["reference_range"]

if not ref:
marker["status"] = "unknown"
continue

try:
value = marker["value"]

if "-" in ref:
low, high = map(float, re.findall(r"\d+\.?\d*", ref))
if value < low:
marker["status"] = "low"
elif value > high:
marker["status"] = "high"
else:
marker["status"] = "normal"

elif ""  in ref:
limit = float(re.findall(r"\d+\.?\d*", ref)[0])
marker["status"] = "low" if value 

Подробнее здесь: [url]https://stackoverflow.com/questions/79887680/diet-recommendation-via-blood-report-analyzer-ai-agent[/url]