Spring Batch Tasklet для проверки файлов – требуется обработка исключений — Python

Программы на Python
Ответить Пред. темаСлед. тема
Anonymous
 Spring Batch Tasklet для проверки файлов – требуется обработка исключений

Сообщение Anonymous »

Я работаю над приложением Spring Batch и реализовал собственный Partitioner, чтобы разделить обработку на несколько потоков. У меня возникли проблемы с написанием теста JUnit 4 для моего метода partition. Я создал mock-объекты для DataSource и JdbcTemplate, но не уверен, что проверяю правильные условия и что моя настройка корректна.
Этот код проверяет количество вхождений чего-либо (это может быть что угодно — от записей до событий), на которое ссылается объект dto (объект передачи данных). Вот что делает логика, описанная простыми словами:
Код запрашивает у dto количество вхождений с помощью метода dto.getNumOfOccur().
Если количество вхождений равно нулю (это означает, что вхождений не было), код устанавливает для dto статус «НОВЫЙ», указывая, что все, что проверяется, является новым или не происходило ранее.
Если какое-либо число больше нуля (это означает, что произошло хотя бы одно событие), статус устанавливается на «СУЩЕСТВУЕТ», что указывает на то, что событие не является новым и случалось раньше.
Этот статус устанавливается с помощью метода dto.setNeworexist() с указанием «NEW» или «EXISTS» в качестве
import streamlit as st
import pytesseract
from PIL import Image
import fitz # PyMuPDF
import spacy
import pandas as pd
import re
import os
import io
import matplotlib.pyplot as plt
import seaborn as sns
from fpdf import FPDF

# Resolve the Tesseract executable. An explicit TESSERACT_CMD environment
# variable wins; otherwise the conventional Windows install location is used
# when it actually exists; otherwise pytesseract's own default is left alone
# so that a `tesseract` binary on PATH keeps working on other platforms.
_WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
_tesseract_cmd = os.environ.get('TESSERACT_CMD')
if not _tesseract_cmd and os.path.isfile(_WINDOWS_TESSERACT_CMD):
    _tesseract_cmd = _WINDOWS_TESSERACT_CMD
if _tesseract_cmd:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd

def extract_text_from_image(image_path, lang='eng'):
    """Extract text from an image file using OCR.

    Args:
        image_path: Path of the image to read.
        lang: Tesseract language code passed through to the OCR call.

    Returns:
        The text recognised by ``pytesseract.image_to_string``.
    """
    # The context manager closes the underlying file handle once OCR is done.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img, lang=lang)

def extract_text_from_pdf(pdf_path, lang='eng'):
    """Extract the embedded text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF to read.
        lang: Unused by this function; kept so the signature matches
            ``extract_text_from_image``.

    Returns:
        The page texts concatenated in page order.
    """
    # Opening the document as a context manager releases it even when a page
    # fails to load; join avoids repeated string concatenation.
    with fitz.open(pdf_path) as document:
        return "".join(page.get_text("text") for page in document)

def categorize_text(text):
    """Categorize the extracted text into one type of insurance document.

    Each category owns a small list of keywords; the category whose keywords
    occur most often (case-insensitively, as substrings) wins.

    Args:
        text: The raw text extracted from the document.

    Returns:
        One of "Claim Notice", "Invoice", "Payment Receipt",
        "Tax Certificate", "Renewal Notice" or "Premium Receipt". A category
        is chosen only when its count is strictly greater than every other
        count; ties (including text with no keyword at all) fall back to
        "Premium Receipt".
    """
    keywords_by_category = {
        "Claim Notice": ("account number", "insurance payee"),
        "Invoice": ("invoice", "bill"),
        "Payment Receipt": ("receipt", "payment details"),
        "Tax Certificate": ("tax certificate", "policy no"),
        "Renewal Notice": ("renewal notice", "policy number"),
        "Premium Receipt": ("premium receipt", "receipt number"),
    }

    text_lower = text.lower()
    counts = {
        category: sum(text_lower.count(keyword) for keyword in keywords)
        for category, keywords in keywords_by_category.items()
    }

    # NOTE: "receipt" is a substring of both "Premium Receipt" keywords, so
    # the "Payment Receipt" count is never below the "Premium Receipt" count;
    # "Premium Receipt" is therefore only ever reached through the fallback.
    best = max(counts.values())
    leaders = [category for category, count in counts.items() if count == best]
    if len(leaders) == 1:
        return leaders[0]
    return "Premium Receipt"

def extract_details(text, category):
    """Extract relevant details from the text based on the document category.

    Args:
        text: The raw text extracted from the document.
        category: A category name as returned by ``categorize_text``.

    Returns:
        A dict holding, in this order of insertion: the labelled fields of
        the category (value "Not available" when the label is not found),
        a "DATE" and/or "MONEY" entry when spaCy finds such entities, and,
        for invoices and payment receipts, a "Table" entry.
    """
    # (field name, regex with one capture group) pairs per category.
    field_patterns = {
        "Claim Notice": (
            ('Account Number', r'Account Number\s*:\s*(\S+)'),
            ('Insurance Payee', r'Insurance Payee\s*:\s*(.+)'),
        ),
        "Invoice": (
            ('Invoice Number', r'Invoice Number\s*:\s*(\S+)'),
            ('Amount', r'Amount\s*:\s*([\d,]+)'),
        ),
        "Payment Receipt": (
            ('Receipt Number', r'Receipt Number\s*:\s*(\S+)'),
            ('Amount', r'Amount\s*:\s*([\d,]+)'),
        ),
        "Tax Certificate": (
            ('Policy No', r'Policy No\s*:\s*(\S+)'),
            ('Insured Name', r'Insured Name\s*:\s*(.+)'),
        ),
        "Renewal Notice": (
            ('Policy Number', r'Policy Number\s*:\s*(\S+)'),
            ('Due Date', r'Due Date\s*:\s*(\S+)'),
        ),
        "Premium Receipt": (
            ('Receipt Number', r'Receipt Number\s*:\s*(\S+)'),
            ('Premium Amount', r'Premium Amount\s*:\s*([\d,]+)'),
        ),
    }

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    details = {
        field: extract_value_using_regex(pattern, text)
        for field, pattern in field_patterns.get(category, ())
    }

    # Use spaCy's NER to extract named entities. Only one value per label is
    # kept: a later entity with the same label overwrites an earlier one.
    for ent in doc.ents:
        if ent.label_ in ("DATE", "MONEY"):
            details[ent.label_] = ent.text

    # Try to read the whole text as whitespace-separated tabular data.
    if category in ("Invoice", "Payment Receipt"):
        try:
            df = pd.read_csv(io.StringIO(text), sep=r"\s+", header=None, engine='python')
        except (pd.errors.ParserError, pd.errors.EmptyDataError):
            # Ragged rows or blank text: report the failure instead of crashing.
            details['Table'] = "Could not parse table data."
        else:
            details['Table'] = df.to_dict()

    return details

def extract_value_using_regex(pattern, text):
    """Return the first capture group of *pattern* found in *text*.

    Args:
        pattern: A regular expression containing at least one capture group.
        text: The text to search.

    Returns:
        The captured value with surrounding whitespace removed, or the string
        "Not available" when the pattern does not match.
    """
    match = re.search(pattern, text)
    if match is None:
        return "Not available"
    # Greedy groups such as (.+) run to the end of the line and would
    # otherwise keep trailing spaces or a carriage return.
    return match.group(1).strip()

def write_details_to_file(details, category, output_file):
    """Write extracted details to a text file.

    The file starts with the category followed by a blank line, then one
    ``key: value`` line per entry. A ``Table`` entry that is a dict is
    rendered as a pandas table without its index.

    Args:
        details: Mapping of field name to extracted value.
        category: Document category written as the first line.
        output_file: Path of the text file to create or overwrite.
    """
    # Format everything first so that a formatting error cannot leave a
    # truncated, half-written file behind.
    lines = [f"{category}\n\n"]
    for key, value in details.items():
        if key == 'Table' and isinstance(value, dict):
            table_text = pd.DataFrame(value).to_string(index=False)
            lines.append(f"\n{key}:\n{table_text}\n")
        else:
            lines.append(f"{key}: {value}\n")

    with open(output_file, "w") as f:
        f.writelines(lines)

def process_document(file_path):
    """Process a document (PDF or image): categorize it and extract details.

    Args:
        file_path: Path of a ``.png``, ``.jpg``, ``.jpeg`` or ``.pdf`` file
            (the extension check is case-insensitive).

    Returns:
        A ``(category, details, output_file)`` tuple, where ``output_file``
        is the path of the text file written next to the input, named after
        it with an ``_output.txt`` suffix.

    Raises:
        ValueError: If the file extension is not one of the supported ones.
    """
    lowered_path = file_path.lower()
    if lowered_path.endswith(('.png', '.jpg', '.jpeg')):
        text = extract_text_from_image(file_path)
    elif lowered_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    else:
        raise ValueError("Unsupported file format")

    category = categorize_text(text)
    details = extract_details(text, category)

    output_file = os.path.splitext(file_path)[0] + "_output.txt"
    write_details_to_file(details, category, output_file)
    return category, details, output_file

def _pdf_safe_text(value):
    """Return *value* as text limited to Latin-1, replacing other characters with '?'."""
    # NOTE(review): the built-in "Arial" font is assumed to be a Latin-1 core
    # font, so characters outside that range would fail when the PDF is
    # written -- confirm against the installed FPDF package.
    return str(value).encode("latin-1", "replace").decode("latin-1")


def generate_report(details, output_file, report_file):
    """Generate a PDF report of the extracted details.

    Args:
        details: Mapping of field name to extracted value. A ``Table`` entry
            that is a dict is rendered as a table, one PDF line per text row.
        output_file: Unused by this function; kept for interface
            compatibility with existing callers.
        report_file: Path of the PDF file to write.

    Returns:
        ``report_file``, unchanged.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt="Document Processing Report", ln=True, align='C')
    pdf.ln(10)

    for key, value in details.items():
        if key == 'Table' and isinstance(value, dict):
            pdf.cell(200, 10, txt=_pdf_safe_text(f"{key}:"), ln=True, align='L')
            pdf.ln(5)
            # Same rendering as the text output: the table without its index,
            # emitted one line per cell because a cell does not wrap text.
            table_text = pd.DataFrame(value).to_string(index=False)
            for row_text in table_text.splitlines():
                pdf.cell(200, 10, txt=_pdf_safe_text(row_text), ln=True, align='L')
        else:
            pdf.cell(200, 10, txt=_pdf_safe_text(f"{key}: {value}"), ln=True, align='L')
        pdf.ln(5)

    pdf.output(report_file)
    return report_file

def _amounts_for_plot(amount):
    """Normalize an ``Amount`` detail into a ``{label: value}`` mapping for plotting."""
    if isinstance(amount, dict):
        return amount
    if isinstance(amount, str):
        # Regex-extracted amounts look like "1,200"; anything that is not a
        # number (e.g. "Not available") yields nothing to plot.
        try:
            return {'Amount': float(amount.replace(',', ''))}
        except ValueError:
            return {}
    return {}


def generate_visualizations(details):
    """Render a chart for the extracted details in the Streamlit page.

    A bar plot is drawn when ``details['Amount']`` is a mapping or a numeric
    string; otherwise, when ``details['Table']`` is a dict with numeric
    columns, a heatmap of their correlation matrix is drawn. Nothing is
    rendered when neither applies.

    Args:
        details: Mapping of field name to extracted value.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        drawn = False
        amounts = _amounts_for_plot(details.get('Amount'))
        table = details.get('Table')

        if amounts:
            sns.barplot(x=list(amounts.keys()), y=list(amounts.values()), ax=ax)
            ax.set_title('Amounts by Category')
            ax.set_xlabel('Category')
            ax.set_ylabel('Amount')
            drawn = True
        elif isinstance(table, dict):
            # Correlation is only defined for numeric columns; text columns
            # parsed from the document are dropped first.
            corr = pd.DataFrame(table).select_dtypes(include='number').corr()
            if not corr.empty:
                sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
                ax.set_title('Correlation Matrix')
                drawn = True

        if drawn:
            st.pyplot(fig)
    finally:
        # The script is re-executed on every interaction, so figures must be
        # closed explicitly or they accumulate in memory.
        plt.close(fig)

# Initialize session state: the app starts on the upload page.
if 'page' not in st.session_state:
    st.session_state.page = 'upload'

def show_upload_page():
    """Render the upload page and process the uploaded document.

    On success the category, details and output file path are stored in
    ``st.session_state``, the page is switched to ``'result'`` and the script
    is rerun. On failure the error is shown and the upload page stays active.
    """
    # NOTE(review): the original header markup (including an image whose
    # source is not recoverable) appears to have been lost; this wrapper is
    # rebuilt from the `.header` / `.header h1` CSS rules -- confirm.
    st.markdown(
        """
<div class="header">
<h1>📄 Document Processor</h1>
</div>
""",
        unsafe_allow_html=True,
    )

    st.markdown("Upload a PDF or image file to extract and categorize its contents.")

    # A non-empty label is supplied for accessibility; it stays hidden
    # because label_visibility is 'collapsed'.
    uploaded_file = st.file_uploader(
        "Upload a document",
        type=["pdf", "png", "jpg", "jpeg"],
        help="Upload PDF, PNG, JPG, or JPEG files",
        label_visibility='collapsed',
    )
    if uploaded_file is None:
        return

    # basename() keeps any directory component of the client-supplied name
    # out of the path that is written to.
    file_path = f"temp_{os.path.basename(uploaded_file.name)}"
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    try:
        with st.spinner("Processing document..."):
            category, details, output_file = process_document(file_path)
    except Exception as e:
        st.error(f"An error occurred: {e}")
        return
    finally:
        # The uploaded copy is only needed while processing; removal is
        # best-effort and must not mask a processing error.
        try:
            os.remove(file_path)
        except OSError:
            pass

    st.session_state.category = category
    st.session_state.details = details
    st.session_state.output_file = output_file
    st.session_state.page = 'result'

    # Newer Streamlit exposes st.rerun; older releases only have
    # st.experimental_rerun.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()

def show_result_page():
    """Render the result page: category, extracted details and downloads.

    The text output and the PDF report are read into ``st.session_state``
    once and the files are then deleted, so later reruns of the script
    (for example after a download button is clicked) no longer depend on
    files that have already been cleaned up.
    """
    st.success(f"Processed document categorized as **{st.session_state.category}**")

    st.markdown("Extracted Details", unsafe_allow_html=True)

    details = st.session_state.details
    output_file = st.session_state.output_file
    report_file = os.path.splitext(output_file)[0] + "_report.pdf"

    downloads = st.session_state.get('download_cache')
    if downloads is None or downloads['output_file'] != output_file:
        report_file = generate_report(details, output_file, report_file)
        with open(output_file, "rb") as f:
            output_bytes = f.read()
        with open(report_file, "rb") as f:
            report_bytes = f.read()

        # Clean up the temporary files now that their content is in memory.
        for path in (output_file, report_file):
            try:
                os.remove(path)
            except OSError:
                pass

        downloads = {
            'output_file': output_file,
            'report_file': report_file,
            'output_bytes': output_bytes,
            'report_bytes': report_bytes,
        }
        st.session_state['download_cache'] = downloads

    col1, col2 = st.columns(2)
    with col1:
        st.json(details)
        generate_visualizations(details)
    with col2:
        st.markdown("### Download Extracted Details")
        st.download_button(
            "Download Output File",
            downloads['output_bytes'],
            file_name=os.path.basename(downloads['output_file']),
            mime="text/plain",
            help="Click to download the extracted details as a text file",
        )
        st.download_button(
            "Download Report",
            downloads['report_bytes'],
            file_name=os.path.basename(downloads['report_file']),
            mime="application/pdf",
            help="Click to download the report as a PDF file",
        )

# Render the appropriate page based on session state
if st.session_state.page == 'upload':
    show_upload_page()
elif st.session_state.page == 'result':
    show_result_page()

# Custom CSS and JavaScript for animations.
# The rules must sit inside <style> / <script> elements; without those
# wrappers the whole block is rendered as visible text on the page.
# NOTE(review): the background-image URL below is truncated ("...") and will
# not load as written -- restore the full URL.
# NOTE(review): scripts injected through st.markdown are presumably not
# executed by the browser, so showLoader()/hideLoader() may be inert -- confirm.
st.markdown(
    """
<style>
.main {
background-image: url('https://images.unsplash.com/photo-16025 ... d3ad70ae56');
background-size: cover;
background-position: center;
font-family: 'Arial', sans-serif;
}
.header {
background-color: rgba(76, 175, 80, 0.8);
padding: 20px;
border-radius: 10px;
text-align: center;
color: white;
margin-bottom: 30px;
animation: fadeIn 1s ease-in-out;
}
.header h1 {
margin: 0;
font-size: 2.5rem;
display: inline-block;
vertical-align: middle;
}
.header img {
height: 50px;
margin-right: 10px;
vertical-align: middle;
}
.upload-section {
background-color: rgba(255, 255, 255, 0.8);
padding: 20px;
border-radius: 10px;
box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.1);
text-align: center;
margin-bottom: 30px;
animation: fadeInUp 1s ease-in-out;
}
.upload-section label {
font-size: 1.2rem;
color: #333;
}
.upload-section .stFileUploader {
margin: 20px 0;
}
.stButton button {
background-color: #4CAF50;
color: white;
border: none;
padding: 10px 20px;
text-align: center;
text-decoration: none;
display: inline-block;
font-size: 16px;
margin: 4px 2px;
cursor: pointer;
border-radius: 16px;
transition: background-color 0.3s, transform 0.3s;
}
.stButton button:hover {
background-color: #45a049;
transform: scale(1.05);
}
.stSpinner {
display: flex;
justify-content: center;
margin-top: 20px;
}
.results {
background-color: rgba(255, 255, 255, 0.8);
padding: 20px;
border-radius: 10px;
box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.1);
text-align: center;
animation: fadeInUp 1s ease-in-out;
}
.results h2 {
margin-top: 0;
}
.results .stJson {
text-align: left;
}
.download-section {
text-align: center;
margin-top: 20px;
}
.full-screen-overlay {
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 100%;
background-color: rgba(255, 255, 255, 0.9);
z-index: 1000;
display: none;
justify-content: center;
align-items: center;
animation: fadeIn 0.5s ease-in-out;
}
.full-screen-overlay.show {
display: flex;
}
.full-screen-loader {
border: 16px solid #f3f3f3;
border-radius: 50%;
border-top: 16px solid #4CAF50;
width: 120px;
height: 120px;
animation: spin 2s linear infinite;
}
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
@keyframes fadeIn {
0% { opacity: 0; }
100% { opacity: 1; }
}
@keyframes fadeInUp {
0% { opacity: 0; transform: translateY(20px); }
100% { opacity: 1; transform: translateY(0); }
}
</style>
<script>
function showLoader() {
document.getElementById('full-screen-overlay').classList.add('show');
}
function hideLoader() {
document.getElementById('full-screen-overlay').classList.remove('show');
}
</script>
""",
    unsafe_allow_html=True,
)

# Full-screen loader overlay.
# NOTE(review): the original markup was lost; it is rebuilt from the
# `full-screen-overlay` id used by showLoader()/hideLoader() and the
# `.full-screen-overlay` / `.full-screen-loader` CSS rules -- confirm.
# The overlay stays hidden (display: none) until the `show` class is added.
st.markdown("""
<div id="full-screen-overlay" class="full-screen-overlay">
<div class="full-screen-loader"></div>
</div>
""", unsafe_allow_html=True)


Подробнее здесь: https://stackoverflow.com/questions/778 ... n-handling
Реклама
Ответить Пред. темаСлед. тема

Быстрый ответ

Изменение регистра текста: 
Смайлики
:) :( :oops: :roll: :wink: :muza: :clever: :sorry: :angel: :read: *x)
Ещё смайлики…
   
К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми.

Максимально разрешённый размер вложения: 15 МБ.

  • Похожие темы
    Ответы
    Просмотры
    Последнее сообщение

Вернуться в «Python»