I'm wondering whether this problem is related to how I process the resume (which is in PDF format) or to the input pipeline I'm using. I tried adjusting the PDF file, and at first the problem seemed fixed, but it still persists.
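A quick way to rule out the extraction step is to print what PyPDFLoader actually returns for the resume; a minimal sanity-check sketch:
Code:
from langchain_community.document_loaders import PyPDFLoader

# Print each page number and the first 300 extracted characters, so any
# garbled or missing text coming out of the PDF is visible immediately.
for doc in PyPDFLoader("Resume.pdf").load():
    print(doc.metadata.get("page"), repr(doc.page_content[:300]))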
The relevant parts of my code:
pdf_handling.py
Code:
import argparse
import os
import shutil

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain_chroma import Chroma

from get_embedding_function import get_embedding_function


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--reset", action="store_true", help="Reset the database.")
    args = parser.parse_args()
    if args.reset:
        print("✨ Clearing Database")
        clear_database()

    documents = load_document()
    chunks = split_documents(documents)
    add_to_chroma(chunks)


def load_document():
    # Load the resume; PyPDFLoader returns one Document per page.
    document_loader = PyPDFLoader("Resume.pdf")
    return document_loader.load()


def split_documents(documents: list[Document]):
    # Split the page documents into overlapping chunks for embedding.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)


def add_to_chroma(chunks: list[Document]):
    db_directory = "my_chroma_data"
    os.makedirs(db_directory, exist_ok=True)
    db = Chroma(
        persist_directory=db_directory,
        embedding_function=get_embedding_function(),
    )

    # Give every chunk a deterministic id so re-running the script only
    # adds chunks that are not already in the database.
    chunks_with_ids = calculate_chunk_ids(chunks)
    existing_items = db.get(include=[])  # ids are always returned
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    new_chunks = [c for c in chunks_with_ids if c.metadata["id"] not in existing_ids]
    if new_chunks:
        print(f"👉 Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        db.add_documents(new_chunks, ids=new_chunk_ids)
    else:
        print("✅ No new documents to add")


def calculate_chunk_ids(chunks):
    # Ids look like "Resume.pdf:2:3" (source, page number, chunk index on the page).
    last_page_id = None
    current_chunk_index = 0
    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0
        chunk.metadata["id"] = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id
    return chunks


def clear_database():
    db_path = "my_chroma_data/chroma.sqlite3"
    if os.path.exists(db_path):
        shutil.rmtree(os.path.dirname(db_path))
        print(f"✨ Database at {db_path} has been cleared.")
    else:
        print("⚠️ No database found to clear.")


if __name__ == "__main__":
    main()
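Both scripts import get_embedding_function, which isn't shown here; a minimal version compatible with this pipeline would look roughly like this sketch (OllamaEmbeddings and the nomic-embed-text model name are illustrative assumptions, the exact setup may differ):
Code:
# get_embedding_function.py (minimal sketch; the embedding model name
# "nomic-embed-text" is only an example)
from langchain_community.embeddings.ollama import OllamaEmbeddings


def get_embedding_function():
    # Return the embedding model used for both indexing and querying;
    # it must be the same in both scripts or retrieval will be meaningless.
    return OllamaEmbeddings(model="nomic-embed-text")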
The chat script:
Code:
import os

from langchain_chroma import Chroma  # Updated import
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

from get_embedding_function import get_embedding_function

PROMPT_TEMPLATE = """
You're roleplaying as Maximiliano López Montaño, based on what the resume says.
This is the resume: {context}
---
Answer the question based on the above context: {question}
"""


def main():
    print("Welcome to the chatbot! Type 'exit' to quit.")
    while True:
        query_text = input("You: ")
        if query_text.lower() == "exit":
            print("Goodbye!")
            break
        response = query_rag(query_text)
        print(f"AI: {response}")


def query_rag(query_text: str):
    embedding_function = get_embedding_function()
    persist_directory = "my_chroma_data"
    db_path = os.path.join(persist_directory, "chroma.sqlite3")
    if not os.path.exists(db_path):
        print("⚠️ Database not found. Please run pdf_handling.py to create the database.")
        return "No data available."

    db = Chroma(persist_directory=persist_directory, embedding_function=embedding_function)

    # Retrieve the five closest chunks and stitch them into the prompt context.
    results = db.similarity_search_with_score(query_text, k=5)
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])

    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    model = Ollama(model="gemma2:2b")
    return model.invoke(prompt)


if __name__ == "__main__":
    main()
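Since the model's answer depends entirely on what the retriever hands it, inspecting the retrieval step directly can also help; a small debugging sketch (the query string is just an example):
Code:
from langchain_chroma import Chroma
from get_embedding_function import get_embedding_function

db = Chroma(persist_directory="my_chroma_data",
            embedding_function=get_embedding_function())

# Chroma returns a distance by default, so lower scores mean closer matches.
for doc, score in db.similarity_search_with_score("work experience", k=5):
    print(f"{score:.4f}  {doc.metadata.get('id')}  {doc.page_content[:80]!r}")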
If necessary, I can also share the PDF file.
More details here: https://stackoverflow.com/questions/790 ... a22b-model