Как улучшить результаты? - Цифровое Кемерово

Как улучшить результаты? ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Цитата

Сообщение Anonymous » 16 янв 2025, 14:00

Сейчас пишу программу, в которой мне нужно извлекать информацию с помощью RAG (retrieval-augmented generation). Эта информация затем должна использоваться языковой моделью (LLM). Я использую FAISS в среде Python с оболочкой Langchain.
Источником данных является документ с нормативными положениями, который я разбиваю на отдельные тексты по параграфам. Это позволяет мне гарантировать, что каждый текст целиком относится к одной теме.
Однако теперь у меня возникла проблема: база данных дает мне довольно странные результаты. Если я спрошу у базы данных (пример заведомо не по теме документа), как почистить банан, я получу результаты, касающиеся, например, лучшего способа посадки киви.
Соответственно, результаты не очень мне помогают, и мне интересно, как я могу их улучшить.
Вот мой код:
Входные данные метода поиска обычно состоят из одного предложения, содержащего вопрос.
Метод поиска должен вернуть один или два документа.
import logging
import os
from pathlib import Path

import PyPDF2
from langchain_core.documents import Document

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceEmbeddings
from tqdm import tqdm

# Make sure the log directory exists first: logging.basicConfig opens the
# target file immediately and raises FileNotFoundError if "logs/" is missing.
os.makedirs("logs", exist_ok=True)

# Route INFO-and-above records from the whole process to logs/api.log.
logging.basicConfig(
    level=logging.INFO,
    filename="logs/api.log",
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

# Module-level logger; its name fills the %(name)s field of the format above.
logger = logging.getLogger(__name__)

class FaissConnection:
    """Singleton that owns a FAISS index built from the regulation PDF.

    The first instantiation reads ``resources/document.pdf`` (resolved two
    directory levels above this file), splits it into chunks, embeds them
    and builds the index. Every later instantiation returns that same object.
    """

    # The shared instance; stays None until a construction fully succeeds.
    _instance = None

    def __new__(cls):
        """Return the shared instance, building it on first use.

        The instance is stored in ``cls._instance`` only after
        ``_initialize`` has finished, so a failed start-up (for example a
        missing PDF) does not leave a half-built object without ``db``
        behind; the next call retries the initialization instead.
        """
        if cls._instance is None:
            instance = super().__new__(cls)
            instance._initialize()
            cls._instance = instance
        return cls._instance

    def _initialize(self):
        """Load the PDF chunks, create the embeddings and build the index.

        Sets ``self.embeddings`` and ``self.db``.
        """
        # Plain assignment: the earlier ``+=`` on a local that was never
        # defined raised UnboundLocalError before anything was loaded.
        character_chunks = self.get_regulation_chunks()

        # NOTE(review): no model name is passed, so the library default is
        # used -- confirm it suits the language of the document.
        self.embeddings = HuggingFaceEmbeddings()

        logger.info("Text split into %d chunks successfully.", len(character_chunks))

        self.db = FAISS.from_documents(character_chunks, self.embeddings)
        logger.info("FAISS index created successfully.")

    @staticmethod
    def get_regulation_chunks() -> list[Document]:
        """Return the regulation PDF split into chunks for indexing.

        Returns:
            The chunks produced by splitting every kept page on a newline
            followed by the section sign.
        """
        documents = FaissConnection.get_regulation_documents()
        logger.info("Text extracted from PDF file successfully. Total pages: %d", len(documents))

        # NOTE(review): the splitter works page by page and also applies its
        # own size settings, so a chunk is presumably not guaranteed to be
        # exactly one paragraph -- verify the produced chunks.
        text_splitter = CharacterTextSplitter(separator="\n§")
        return text_splitter.split_documents(documents)

    @staticmethod
    def get_regulation_documents() -> list[Document]:
        """Return one document per sufficiently long page of the PDF.

        Returns:
            The pages of ``resources/document.pdf`` whose extracted text is
            longer than 100 characters.

        Raises:
            FileNotFoundError: If the PDF does not exist.
        """
        project_root = Path(__file__).resolve().parents[2]
        pdf_path = project_root / "resources" / "document.pdf"

        # A Path object is always truthy, so only existence needs checking.
        if not pdf_path.exists():
            raise FileNotFoundError("the file does not exist.")

        pages = FaissConnection.load_pdf_from_file(pdf_path)
        # Drop near-empty pages: only text longer than 100 characters is kept.
        return [doc for doc in pages if len(doc.page_content) > 100]

    @staticmethod
    def load_pdf_from_file(file_path: str) -> list[Document]:
        """Extract the text of every page of a PDF file.

        Args:
            file_path: Location of the PDF; ``get_regulation_documents``
                passes a ``pathlib.Path`` here.

        Returns:
            One ``Document`` per page, in page order.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"The file {file_path} does not exist.")

        reader = PyPDF2.PdfReader(file_path)
        # Iterate the pages directly; tqdm only adds the progress bar.
        return [
            Document(page_content=page.extract_text())
            for page in tqdm(reader.pages, desc="Reading PDF pages")
        ]

    def search(self, query, return_amount=1):
        """Query the index and return the best matching document(s).

        Args:
            query (str): The search query.
            return_amount (int): Number of documents to request.

        Returns:
            A single document when ``return_amount`` is 1, otherwise the
            list of documents returned by the retriever.

        Raises:
            IndexError: If ``return_amount`` is 1 and the retriever returns
                no documents.
        """
        # NOTE(review): "mmr" is the configured search type; if results look
        # off-topic, compare against the retriever's plain similarity search.
        retriever = self.db.as_retriever(
            search_type="mmr",
            search_kwargs={"k": return_amount},  # limit the number of results
        )
        docs = retriever.invoke(query)

        logger.info("Search query executed. Returning top %d result(s).", return_amount)
        for doc in docs:
            logger.info("Document: %s", doc.page_content)

        return docs[0] if return_amount == 1 else docs

if __name__ == "__main__":
    # Obtain the shared connection; the index is built on this first call.
    connection = FaissConnection()

    # Sample query asking the index for its two best matching documents.
    top_matches = connection.search("How to peel a Banana?", return_amount=2)

Подробнее здесь: https://stackoverflow.com/questions/79361321/how-to-improve-faiss-results

1737025229

Anonymous

Сейчас пишу программу, в которой мне нужно извлекать информацию с помощью RAG (retrieval-augmented generation). Эта информация затем должна использоваться языковой моделью (LLM). Я использую FAISS в среде Python с оболочкой Langchain.
Источником данных является документ с нормативными положениями, который я разбиваю на отдельные тексты по параграфам. Это позволяет мне гарантировать, что каждый текст целиком относится к одной теме.
Однако теперь у меня возникла проблема: база данных дает мне довольно странные результаты. Если я спрошу у базы данных (пример заведомо не по теме документа), как почистить банан, я получу результаты, касающиеся, например, лучшего способа посадки киви.
Соответственно, результаты не очень мне помогают, и мне интересно, как я могу их улучшить.
Вот мой код:
Входные данные метода поиска обычно состоят из одного предложения, содержащего вопрос.
Метод поиска должен вернуть один или два документа.
import logging
import os
from pathlib import Path

import PyPDF2
from langchain_core.documents import Document

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceEmbeddings
from tqdm import tqdm

# Make sure the log directory exists first: logging.basicConfig opens the
# target file immediately and raises FileNotFoundError if "logs/" is missing.
os.makedirs("logs", exist_ok=True)

# Route INFO-and-above records from the whole process to logs/api.log.
logging.basicConfig(
    level=logging.INFO,
    filename="logs/api.log",
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

# Module-level logger; its name fills the %(name)s field of the format above.
logger = logging.getLogger(__name__)

class FaissConnection:
    """Singleton that owns a FAISS index built from the regulation PDF.

    The first instantiation reads ``resources/document.pdf`` (resolved two
    directory levels above this file), splits it into chunks, embeds them
    and builds the index. Every later instantiation returns that same object.
    """

    # The shared instance; stays None until a construction fully succeeds.
    _instance = None

    def __new__(cls):
        """Return the shared instance, building it on first use.

        The instance is stored in ``cls._instance`` only after
        ``_initialize`` has finished, so a failed start-up (for example a
        missing PDF) does not leave a half-built object without ``db``
        behind; the next call retries the initialization instead.
        """
        if cls._instance is None:
            instance = super().__new__(cls)
            instance._initialize()
            cls._instance = instance
        return cls._instance

    def _initialize(self):
        """Load the PDF chunks, create the embeddings and build the index.

        Sets ``self.embeddings`` and ``self.db``.
        """
        # Plain assignment: the earlier ``+=`` on a local that was never
        # defined raised UnboundLocalError before anything was loaded.
        character_chunks = self.get_regulation_chunks()

        # NOTE(review): no model name is passed, so the library default is
        # used -- confirm it suits the language of the document.
        self.embeddings = HuggingFaceEmbeddings()

        logger.info("Text split into %d chunks successfully.", len(character_chunks))

        self.db = FAISS.from_documents(character_chunks, self.embeddings)
        logger.info("FAISS index created successfully.")

    @staticmethod
    def get_regulation_chunks() -> list[Document]:
        """Return the regulation PDF split into chunks for indexing.

        Returns:
            The chunks produced by splitting every kept page on a newline
            followed by the section sign.
        """
        documents = FaissConnection.get_regulation_documents()
        logger.info("Text extracted from PDF file successfully.  Total pages: %d", len(documents))

        # NOTE(review): the splitter works page by page and also applies its
        # own size settings, so a chunk is presumably not guaranteed to be
        # exactly one paragraph -- verify the produced chunks.
        text_splitter = CharacterTextSplitter(separator="\n§")
        return text_splitter.split_documents(documents)

    @staticmethod
    def get_regulation_documents() -> list[Document]:
        """Return one document per sufficiently long page of the PDF.

        Returns:
            The pages of ``resources/document.pdf`` whose extracted text is
            longer than 100 characters.

        Raises:
            FileNotFoundError: If the PDF does not exist.
        """
        project_root = Path(__file__).resolve().parents[2]
        pdf_path = project_root / "resources" / "document.pdf"

        # A Path object is always truthy, so only existence needs checking.
        if not pdf_path.exists():
            raise FileNotFoundError("the file does not exist.")

        pages = FaissConnection.load_pdf_from_file(pdf_path)
        # Drop near-empty pages: only text longer than 100 characters is kept.
        return [doc for doc in pages if len(doc.page_content) > 100]

    @staticmethod
    def load_pdf_from_file(file_path: str) -> list[Document]:
        """Extract the text of every page of a PDF file.

        Args:
            file_path: Location of the PDF; ``get_regulation_documents``
                passes a ``pathlib.Path`` here.

        Returns:
            One ``Document`` per page, in page order.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"The file {file_path} does not exist.")

        reader = PyPDF2.PdfReader(file_path)
        # Iterate the pages directly; tqdm only adds the progress bar.
        return [
            Document(page_content=page.extract_text())
            for page in tqdm(reader.pages, desc="Reading PDF pages")
        ]

    def search(self, query, return_amount=1):
        """Query the index and return the best matching document(s).

        Args:
            query (str): The search query.
            return_amount (int): Number of documents to request.

        Returns:
            A single document when ``return_amount`` is 1, otherwise the
            list of documents returned by the retriever.

        Raises:
            IndexError: If ``return_amount`` is 1 and the retriever returns
                no documents.
        """
        # NOTE(review): "mmr" is the configured search type; if results look
        # off-topic, compare against the retriever's plain similarity search.
        retriever = self.db.as_retriever(
            search_type="mmr",
            search_kwargs={"k": return_amount},  # limit the number of results
        )
        docs = retriever.invoke(query)

        logger.info("Search query executed. Returning top %d result(s).", return_amount)
        for doc in docs:
            logger.info("Document: %s", doc.page_content)

        return docs[0] if return_amount == 1 else docs

if __name__ == "__main__":
    # Obtain the shared connection; the index is built on this first call.
    connection = FaissConnection()

    # Sample query asking the index for its two best matching documents.
    top_matches = connection.search("How to peel a Banana?", return_amount=2)

 

Подробнее здесь: [url]https://stackoverflow.com/questions/79361321/how-to-improve-faiss-results[/url]