Чат-бот RAG не отвечает на перефразированные вопросы

Чат-бот RAG не отвечает на перефразированные вопросы ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Чат-бот RAG не отвечает на перефразированные вопросы

Цитата

Сообщение Anonymous » 14 фев 2026, 16:55

Я создал RAG-чат-бота на Python с использованием LangChain и FAISS в качестве векторного хранилища.
И данные хранятся в формате JSON.
Чат-бот иногда отказывается отвечать, когда вопрос перефразируется.
Вот два разговора:
Первый разговор:
пользователь: Что входит в учебный план магистратуры по ИИ?
помощник: The Master of Science in Artificial Intelligence (MSAI) study plan involves ...etc.
Второй разговор:
пользователь: Каков учебный план магистратуры по ИИ?
помощник: Извините, мне не удалось найти эту информацию.
Мы ясно видим, что оба вопроса имеют одно и то же значение.
Данные, к которым относятся эти вопросы, были собраны краулером, обработаны в конвейере RAG и сохранены в формате JSON.
Я пробовал изменить размер фрагментов (chunk size) и сменить модель эмбеддингов с all-MiniLM на all-mpnet.
Но та же логическая ошибка по-прежнему возникает.
Crawl.py:
def crawl_study_plan(study_plan_link: str, year: int, timeout: float = 30.0):
    """Crawl a single study plan (HTML table or PDF) and return structured JSON.

    Args:
        study_plan_link: URL of the study plan page or PDF file.
        year: Year of the study plan; copied into the result unchanged.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        For a link ending in ``.pdf`` (case-insensitive): a dict with the
        keys ``year``, ``link`` and ``text``.
        For any other link: a dict with the keys ``year``, ``link``,
        ``title`` and ``semesters``.
        If any step raises, the failure is printed and a dict with an empty
        ``title`` and empty ``semesters`` is returned instead; this function
        never propagates an ``Exception``.
    """
    try:
        if study_plan_link.lower().endswith(".pdf"):
            text = extract_text_from_pdf(study_plan_link)
            return {"year": year, "link": study_plan_link, "text": translate_if_not_english(text)}

        # requests applies no timeout by default, so a stalled server would
        # block the whole crawl. A timeout raises and is handled by the
        # best-effort fallback below.
        response = requests.get(study_plan_link, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Fall back to a generic title when the page has no matching heading.
        title_tag = soup.select_one("h1.text-primary")
        title = translate_if_not_english(title_tag.get_text(strip=True)) if title_tag else f"Study Plan {year}"

        # A page without the expected table yields an empty semesters mapping.
        table = soup.select_one("div.table-responsive table")
        semesters = parse_study_plan_table(table, study_plan_link) if table else {}

        return {"year": year, "link": study_plan_link, "title": title, "semesters": semesters}
    except Exception as e:
        # Deliberate best-effort: one bad link must not abort the crawl.
        print(f"[!] Failed to crawl study plan {study_plan_link}: {e}")
        return {"year": year, "link": study_plan_link, "title": "", "semesters": {}}

Data.py:
def split_text_chunks(documents):
    """Split documents into chunks for better vector search.

    Args:
        documents: Documents to split; passed to the splitter's
            ``split_documents`` method.

    Returns:
        The chunks produced by the splitter. The chunk count is printed as a
        side effect.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,  # Increased chunk size for structured data
        chunk_overlap=200,  # Increased overlap for better context
        separators=["\n\n", "\n", ".", "!", "?", " "]
    )
    chunks = splitter.split_documents(documents)
    # The message must stay on one line: a single-quoted f-string cannot
    # contain a literal line break.
    print(f"🧩 Created {len(chunks)} text chunks.")
    return chunks

def load_crawled_json_data():
    """Load all JSON data from the crawled structured data.

    Reads every ``*.json`` file in the ``major_details`` directory under
    ``DATA_DIR`` and wraps each one in a ``Document`` whose page content is a
    header line followed by the pretty-printed JSON.

    Returns:
        A list of ``Document`` objects, one per file that loaded successfully.
        The list is empty when the directory does not exist. Files that fail
        to load are reported with ``print`` and skipped.
    """
    documents = []
    # Load major details
    major_details_dir = os.path.join(DATA_DIR, "major_details")
    if os.path.exists(major_details_dir):
        for filename in os.listdir(major_details_dir):
            if filename.endswith('.json'):
                try:
                    with open(os.path.join(major_details_dir, filename), 'r', encoding='utf-8') as f:
                        detail_data = json.load(f)
                    content = f"Major Details ({filename}):\n{json.dumps(detail_data, indent=2, ensure_ascii=False)}"
                    documents.append(Document(
                        page_content=content,
                        metadata={"source": filename, "type": "major_details"}
                    ))
                    print(f"📖 Loaded major details from {filename}")
                except Exception as e:
                    # Best-effort: one unreadable file must not stop the load.
                    print(f"⚠️ Failed to load major details from {filename}: {e}")
    return documents

Подробнее здесь: https://stackoverflow.com/questions/797 ... -questions

1771077352

Anonymous

Я создал RAG-чат-бота на Python с использованием LangChain и FAISS в качестве векторного хранилища.
И данные хранятся в формате JSON.
Чат-бот иногда отказывается отвечать, когда вопрос перефразируется.
Вот два разговора:
Первый разговор:
[b]пользователь[/b]: Что входит в учебный план магистратуры по ИИ?
[b]помощник[/b]: The Master of Science in Artificial Intelligence (MSAI) study plan involves ...etc.
Второй разговор:
[b]пользователь[/b]: Каков учебный план магистратуры по ИИ?
[b]помощник[/b]: Извините, мне не удалось найти эту информацию.
Мы ясно видим, что оба вопроса имеют одно и то же значение.
Данные, к которым относятся эти вопросы, были собраны краулером, обработаны в конвейере RAG и сохранены в формате JSON.
Я пробовал изменить размер фрагментов (chunk size) и сменить модель эмбеддингов с all-MiniLM на all-mpnet.
Но та же логическая ошибка по-прежнему возникает.
Crawl.py:
def crawl_study_plan(study_plan_link: str, year: int, timeout: float = 30.0):
    """Crawl a single study plan (HTML table or PDF) and return structured JSON.

    Args:
        study_plan_link: URL of the study plan page or PDF file.
        year: Year of the study plan; copied into the result unchanged.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        For a link ending in ``.pdf`` (case-insensitive): a dict with the
        keys ``year``, ``link`` and ``text``.
        For any other link: a dict with the keys ``year``, ``link``,
        ``title`` and ``semesters``.
        If any step raises, the failure is printed and a dict with an empty
        ``title`` and empty ``semesters`` is returned instead; this function
        never propagates an ``Exception``.
    """
    try:
        if study_plan_link.lower().endswith(".pdf"):
            text = extract_text_from_pdf(study_plan_link)
            return {"year": year, "link": study_plan_link, "text": translate_if_not_english(text)}

        # requests applies no timeout by default, so a stalled server would
        # block the whole crawl. A timeout raises and is handled by the
        # best-effort fallback below.
        response = requests.get(study_plan_link, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Fall back to a generic title when the page has no matching heading.
        title_tag = soup.select_one("h1.text-primary")
        title = translate_if_not_english(title_tag.get_text(strip=True)) if title_tag else f"Study Plan {year}"

        # A page without the expected table yields an empty semesters mapping.
        table = soup.select_one("div.table-responsive table")
        semesters = parse_study_plan_table(table, study_plan_link) if table else {}

        return {"year": year, "link": study_plan_link, "title": title, "semesters": semesters}
    except Exception as e:
        # Deliberate best-effort: one bad link must not abort the crawl.
        print(f"[!] Failed to crawl study plan {study_plan_link}: {e}")
        return {"year": year, "link": study_plan_link, "title": "", "semesters": {}}

Data.py:
def split_text_chunks(documents):
    """Break the given documents into overlapping chunks for vector search.

    Builds a RecursiveCharacterTextSplitter with a chunk size of 1200 and an
    overlap of 200, splits ``documents`` with it, prints how many chunks were
    produced, and returns them.
    """
    splitter_options = {
        # Larger chunks and overlap were chosen for structured data.
        "chunk_size": 1200,
        "chunk_overlap": 200,
        # Separator list handed to the splitter, from paragraph breaks down
        # to single spaces.
        "separators": ["\n\n", "\n", ".", "!", "?", " "],
    }
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"🧩 Created {len(pieces)} text chunks.")
    return pieces

def load_crawled_json_data():
    """Collect the crawled major-details JSON files as Document objects.

    Looks in the ``major_details`` directory under ``DATA_DIR``. Each
    ``*.json`` file becomes one ``Document`` whose page content is a header
    line followed by the pretty-printed JSON. Files that cannot be loaded are
    reported with ``print`` and skipped.

    Returns:
        The list of documents built; empty if the directory does not exist.
    """
    loaded = []
    details_root = os.path.join(DATA_DIR, "major_details")

    # Nothing to load when the crawl output directory is missing.
    if not os.path.exists(details_root):
        return loaded

    for entry in os.listdir(details_root):
        if not entry.endswith(".json"):
            continue
        try:
            with open(os.path.join(details_root, entry), "r", encoding="utf-8") as handle:
                payload = json.load(handle)
            pretty = json.dumps(payload, indent=2, ensure_ascii=False)
            loaded.append(Document(
                page_content=f"Major Details ({entry}):\n{pretty}",
                metadata={"source": entry, "type": "major_details"},
            ))
            print(f"📖 Loaded major details from {entry}")
        except Exception as err:
            # Best-effort: report the bad file and carry on with the rest.
            print(f"⚠️ Failed to load major details from {entry}: {err}")

    return loaded
 

Подробнее здесь: [url]https://stackoverflow.com/questions/79770341/rag-chatbot-does-not-answer-paraphrased-questions[/url]