Query bot on multiple JSON files in Langchain

I have about 30 GB of JSON data spread across multiple files, and I want to build a query bot on top of them.
I built the same thing for plain text files, but I'm not sure how it carries over to JSON data.
I looked into JSONLoader, but I don't know how to use it to convert the JSON data into vectors and store them in ChromaDB so they can be queried.
https://python.langchain.com/docs/modules/data_connection/document_loaders/json
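From the docs above, JSONLoader appears to require a jq_schema telling it which part of each file to extract. Below is a minimal sketch of how I understand single-file loading would work; the file path is illustrative, and jq_schema='.' with text_content=False are assumptions on my part for files that hold nested objects rather than plain strings:
# Assumed single-file usage of JSONLoader (needs the jq package: pip install jq)
from langchain.document_loaders import JSONLoader

loader = JSONLoader(
    file_path='/content/drive/MyDrive/Data Science/LLM/docs/json files/sample.json',  # illustrative path
    jq_schema='.',       # assumption: extract the whole JSON document
    text_content=False,  # assumption: values are objects/arrays, not plain strings
)
docs = loader.load()
print(docs[0].page_content[:200])  # inspect what was actually extracted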
Sample JSON file: http://jsonblob.com/1147948130921996288

Code for text data:
# Loading and Splitting the Documents
from langchain.document_loaders import DirectoryLoader
directory = '/content/drive/MyDrive/Data Science/LLM/docs/text files'
def load_docs(directory):
    loader = DirectoryLoader(directory)
    documents = loader.load()
    return documents
documents = load_docs(directory)
len(documents)
from langchain.text_splitter import RecursiveCharacterTextSplitter
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)
    return docs
docs = split_docs(documents)
print(len(docs))
# Embedding Text Using Langchain
from langchain.embeddings import SentenceTransformerEmbeddings
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# Creating Vector Store with Chroma DB
from langchain.vectorstores import Chroma
persist_directory = "/content/drive/MyDrive/Data Science/LLM/docs/chroma_db"
vectordb = Chroma.from_documents(
    documents=docs, embedding=embeddings, persist_directory=persist_directory
)
vectordb.persist()
# Using OpenAI Large Language Models (LLM) with Chroma DB
import os
os.environ["OPENAI_API_KEY"] = "sk-your-key"
from langchain.chat_models import ChatOpenAI
model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=model_name)
# Extracting Answers from Documents
from langchain.chains.question_answering import load_qa_chain
chain = load_qa_chain(llm, chain_type="stuff", verbose=True)
query = "who is Mr. Jabez Wilson?"
matching_docs = vectordb.similarity_search(query)
answer = chain.run(input_documents=matching_docs, question=query)
answer
What I tried for the JSON data:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import JSONLoader
import json
# Define a simple JSON schema (modify as needed)
json_schema = {}
# Function to validate a JSON document against a schema
def validate_json(json_data, schema):
    return all(key in json_data for key in schema.keys())
# 1. Load JSON Files
def load_json_docs(directory):
    loader = DirectoryLoader(directory, glob='**/*.json', loader_cls=JSONLoader)
    documents = loader.load()
    # Manually filter and validate documents based on the JSON schema
    valid_documents = []
    for doc in documents:
        try:
            # Parse the JSON content
            json_data = json.loads(doc.page_content)
            if validate_json(json_data, json_schema):
                valid_documents.append(doc)
        except json.JSONDecodeError:
            pass  # Invalid JSON format, skip this document
    return valid_documents
directory = '/content/drive/MyDrive/Data Science/LLM/docs/json files'
json_documents = load_json_docs(directory)
len(json_documents)
# 2. Split JSON Documents
def split_json_docs(documents, chunk_size=1000, chunk_overlap=20):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)
    return docs
split_json_documents = split_json_docs(json_documents)
print(len(split_json_documents))
# 3. Embedding Text Using Langchain
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# 4. Creating Vector Store with Chroma DB
persist_directory = "/content/drive/MyDrive/Data Science/LLM/docs/chroma_json_db"
vectordb = Chroma.from_documents(
    documents=split_json_documents, embedding=embeddings, persist_directory=persist_directory
)
vectordb.persist()
# 5. Using OpenAI Large Language Models (LLM) with Chroma DB
import os
os.environ["OPENAI_API_KEY"] = "sk-your-key"
model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=model_name)
# 6. Extracting Answers from Documents
chain = load_qa_chain(llm, chain_type="stuff", verbose=True)
query = "who is Mr. Jabez Wilson?"
matching_docs = vectordb.similarity_search(query)
answer = chain.run(input_documents=matching_docs, question=query)
answer
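I suspect the DirectoryLoader call is where this breaks: JSONLoader requires a jq_schema argument, and DirectoryLoader does not supply one by default. Here is a sketch of what I think the fix looks like, reusing the imports and directory above and forwarding the arguments via loader_kwargs; the jq_schema and text_content values are the same assumptions as before:
# Assumed fix: forward JSONLoader's required arguments through DirectoryLoader
loader = DirectoryLoader(
    directory,
    glob='**/*.json',
    loader_cls=JSONLoader,
    loader_kwargs={'jq_schema': '.', 'text_content': False},  # assumed values
)
json_documents = loader.load()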
More details here: https://stackoverflow.com/questions/77031060/query-bot-on-multiple-json-files-on-langchain