I am trying to load a number of PDF files and query them using the OpenAI API.
from langchain.text_splitter import CharacterTextSplitter
#from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import OpenAIEmbeddings
import pickle
import os
print("Loading data...")

# Folder that holds the PDF files to be indexed.
pdf_folder_path = "content/"
print(os.listdir(pdf_folder_path))

# Load multiple files: one UnstructuredPDFLoader per file in the folder.
loaders = [
    UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn))
    for fn in os.listdir(pdf_folder_path)
]
print(loaders)

alldocument = []
vectorstore = None

for loader in loaders:
    print("Loading raw document..." + loader.file_path)
    raw_documents = loader.load()

    print("Splitting text...")
    text_splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=800,
        chunk_overlap=100,
        length_function=len,
    )
    documents = text_splitter.split_documents(raw_documents)
    #alldocument = alldocument + documents

    # NOTE: this rebinds `vectorstore` on every pass, so after the loop it
    # only holds an index over the LAST file's chunks (the bug the question
    # below is asking about).
    print("Creating vectorstore...")
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(documents, embeddings)

#with open("vectorstore.pkl", "wb") as f:
# "ab" appends a new pickle record on each run instead of replacing the file.
with open("vectorstore.pkl", "ab") as f:
    pickle.dump(vectorstore, f)
    f.close()
I am trying to load multiple files for Q&A, but the index only remembers the last file loaded from the folder.
Do I need to change the structure of the for loop, or pass another parameter to the open() call?
The problem is that on each iteration of the loop you overwrite the previous vectorstore when you create a new one, so when you save to "vectorstore.pkl" you are only saving the vectorstore built from the last file.
print("Loading data...")

# Folder that holds the PDF files to be indexed.
pdf_folder_path = "content/"
print(os.listdir(pdf_folder_path))

# Load multiple files: one loader per PDF. Filter on the .pdf suffix so a
# stray non-PDF file in the folder does not break UnstructuredPDFLoader.
loaders = [
    UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn))
    for fn in os.listdir(pdf_folder_path)
    if fn.lower().endswith(".pdf")
]
print(loaders)

# The splitter's configuration never changes, so create it once instead of
# rebuilding it on every loop iteration.
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=800,
    chunk_overlap=100,
    length_function=len,
)

# Accumulate the chunks from EVERY file, then build a single vectorstore
# over the combined list — this is what fixes "only the last file is
# remembered": nothing is overwritten inside the loop.
all_documents = []
for loader in loaders:
    print("Loading raw document..." + loader.file_path)
    raw_documents = loader.load()
    print("Splitting text...")
    all_documents.extend(text_splitter.split_documents(raw_documents))

print("Creating vectorstore...")
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(all_documents, embeddings)

# "wb" truncates any previous file, so the pickle holds exactly one
# vectorstore covering all documents (unlike "ab", which appends).
with open("vectorstore.pkl", "wb") as f:
    pickle.dump(vectorstore, f)