I am extracting text from pdf documents and load it to Azure Cognitive Search for a RAG approach. Unfortunately this does not work. I am receiving the error message
AttributeError: 'str' object has no attribute 'page_content'
What I want to do is
vectors
and `filename``This is my code:
!pip install cohere tiktoken
!pip install openai==0.28.1
!pip install pymupdf
!pip install azure-storage-blob azure-identity
!pip install azure-search-documents --pre --upgrade
!pip install langchain
import fitz
import time
import uuid
import os
import openai
from PIL import Image
from io import BytesIO
from IPython.display import display
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import AzureChatOpenAI
from langchain.vectorstores import AzureSearch
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import TokenTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from google.colab import drive
OPENAI_API_BASE = "https://xxx.openai.azure.com"
OPENAI_API_KEY = "xxx"
OPENAI_API_VERSION = "2023-05-15"
openai.api_type = "azure"
openai.api_key = OPENAI_API_KEY
openai.api_base = OPENAI_API_BASE
openai.api_version = OPENAI_API_VERSION
AZURE_COGNITIVE_SEARCH_SERVICE_NAME = "https://xxx.search.windows.net"
AZURE_COGNITIVE_SEARCH_API_KEY = "xxx"
AZURE_COGNITIVE_SEARCH_INDEX_NAME = "test"
llm = AzureChatOpenAI(deployment_name="gpt35", openai_api_key=OPENAI_API_KEY, openai_api_base=OPENAI_API_BASE, openai_api_version=OPENAI_API_VERSION)
embeddings = OpenAIEmbeddings(deployment_id="ada002", chunk_size=1, openai_api_key=OPENAI_API_KEY, openai_api_base=OPENAI_API_BASE, openai_api_version=OPENAI_API_VERSION)
acs = AzureSearch(azure_search_endpoint=AZURE_COGNITIVE_SEARCH_SERVICE_NAME,
azure_search_key = AZURE_COGNITIVE_SEARCH_API_KEY,
index_name = AZURE_COGNITIVE_SEARCH_INDEX_NAME,
embedding_function = embeddings.embed_query)
def generate_tokens(s):
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
splits = text_splitter.split_text(s)
return splits
drive.mount('/content/drive')
folder = "/content/drive/.../pdf/"
page_content = ''
doc_content = ''
for filename in os.listdir(folder):
file_path = os.path.join(folder, filename)
if os.path.isfile(file_path):
print(f"Processing file: {file_path}")
doc = fitz.open(file_path)
for page in doc: # iterate the document pages
page_content += page.get_text() # get plain text encoded as UTF-8
doc_content += page_content
d = generate_tokens(doc_content)
# the following line throws the error
# how can i add the chunks + filename to
# Azure Cognitive Search?
acs.add_documents(documents=d)
print(metadatas)
print("----------")
print(doc_content)
count = len(doc_content.split())
print("Number of tokens: ", count)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-33-d9eaff7ee027> in <cell line: 10>()
31 all_texts.extend(d)
32
---> 33 acs.add_documents(documents=d)
34
35 metadatas = [{"Source": f"{i}-pl"} for i in range(len(all_texts))]
1 frames
/usr/local/lib/python3.10/dist-packages/langchain/schema/vectorstore.py in <listcomp>(.0)
118 """
119 # TODO: Handle the case where the user doesn't provide ids on the Collection
--> 120 texts = [doc.page_content for doc in documents]
121 metadatas = [doc.metadata for doc in documents]
122 return self.add_texts(texts, metadatas, **kwargs)
AttributeError: 'str' object has no attribute 'page_content'
I believe the issue is in your generate_tokens
method. Instead of returning a list of documents, it is returning a list of string. Please see the documentation for split_text
here: https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.RecursiveCharacterTextSplitter.html#langchain.text_splitter.RecursiveCharacterTextSplitter.split_text.
I believe the fix should be to convert this list of string into a list of Document
objects. Can you please try the following:
def generate_tokens(s):
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
splits = text_splitter.split_text(s)
return text_splitter.create_documents(splits) #this should return the list of documents.