python azure-cognitive-services azure-cognitive-search langchain azure-openai

Why am I receiving "AttributeError: 'str' object has no attribute 'page_content'" when trying to add my embeddings to Azure Cognitive Search?

I am extracting text from PDF documents and loading it into Azure Cognitive Search for a RAG approach. Unfortunately, this does not work. I am receiving the following error message:

AttributeError: 'str' object has no attribute 'page_content'

What I want to do is

Extract text from pdf via pymupdf - works
Upload it to Azure Vector Search as embeddings with vectors and `filename`
Query this through ChatGPT model

This is my code:

!pip install cohere tiktoken
!pip install openai==0.28.1
!pip install pymupdf
!pip install azure-storage-blob azure-identity
!pip install azure-search-documents --pre --upgrade
!pip install langchain

import fitz
import time
import uuid
import os
import openai

from PIL import Image
from io import BytesIO
from IPython.display import display

from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient

from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chat_models import AzureChatOpenAI
from langchain.vectorstores import AzureSearch
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import TokenTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate

from google.colab import drive

# Azure OpenAI connection settings (the "xxx" values are placeholders).
OPENAI_API_BASE = "https://xxx.openai.azure.com"
OPENAI_API_KEY = "xxx"
OPENAI_API_VERSION = "2023-05-15"

# Configure the module-level `openai` settings for the Azure API type.
openai.api_type = "azure"
openai.api_key = OPENAI_API_KEY
openai.api_base = OPENAI_API_BASE
openai.api_version = OPENAI_API_VERSION

# Azure Cognitive Search settings. Despite its name, the *_SERVICE_NAME
# constant holds a full endpoint URL; it is passed as `azure_search_endpoint`
# when the vector store is built below.
AZURE_COGNITIVE_SEARCH_SERVICE_NAME = "https://xxx.search.windows.net"
AZURE_COGNITIVE_SEARCH_API_KEY = "xxx"
AZURE_COGNITIVE_SEARCH_INDEX_NAME = "test"

# Chat model and embedding model, both addressed by their deployment names.
# NOTE(review): `llm` is not used anywhere in the visible code.
llm = AzureChatOpenAI(deployment_name="gpt35", openai_api_key=OPENAI_API_KEY, openai_api_base=OPENAI_API_BASE, openai_api_version=OPENAI_API_VERSION)
embeddings = OpenAIEmbeddings(deployment_id="ada002", chunk_size=1, openai_api_key=OPENAI_API_KEY, openai_api_base=OPENAI_API_BASE, openai_api_version=OPENAI_API_VERSION)

# Vector store backed by the search index; `embeddings.embed_query` is the
# callable it is given for producing vectors.
acs = AzureSearch(azure_search_endpoint=AZURE_COGNITIVE_SEARCH_SERVICE_NAME,
                  azure_search_key = AZURE_COGNITIVE_SEARCH_API_KEY,
                  index_name = AZURE_COGNITIVE_SEARCH_INDEX_NAME,
                  embedding_function = embeddings.embed_query)
    
def generate_tokens(s):
  """Split ``s`` into overlapping chunks and return them as plain strings.

  The chunks come straight from ``split_text`` (chunk size 1000, overlap 100);
  they are ``str`` values, not ``Document`` objects.
  """
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
  return splitter.split_text(s)

# Mount Google Drive so the PDF files under `folder` can be read.
drive.mount('/content/drive')
folder = "/content/drive/.../pdf/"

# Text accumulators.
# NOTE(review): both are initialised once, outside the file loop, and are
# never reset, so text carries over from page to page and from file to file.
page_content = ''
doc_content = ''
    
# Walk every regular file in `folder` and push its text to the search index.
for filename in os.listdir(folder):
    file_path = os.path.join(folder, filename)
    if os.path.isfile(file_path):
        print(f"Processing file: {file_path}")

        doc = fitz.open(file_path)
        for page in doc: # iterate the document pages
          # NOTE(review): `page_content` already holds all pages read so far
          # and is appended to `doc_content` in full on every iteration, so
          # earlier pages end up repeated inside `doc_content`.
          page_content += page.get_text() # get plain text encoded as UTF-8
          doc_content += page_content

          # `d` is whatever `generate_tokens` returns; the traceback below
          # shows its elements are `str` objects.
          d = generate_tokens(doc_content)

          # the following line throws the error
          # how can i add the chunks + filename to 
          # Azure Cognitive Search?
          # NOTE(review): this call sits inside the page loop, so it runs once
          # per page with the cumulative text rather than once per document.

          acs.add_documents(documents=d)
    
        # NOTE(review): `metadatas` is not defined anywhere in the visible
        # code; the traceback suggests the executed cell differed from this
        # snippet. Confirm before running.
        print(metadatas)
        print("----------")
        print(doc_content)
        # Counts whitespace-separated words, although the label says "tokens".
        count = len(doc_content.split())
        print("Number of tokens: ", count)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-33-d9eaff7ee027> in <cell line: 10>()
     31           all_texts.extend(d)
     32 
---> 33           acs.add_documents(documents=d)
     34 
     35           metadatas = [{"Source": f"{i}-pl"} for i in range(len(all_texts))]

1 frames
/usr/local/lib/python3.10/dist-packages/langchain/schema/vectorstore.py in <listcomp>(.0)
    118         """
    119         # TODO: Handle the case where the user doesn't provide ids on the Collection
--> 120         texts = [doc.page_content for doc in documents]
    121         metadatas = [doc.metadata for doc in documents]
    122         return self.add_texts(texts, metadatas, **kwargs)

AttributeError: 'str' object has no attribute 'page_content'

Solution

I believe the issue is in your generate_tokens method. Instead of returning a list of documents, it returns a list of strings, while add_documents expects Document objects (it reads doc.page_content from each item). Please see the documentation for split_text here: https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.RecursiveCharacterTextSplitter.html#langchain.text_splitter.RecursiveCharacterTextSplitter.split_text.

I believe the fix is to convert this list of strings into a list of Document objects. Can you please try the following:

def generate_tokens(s, metadata=None):
  """Split ``s`` into chunks and wrap each chunk in a LangChain ``Document``.

  Args:
    s: Raw text to split.
    metadata: Optional dict attached to every produced document, for example
      ``{"source": filename}``. When omitted, no metadata is passed on.

  Returns:
    A list of ``Document`` objects, which is what ``add_documents`` expects.
  """
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
  # create_documents() splits every input text itself, so hand it the raw
  # text; passing the output of split_text() would split everything twice.
  metadatas = [metadata] if metadata is not None else None
  return text_splitter.create_documents([s], metadatas=metadatas)