llama-index, weaviate, retrieval-augmented-generation

Unable to get expected results using BM25 or any search functions in Weaviate


I created a collection in Weaviate and ingested some documents into the Weaviate database using LlamaIndex. When I used the default search, it retrieved the wrong documents every time. I then tested BM25 search, and it gave high scores to other documents even though I had copied the entire phrase from the expected document.

Server Setup Information

Document Preparation

Document of interest: the article downloaded from https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1627185 as a PDF and stored locally. I have 20 other documents that are ingested together with it for retrieval testing.

Python Setup Information

Imports

# Weaviate
import weaviate
from weaviate.classes.config import Configure, VectorDistances, Property, DataType
from weaviate.util import generate_uuid5
from weaviate.classes.query import MetadataQuery

# LlamaIndex
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings, StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core.node_parser import SentenceSplitter

Creating an Index with Weaviate Database

# Creating a Weaviate collection
def create_collection(client, collection_name):
  client.collections.create(
    collection_name,
    vectorizer_config=Configure.Vectorizer.text2vec_transformers(),
    vector_index_config=Configure.VectorIndex.hnsw(distance_metric=VectorDistances.COSINE),
    reranker_config=Configure.Reranker.transformers(),
    inverted_index_config=Configure.inverted_index(
      bm25_b=0.7,
      bm25_k1=1.25,
      index_null_state=True,
      index_property_length=True,
      index_timestamps=True
    ),
  )
 
# Create index using LlamaIndex
def create_weaviate_index(client, index_name, doc_folder):
  create_collection(client, index_name)
  vector_store = WeaviateVectorStore(weaviate_client=client, index_name=index_name, text_key="content")
  storage_context = StorageContext.from_defaults(vector_store=vector_store)
  index = VectorStoreIndex.from_documents([], storage_context=storage_context)
  documents = SimpleDirectoryReader(input_dir=doc_folder).load_data()
  node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
  nodes = node_parser.get_nodes_from_documents(documents)
  index.insert_nodes(nodes)
  return index

client = weaviate.connect_to_local()
index_name = "LlamaIndex"
doc_folder = "/path/to/doc_folder"
index = create_weaviate_index(client, index_name, doc_folder)
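
As a sanity check after ingestion, it can help to confirm the chunk count and that the copied phrase actually landed in a stored chunk. A minimal sketch using the v4 client's aggregate and iterator APIs (the phrase below is just the partial string used in the queries later):

collection = client.collections.get(index_name)
total = collection.aggregate.over_all(total_count=True).total_count
print(f"Ingested {total} chunks")

phrase = "EMA was created in 2001"  # partial string from the document of interest
hits = [
  obj for obj in collection.iterator()
  if phrase.lower() in str(obj.properties.get("content", "")).lower()
]
print(f"{len(hits)} chunk(s) contain the phrase verbatim")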

Querying the documents

Using LlamaIndex

query_engine = index.as_query_engine()
question = "EMA was created in 2001 to?" # Took partial string from document
response = query_engine.query(question)
print(response)

for node in response.source_nodes:
  print(node.metadata) # Did not retrieve the document that I copied the string from
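
To see what the default retriever actually ranks, rather than only the synthesized answer, a retriever-only call exposes the similarity scores. A short sketch against the same index (similarity_top_k=5 is an arbitrary choice):

retriever = index.as_retriever(similarity_top_k=5)
for node_with_score in retriever.retrieve(question):
  print(node_with_score.score, node_with_score.node.metadata.get("file_name"))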

Using Weaviate hybrid search, alpha set to 0

collection = client.collections.get("LlamaIndex")
question = "EMA was created in 2001 to?" # Took partial string 
query_vector = embed_model.get_query_embedding(question)  # embed_model: the same LlamaIndex embedding model used at ingestion

response = collection.query.hybrid(
  query=question,
  vector=query_vector,
  limit=5,
  alpha=0,

  return_metadata=MetadataQuery(
    distance=True,
    certainty=True,
    score=True,
    explain_score=True
  )
)

for obj in response.objects:
  print(f"METADATA: {obj.metadata}") # Did not retrieve the document that I copied the string from

Using Weaviate bm25 search

collection = client.collections.get("LlamaIndex")
question = "EMA was created in 2001 to?" # Took partial string 
response = collection.query.bm25(
  query=question,
  limit=5,

  return_metadata=MetadataQuery(
    distance=True,
    certainty=True,
    score=True,
    explain_score=True
  )
)

for obj in response.objects:
  print(f"METADATA: {obj.metadata}") # Did not retrieve the document that I copied the string from

Using Weaviate near_text search

collection = client.collections.get("LlamaIndex")
question = "EMA was created in 2001 to?" # Took partial string 
response = collection.query.near_text(
  query=question,
  limit=5,

  return_metadata=MetadataQuery(
    distance=True,
    certainty=True,
    score=True,
    explain_score=True
  )
)

for obj in response.objects:
  print(f"METADATA: {obj.metadata}") # Did not retrieve the document that I copied the string from

Solution

  • Hi! Duda from Weaviate here :)

    I have put together some code based on yours that may help you.

    I am not sure which vectorizer you are using. This example will use OpenAI.

    We have some recipes on Ollama here: https://github.com/weaviate/recipes/tree/main/integrations/llm-frameworks/llamaindex

    PS: I used two PDF files located here, but any PDFs under the pdfs folder should also work:

    #!pip3 install -U weaviate-client llama_index llama-index-readers-file llama-index-embeddings-openai
    
    # Weaviate
    import weaviate
    from weaviate.classes.config import Configure, VectorDistances, Property, DataType
    from weaviate.util import generate_uuid5
    from weaviate.classes.query import MetadataQuery
    
    # LlamaIndex
    from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings, StorageContext
    from llama_index.vector_stores.weaviate import WeaviateVectorStore
    from llama_index.core.node_parser import SentenceSplitter
    
    from llama_index.embeddings.openai import OpenAIEmbedding
    from llama_index.core import Settings
    
    import os
    import openai
    
    #os.environ["OPENAI_API_KEY"] = ""
    openai.api_key = os.environ["OPENAI_API_KEY"]
    
    # use the same model that the collection's vectorizer is configured with below
    embed_model = OpenAIEmbedding(model="text-embedding-3-small", embed_batch_size=10)
    Settings.embed_model = embed_model
    
    
    # let's test out the LlamaIndex embedding model (OpenAIEmbedding is already imported above)
    
    embed_model = OpenAIEmbedding(model="text-embedding-3-small")
    
    embeddings = embed_model.get_text_embedding(
        "Open AI new Embeddings models is great."
    )
    
    print(embeddings[:5])
    
    
    # Creating a Weaviate collection
    def create_collection(client, collection_name):
        client.collections.create(
            collection_name,
            generative_config=Configure.Generative.openai(),
            vectorizer_config=Configure.Vectorizer.text2vec_openai(model="text-embedding-3-small"),
            vector_index_config=Configure.VectorIndex.hnsw(distance_metric=VectorDistances.COSINE),
            reranker_config=Configure.Reranker.transformers(),
            inverted_index_config=Configure.inverted_index(
                bm25_b=0.7,
                bm25_k1=1.25,
                index_null_state=True,
                index_property_length=True,
                index_timestamps=True
            ),
        )
     
    # Create index using LlamaIndex
    def create_weaviate_index(client, index_name, doc_folder):
        create_collection(client, index_name)
        vector_store = WeaviateVectorStore(weaviate_client=client, index_name=index_name, text_key="content")
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents([], storage_context=storage_context)
        documents = SimpleDirectoryReader(input_dir=doc_folder).load_data()
        node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
    
    
        nodes = node_parser.get_nodes_from_documents(
            documents, show_progress=False
        )    
        index.insert_nodes(nodes)
        return index
    
    client = weaviate.connect_to_local()
    index_name = "LlamaIndex"
    #
    # WARNING THIS WILL DELETE IF EXISTS
    #
    client.collections.delete(index_name)
    doc_folder = "./pdfs"
    create_weaviate_index(client, index_name, doc_folder)
    
    # sanity check: fetch one stored object with its vector to confirm ingestion
    collection = client.collections.get("LlamaIndex")
    collection.query.fetch_objects(include_vector=True, limit=1).objects[0].vector
    
    # querying Weaviate directly
    collection = client.collections.get("LlamaIndex")
    for obj in collection.query.bm25("food").objects:
        print(obj.properties)
    
    vector_store = WeaviateVectorStore(weaviate_client=client, index_name=index_name, text_key="content")
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents([], storage_context=storage_context)
    query_engine = index.as_query_engine()
    
    #filtering
    from llama_index.core.vector_stores import (
        MetadataFilter,
        MetadataFilters,
        FilterOperator,
    )
    
    filters = MetadataFilters(
        filters=[
            MetadataFilter(key="file_name", operator=FilterOperator.EQ, value="brazil"),
        ]
    )
    
    retriever = index.as_retriever(filters=filters)
    retriever.retrieve("What is the traditional food of this country?")
    
    # generating an answer
    from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
    from IPython.display import Markdown, display
    filters = MetadataFilters(
        filters=[ExactMatchFilter(key="file_name", value="netherlands")]
    )
    query_engine = index.as_query_engine(filters=filters)
    response = query_engine.query("What is the food of this country?")
    print("{response}")