I have created a collection in Weaviate and ingested some documents into the Weaviate database using LlamaIndex. When I used the default search, I found that it consistently retrieved the wrong documents. After that, I tested BM25 search, and it gave high scores to other documents, even though I had copied the entire query phrase from the expected document.
Document of interest: downloaded the article from https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1627185 as a PDF and stored it locally. I have 20 other documents that are ingested together with it for retrieval testing.
Imports
# Weaviate
import weaviate
from weaviate.classes.config import Configure, VectorDistances, Property, DataType
from weaviate.util import generate_uuid5
from weaviate.query import MetadataQuery
# LlamaIndex
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings, StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core.node_parser import SentenceSplitter
Creating an Index with Weaviate Database
# Creating a Weaviate collection
def create_collection(client, collection_name):
    """Create a Weaviate collection configured for transformer vectorization.

    Sets up: text2vec-transformers vectorizer, HNSW vector index with cosine
    distance, a transformers reranker, and a BM25-tuned inverted index.

    Args:
        client: Connected weaviate.WeaviateClient (v4).
        collection_name: Name of the collection to create.
    """
    client.collections.create(
        collection_name,
        vectorizer_config=Configure.Vectorizer.text2vec_transformers(),
        # BUG FIX: the original was missing the trailing comma after this
        # argument, which made the whole snippet a syntax error.
        vector_index_config=Configure.VectorIndex.hnsw(distance_metric=VectorDistances.COSINE),
        reranker_config=Configure.Reranker.transformers(),
        inverted_index_config=Configure.inverted_index(
            bm25_b=0.7,
            bm25_k1=1.25,
            index_null_state=True,
            index_property_length=True,
            index_timestamps=True,
        ),
    )
# Create index using LlamaIndex
def create_weaviate_index(client, index_name, doc_folder):
    """Create a Weaviate collection, load PDFs from doc_folder, chunk them,
    and insert the chunks into a Weaviate-backed LlamaIndex index.

    Args:
        client: Connected weaviate.WeaviateClient (v4).
        index_name: Collection/index name to create and populate.
        doc_folder: Directory containing the documents to ingest.

    Returns:
        The populated VectorStoreIndex.
    """
    create_collection(client, index_name)
    vector_store = WeaviateVectorStore(weaviate_client=client, index_name=index_name, text_key="content")
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents([], storage_context=storage_context)
    # BUG FIX: SimpleDirectoryReader(...) only constructs the reader; without
    # .load_data() no Document objects are produced.
    documents = SimpleDirectoryReader(input_dir=doc_folder).load_data()
    # BUG FIX: node_parser was referenced but never defined in the original.
    node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
    nodes = node_parser.get_nodes_from_documents(documents)
    index.insert_nodes(nodes)
    return index
client = weaviate.connect_to_local()
index_name = "LlamaIndex"
doc_folder = "/path/to/doc_folder"
# BUG FIX: capture the returned index — the query code below references
# `index`, which the original never assigned.
index = create_weaviate_index(client, index_name, doc_folder)
Using LlamaIndex
# NOTE(review): `index` must hold the return value of create_weaviate_index;
# make sure the ingestion call above assigns it.
query_engine = index.as_query_engine()
question = "EMA was created in 2001 to?"  # Took partial string from document
response = query_engine.query(question)
print(response)
# Inspect which source chunks the answer was synthesized from.
for node in response.source_nodes:
    print(node.metadata)  # Did not retrieve the document that I copied the string from
Using Weaviate hybrid search, alpha set to 0
collection = client.collections.get("LlamaIndex")
question = "EMA was created in 2001 to?"  # Took partial string
# NOTE(review): `embed_model` is not defined anywhere in this snippet — it
# must be a query-embedding model matching the collection's vectorizer.
query_vector = embed_model.get_query_embedding(question)
response = collection.query.hybrid(
    query=question,
    # BUG FIX: the original was missing the comma after this argument,
    # which made the call a syntax error.
    vector=query_vector,
    limit=5,
    alpha=0,  # alpha=0 => pure keyword (BM25) scoring in the hybrid fusion
    return_metadata=MetadataQuery(
        distance=True,
        certainty=True,
        score=True,
        explain_score=True,
    ),
)
for obj in response.objects:
    print(f"METADATA: {obj.metadata}")  # Did not retrieve the document that I copied the string from
Using Weaviate bm25 search
# Pure keyword (BM25) search against the same collection.
collection = client.collections.get("LlamaIndex")
question = "EMA was created in 2001 to?"  # Took partial string
metadata_request = MetadataQuery(
    distance=True,
    certainty=True,
    score=True,
    explain_score=True,
)
response = collection.query.bm25(
    query=question,
    limit=5,
    return_metadata=metadata_request,
)
for obj in response.objects:
    print(f"METADATA: {obj.metadata}")  # Did not retrieve the document that I copied the string from
Using Weaviate near_text search
# Pure vector (near_text) search; requires the collection's vectorizer.
collection = client.collections.get("LlamaIndex")
question = "EMA was created in 2001 to?"  # Took partial string
metadata_request = MetadataQuery(
    distance=True,
    certainty=True,
    score=True,
    explain_score=True,
)
response = collection.query.near_text(
    query=question,
    limit=5,
    return_metadata=metadata_request,
)
for obj in response.objects:
    print(f"METADATA: {obj.metadata}")  # Did not retrieve the document that I copied the string from
Hi! Duda from Weaviate here :)
I have put together some code based on yours that maybe can help you.
I am not sure what vectorizer you are using. This example will use OpenAI.
We have some recipes on ollama here: https://github.com/weaviate/recipes/tree/main/integrations/llm-frameworks/llamaindex
ps: I have used two PDF files located here, but any PDFs placed under the pdfs
folder should also work:
#!pip3 install -U weaviate-client llama_index llama-index-readers-file llama-index-embeddings-openai
# Weaviate
import weaviate
from weaviate.classes.config import Configure, VectorDistances, Property, DataType
from weaviate.util import generate_uuid5
from weaviate.classes.query import MetadataQuery
# LlamaIndex
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings, StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
import os
import openai
#os.environ["OPENAI_API_KEY"] = ""
openai.api_key = os.environ["OPENAI_API_KEY"]
# BUG FIX: pin the embedding model to the same model the collection's
# vectorizer is configured with (text-embedding-3-small). The original first
# registered a default-model embedder on Settings and then created a second,
# differently-configured embedder — query vectors and stored vectors could
# come from different embedding spaces.
embed_model = OpenAIEmbedding(model="text-embedding-3-small", embed_batch_size=10)
Settings.embed_model = embed_model
# Smoke-test the LlamaIndex embedding model.
embeddings = embed_model.get_text_embedding(
    "Open AI new Embeddings models is great."
)
print(embeddings[:5])
# Creating a Weaviate collection
def create_collection(client, collection_name):
    """Create a collection with OpenAI generation and vectorization,
    a cosine HNSW vector index, a transformers reranker, and BM25-tuned
    inverted-index settings."""
    keyword_index = Configure.inverted_index(
        bm25_b=0.7,
        bm25_k1=1.25,
        index_null_state=True,
        index_property_length=True,
        index_timestamps=True,
    )
    client.collections.create(
        collection_name,
        generative_config=Configure.Generative.openai(),
        vectorizer_config=Configure.Vectorizer.text2vec_openai(model="text-embedding-3-small"),
        vector_index_config=Configure.VectorIndex.hnsw(distance_metric=VectorDistances.COSINE),
        reranker_config=Configure.Reranker.transformers(),
        inverted_index_config=keyword_index,
    )
# Create index using LlamaIndex
def create_weaviate_index(client, index_name, doc_folder):
    """Create the collection, ingest every document under doc_folder as
    sentence-split chunks, and return the resulting VectorStoreIndex."""
    create_collection(client, index_name)
    store = WeaviateVectorStore(weaviate_client=client, index_name=index_name, text_key="content")
    context = StorageContext.from_defaults(vector_store=store)
    index = VectorStoreIndex.from_documents([], storage_context=context)
    docs = SimpleDirectoryReader(input_dir=doc_folder).load_data()
    splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
    chunks = splitter.get_nodes_from_documents(docs, show_progress=False)
    index.insert_nodes(chunks)
    return index
client = weaviate.connect_to_local()

index_name = "LlamaIndex"
doc_folder = "./pdfs"

# WARNING: this deletes any existing collection with this name before rebuilding.
client.collections.delete(index_name)
create_weaviate_index(client, index_name, doc_folder)
# querying — sanity-check that one stored object carries a vector
collection = client.collections.get("LlamaIndex")
collection.query.fetch_objects(include_vector=True, limit=1).objects[0].vector

# querying Weaviate directly with BM25
collections = client.collections.get("LlamaIndex")
# FIX: loop variable renamed from `object`, which shadowed the builtin.
for obj in collections.query.bm25("food").objects:
    print(obj.properties)

# Rebuild a LlamaIndex handle on top of the already-populated collection.
vector_store = WeaviateVectorStore(weaviate_client=client, index_name=index_name, text_key="content")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents([], storage_context=storage_context)
query_engine = index.as_query_engine()
# filtering
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)

# Restrict retrieval to chunks whose file_name metadata equals "brazil".
filters = MetadataFilters(
    filters=[MetadataFilter(key="file_name", operator=FilterOperator.EQ, value="brazil")]
)
retriever = index.as_retriever(filters=filters)
retriever.retrieve("What is the traditional food of this country?")
# generating an answer
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
from IPython.display import Markdown, display

# Restrict the query engine to chunks from the "netherlands" document.
filters = MetadataFilters(
    filters=[ExactMatchFilter(key="file_name", value="netherlands")]
)
query_engine = index.as_query_engine(filters=filters)
response = query_engine.query("What is the food of this country?")
# BUG FIX: the original printed the literal text "{response}" — the string
# was missing the f-prefix.
print(f"{response}")