[SOLVED] How to create vector embeddings using SentenceTransformers?

How to create vector embeddings using SentenceTransformers?

I found this code: https://github.com/pixegami/langchain-rag-tutorial/blob/main/create_database.py

It takes a document, splits it into chunks, creates vector embeddings for each chunk, and saves them into a Chroma database. However, the source code uses an OpenAI key to create the embeddings. Since I don't have access to OpenAI, I tried to use a free alternative, SentenceTransformers, but I wasn't able to rewrite the code properly.

Here is my current attempt:

from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from sentence_transformers import SentenceTransformer
from langchain.vectorstores.chroma import Chroma
import os
import shutil

# Directory where the Chroma database is persisted (deleted and rebuilt on each run).
CHROMA_PATH = "chroma"
# Directory the source documents are loaded from.
DATA_PATH = "data/books"

# SentenceTransformers model whose `encode` method is later passed to Chroma.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def main() -> None:
    """Script entry point: build the Chroma data store."""
    generate_data_store()

def generate_data_store() -> None:
    """Load the documents, split them into chunks, and save the chunks to Chroma."""
    documents = load_documents()
    chunks = split_text(documents)
    save_to_chroma(chunks)

def load_documents():
    """Load the files under ``DATA_PATH`` that match ``*.md`` and return them."""
    return DirectoryLoader(DATA_PATH, glob="*.md").load()

def split_text(documents: list[Document]):
    """Split documents into overlapping chunks and print one sample chunk.

    Args:
        documents: Documents to split.

    Returns:
        The list of chunks produced by the text splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Show one chunk as a sanity check. Small inputs may yield fewer than 11
    # chunks, so fall back to the last one instead of raising IndexError.
    if chunks:
        document = chunks[min(10, len(chunks) - 1)]
        print(document.page_content)
        print(document.metadata)

    return chunks

def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma database at ``CHROMA_PATH`` from ``chunks``.

    Any existing directory at ``CHROMA_PATH`` is removed before the new
    database is created.
    """
    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents.
    # NOTE(review): `embedder.encode` is a bare callable, not an embeddings
    # object. Chroma calls `.embed_documents(texts)` on this argument, so this
    # call raises AttributeError as written.
    db = Chroma.from_documents(
        chunks, embedder.encode, persist_directory=CHROMA_PATH
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

This is the error that I get:

Traceback (most recent call last):
  File "/media/andrew/Simple Tom/Robotics/Crew_AI/langchain-rag-tutorial/create_database.py", line 56, in <module>
    main()
  File "/media/andrew/Simple Tom/Robotics/Crew_AI/langchain-rag-tutorial/create_database.py", line 15, in main
    generate_data_store()
  File "/media/andrew/Simple Tom/Robotics/Crew_AI/langchain-rag-tutorial/create_database.py", line 20, in generate_data_store
    save_to_chroma(chunks)
  File "/media/andrew/Simple Tom/Robotics/Crew_AI/langchain-rag-tutorial/create_database.py", line 49, in save_to_chroma
    db = Chroma.from_documents(
  File "/home/andrew/.local/lib/python3.10/site-packages/langchain_community/vectorstores/chroma.py", line 778, in from_documents
    return cls.from_texts(
  File "/home/andrew/.local/lib/python3.10/site-packages/langchain_community/vectorstores/chroma.py", line 736, in from_texts
    chroma_collection.add_texts(
  File "/home/andrew/.local/lib/python3.10/site-packages/langchain_community/vectorstores/chroma.py", line 275, in add_texts
    embeddings = self._embedding_function.embed_documents(texts)
AttributeError: 'function' object has no attribute 'embed_documents'

I'm not a programmer. If someone could point out whether it's at all possible to achieve this without an OpenAI key and, if so, where my mistake is, that'd be great.

Solution

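Define your embedding model with HuggingFaceEmbeddings. Chroma calls `embed_documents` on the embedding object it is given, so passing the bare `embedder.encode` function fails; `HuggingFaceEmbeddings` wraps the same SentenceTransformers model in an object that provides that method: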

from langchain_community.embeddings import HuggingFaceEmbeddings
# Wrap the SentenceTransformers model in LangChain's embeddings interface.
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

Then embed the chunks into the vector database:

    # Build the Chroma store, embedding each chunk with `embedder`.
    db = Chroma.from_documents(
        documents=chunks,
        embedding=embedder,
        persist_directory=CHROMA_PATH,
    )
    db.persist()