I'm trying to create a vector DB which will be populated with embeddings of articles from my employer's blog.
I've got a Milvus instance up and running and am able to follow the walkthrough on the Langchain website.
Based on the walkthrough, my implementation so far looks something like this:
def parseWPDataFile(filename):
    """Parse one WP data file into its header metadata and body text.

    Returns a dict with two keys: 'meta' (the parsed headers) and 'body'
    (the document body with leading/trailing whitespace stripped).
    """
    # redacted for brevity -- the omitted code is expected to define
    # `parsed_headers` and `doc_body` from the contents of `filename`
    meta = parsed_headers
    body = doc_body.strip()
    return {'meta': meta, 'body': body}
# Parse the sample article into header metadata ('meta') and body text ('body').
parsed_doc = parseWPDataFile('sample_data.txt')

# Split the body on runs of newlines (the separator is treated as a regex),
# with chunk_size=5000 measured by len().
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n+'],
    is_separator_regex=True,
    chunk_size=5000,
    length_function=len,
)

# One source text with one metadata dict; every resulting chunk document
# is created from that pair.
body_texts = [parsed_doc['body']]
body_metadatas = [parsed_doc['meta']]
docs = text_splitter.create_documents(body_texts, body_metadatas)

# Embed the chunks and hand them to the Milvus server at 127.0.0.1:19530.
embeddings = OpenAIEmbeddings()
milvus_connection = {"host": "127.0.0.1", "port": "19530"}
vector_db = Milvus.from_documents(docs, embeddings, connection_args=milvus_connection)
This being my first time using a vector database, I'm a little confused by that last line. The documentation for `Milvus.from_documents`
indicates that it creates a vectorstore from documents, which I assume means in memory. What I want is a persistent vectorstore that I can load data into and then, later, query from a separate script. I can't find any LangChain examples of this.
How do I create a persistent VectorStore, add to it, and get a reference to it later, in another script?
I used the Milvus library (pymilvus) directly to achieve what I wanted.
First create the DB...
import sys
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType, MilvusClient
# This setup script is meant to run once. If the "ot_chatbot" collection
# already exists, print some info about it and stop so the setup is not
# repeated. On a fresh database the check is False and execution falls
# through to the code that creates the collection (previously the script
# always exited here, and describe_collection was called even when the
# collection did not exist yet).
client = MilvusClient(uri="http://localhost:19530")
if client.has_collection(collection_name="ot_chatbot"):
    res = client.describe_collection(collection_name="ot_chatbot")
    print(res)
    sys.exit()
# Connect to the local Milvus server.
connections.connect(host="localhost", port="19530")

# Fields of the collection, in schema order.
collection_fields = [
    # primary id key
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
    # text content that the embedding was computed from
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=50000),
    # vector field holding the embedding; 1536 dimensions for the
    # "text-embedding-ada-002" embedding model
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=1536),
    # short label for where the embedded text came from
    FieldSchema(name="source", dtype=DataType.VARCHAR, max_length=20),
    # URL pointing to the source of the data, if any
    FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=250),
    # title of the source (e.g. of the WordPress article), if any
    FieldSchema(name="source_title", dtype=DataType.VARCHAR, max_length=250),
]

# Collection schema: fixed set of fields, ids assigned automatically.
schema = CollectionSchema(
    fields=collection_fields,
    description="OT Chatbot",
    enable_dynamic_field=False,
    auto_id=True,
)

# Define the collection on the 'default' connection.
collection = Collection(name="ot_chatbot", schema=schema, using='default')

# Index the vector field to speed up queries.
vector_index_params = {
    "metric_type": "IP",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 16384},
}
collection.create_index(field_name="embeddings", index_params=vector_index_params)
Then I was able to insert data into the DB like this...
from pymilvus import connections, MilvusClient
import os
import sys
import constants
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
# Client for the local Milvus server; used for the inserts further down.
client = MilvusClient(uri="http://localhost:19530")

# Expose the OpenAI key from the local `constants` module through the
# environment before the embedding client is constructed.
os.environ["OPENAI_API_KEY"] = constants.APIKEY
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
# Parses the txt documents in wp-data
def parseWPDataFile(filename):
    """Parse one wp-data text file into header metadata and body text.

    Returns a dict with two keys: 'meta' (the parsed headers) and 'body'
    (the document body with leading/trailing whitespace stripped).
    """
    # redacted for brevity -- the omitted code is expected to define
    # `parsed_headers` and `doc_body` from the contents of `filename`
    headers = parsed_headers
    stripped_body = doc_body.strip()
    return {'meta': headers, 'body': stripped_body}
# Parse one article and split its body on runs of newlines (regex separator).
parsed_doc = parseWPDataFile('sample_data.txt')
# NOTE(review): chunk_size is measured in characters (length_function=len);
# confirm that chunks with non-ASCII text still fit the collection's "text"
# field limit.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=True,
    separators=['\n+'],
    chunk_size=50000,
    length_function=len,
)
docs = text_splitter.create_documents([parsed_doc['body']], [parsed_doc['meta']])

# Embed all chunks with one batched call instead of one request per chunk.
# An empty body yields no chunks, in which case nothing is embedded.
texts = [doc.page_content for doc in docs]
vectors = embeddings.embed_documents(texts) if texts else []

rows = []
for doc, vector in zip(docs, vectors):
    print(vector[:5])  # quick sanity check of the embedding values
    meta = doc.metadata
    rows.append({
        "text": doc.page_content,
        "embeddings": vector,
        "source": "wp_site",
        # URL and title are optional ("if any"), so fall back to an empty
        # string rather than raising KeyError when a header is missing.
        "url": meta.get('Post URL', ''),
        "source_title": meta.get('Post title', ''),
    })

# Insert every chunk of the article in a single request; skip the call
# entirely when there is nothing to insert.
if rows:
    res = client.insert(collection_name="ot_chatbot", data=rows)
    print(res)