faiss 向量数据库的使用已经讲过了,今天看看 chroma 如何使用
存储向量数据,并持久化
chroma 的向量数据文件默认保存在当前项目目录下,我们也可以指定某个目录作为它的索引存储目录,如 chromadata
def save_documents_each_chroma(documents, index="finance_index", slice_size=100):
    """Split documents into chunks and store them in a persisted Chroma index.

    The documents are split with the module-level ``text_splitter``; chunks
    whose ``page_content`` is shorter than 50 characters are skipped. The
    remaining chunks are written in batches of ``slice_size`` to the
    directory ``"chromadata/" + index`` using the module-level ``embeddings``.

    Args:
        documents: Documents to split and store.
        index: Index name; the data is stored under ``chromadata/<index>``.
        slice_size: Number of chunks written per batch.

    Returns:
        None. The store handle is not handed back to the caller.
    """
    index = "chromadata/" + index
    print("documents:", str(len(documents)))
    docs = text_splitter.split_documents(documents)
    print("docs:", str(len(docs)))

    def _store_batch(batch):
        # Persist every batch right after it is written instead of only
        # persisting the last store handle at the very end.
        store = Chroma.from_documents(
            documents=batch,
            embedding=embeddings,
            persist_directory=index,
        )
        store.persist()

    len_num = 0
    tmp_list = []
    for doc in docs:
        if len(doc.page_content) < 50:
            print("当前内容过少跳过:", doc.page_content)
            continue
        len_num += 1
        tmp_list.append(doc)
        if len(tmp_list) >= slice_size:
            _store_batch(tmp_list)
            tmp_list = []
            print("当前第:", str(len_num))

    # Store whatever is left over after the last full batch. When no chunk
    # qualified at all there is nothing to write, and nothing to persist --
    # the original code crashed here because its store variable was unbound.
    if tmp_list:
        _store_batch(tmp_list)
    return None
这样就能把数据保存进去了,下面我们查询试试看
langchain提供的查询方式针对不同向量数据库都差不多,示例
def search_documents_by_chroma(index="finance_index", query="", limit=5):
    """Run a similarity search against a persisted Chroma index.

    Args:
        index: Persist directory of the index. ``save_documents_each_chroma``
            in this file stores its data under ``"chromadata/" + index``, so
            when ``index`` itself is not an existing directory but the
            prefixed path is, the prefixed path is opened instead.
        query: Text to search for.
        limit: Maximum number of results (passed as ``k``).

    Returns:
        Whatever ``Chroma.similarity_search`` returns for the query.
    """
    import os

    persist_dir = index
    prefixed_dir = "chromadata/" + index
    # Stay backward compatible: a path that already exists is used as given;
    # only fall back to the save function's location when it does not.
    if not os.path.isdir(persist_dir) and os.path.isdir(prefixed_dir):
        persist_dir = prefixed_dir

    vectordb = Chroma(persist_directory=persist_dir, embedding_function=embeddings)
    return vectordb.similarity_search(query, k=limit)
我们使用flask直观一点,调用试试看
@app.route('/search', methods=['GET','POST'])
def search():
    """Handle ``/search``: look up the ``query`` parameter in the Chroma index.

    Reads ``query`` from ``request.values`` (empty string when absent), asks
    ``search_documents_by_chroma`` for the top 3 matches in the module-level
    ``index``, and returns the result of ``documents2dict`` as the response.
    """
    # Take the search text from the incoming request.
    user_query = request.values.get('query', '')
    print(user_query)

    matches = search_documents_by_chroma(index, user_query, 3)
    print("all_len:", len(matches))

    # Convert the matched documents to dictionaries for output.
    return documents2dict(matches)
php @妙手医生