import sys
sys.path.append('storage/wangyongpeng/FlagEmbedding/')
import re
import uvicorn
from fastapi import Body, FastAPI,BackgroundTasks,Request
from fastapi.middleware.cors import CORSMiddleware
import pydantic
from pydantic import BaseModel
from tqdm import tqdm
from langchain_community.embeddings import HuggingFaceEmbeddings
import torch
from langchain_core.documents import Document
import json
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from fastapi import FastAPI, Query
import time
# Initialise the embedding models (both run on CPU).
# `embedding_function_old` is the encoder handed to FAISS.load_local below;
# `embedding_function` is the encoder used by put_txt_to_faiss() to re-embed.
embedding_function_old = HuggingFaceEmbeddings(model_name='/home/jovyan/storage/shaoyiming/bge-m3/',model_kwargs={'device': "cpu" })
embedding_function = HuggingFaceEmbeddings(model_name='/home/jovyan/storage/wangyongpeng/FlagEmbedding/model_output_V1_release',model_kwargs={'device': "cpu" })
# Initialise the FAISS vector store by loading the existing source index from disk.
# NOTE(review): allow_dangerous_deserialization=True opts in to unpickling the
# saved store -- only point this at index files you trust.
vector1 = FAISS.load_local("/home/jovyan/storage/wangyongpeng/FlagEmbedding/tac_faiss_db_source", embedding_function_old, allow_dangerous_deserialization=True)
def convert_chunks2_documents(milvus_list):
    """Convert chunk dicts into ``Document`` objects.

    Each item's ``"doc"`` value becomes the document's ``page_content``;
    every other key/value pair of the item is carried over as metadata.
    The input dicts are not modified.

    Args:
        milvus_list: Iterable of dicts, each containing a ``"doc"`` key.

    Returns:
        A list with one ``Document`` per input item, in input order.

    Raises:
        KeyError: If an item has no ``"doc"`` key.
    """
    documents = []
    for item_json in milvus_list:
        # Build the metadata from a filtered copy so the caller's dict keeps
        # its "doc" entry and can be reused after this call.
        metadata = {key: value for key, value in item_json.items() if key != "doc"}
        documents.append(Document(page_content=item_json["doc"], metadata=metadata))
    return documents
def get_chunks_from_faiss():
    """Export every chunk held in the loaded FAISS store as a plain dict.

    Walks ``vector1.index_to_docstore_id`` in order and, for each stored
    document, collects its text together with the paragraph-context
    metadata. Stored vectors are not read: callers re-embed the text.

    Returns:
        A list of dicts with the keys ``doc`` (the page content), ``source``
        (last ``/``-separated component of the ``source`` metadata),
        ``paragraph``, ``pre_paragraph`` and ``next_paragraph``. Metadata
        fields that are absent default to ``""``.
    """
    # Reads the docstore's private id -> Document mapping directly.
    doc_dict = vector1.docstore._dict
    data = []
    # NOTE(review): assumes every stored document is exported, including
    # those whose source is empty -- confirm this matches the intent.
    for _id in vector1.index_to_docstore_id.values():
        stored = doc_dict[_id]
        metadata = stored.metadata
        # `or ""` also covers a source that is present but None.
        doc_source = (metadata.get("source", "") or "").split("/")[-1]
        data.append({
            'doc': stored.page_content,
            'source': doc_source,
            'paragraph': metadata.get("paragraph", ""),
            'pre_paragraph': metadata.get("pre_paragraph", ""),
            'next_paragraph': metadata.get("next_paragraph", ""),
        })
    # NOTE(review): assumes the count is reported once, after the loop.
    print(len(data))
    return data
def put_txt_to_faiss():
    """Re-embed the chunks of the source store and save them as a new index.

    Chunks come from ``get_chunks_from_faiss``, are wrapped by
    ``convert_chunks2_documents``, embedded with ``embedding_function`` and
    written to the ``tac_faiss_db_target`` directory.
    """
    # Fetch the chunks first and wrap them as documents.
    docs = convert_chunks2_documents(get_chunks_from_faiss())
    print(len(docs))
    target_dir = "/home/jovyan/storage/wangyongpeng/FlagEmbedding/tac_faiss_db_target"
    # Build a fresh store from the documents, then persist it to disk.
    FAISS.from_documents(docs, embedding_function).save_local(target_dir)
# Script entry point: rebuild the target FAISS index when run directly.
if __name__ == "__main__":
    put_txt_to_faiss()
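# NOTE(review): stray non-code residue of unknown origin; commented out so the
# module parses. Remove once confirmed to be unneeded.
# 04-01
# 4750
# 03-08
# 1407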