Read all documents from Faiss and re-encode them into a new Faiss store

The script below loads an existing FAISS index built with the old bge-m3 embedding model, pulls out every chunk's text and metadata, re-embeds the chunks with the new model, and saves the result as a fresh index.



import sys
sys.path.append('storage/wangyongpeng/FlagEmbedding/')

import time
import uuid

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Initialize the embedding models: the old one that built the source store
# and the new one used for re-encoding
embedding_function_old = HuggingFaceEmbeddings(
    model_name='/home/jovyan/storage/shaoyiming/bge-m3/',
    model_kwargs={'device': 'cpu'},
)

embedding_function = HuggingFaceEmbeddings(
    model_name='/home/jovyan/storage/wangyongpeng/FlagEmbedding/model_output_V1_release',
    model_kwargs={'device': 'cpu'},
)


# Load the source Faiss vector store that was built with the old embedding model
vector1 = FAISS.load_local(
    "/home/jovyan/storage/wangyongpeng/FlagEmbedding/tac_faiss_db_source",
    embedding_function_old,
    allow_dangerous_deserialization=True,
)
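
# Optional sanity check (not part of the original script): the underlying faiss
# index exposes the number of stored vectors and their dimensionality.
print(f"Source store: {vector1.index.ntotal} vectors, dim={vector1.index.d}")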

def convert_chunks2_documents(milvus_list):
    """Convert chunk dicts into LangChain Documents: 'doc' becomes page_content, the rest becomes metadata."""
    documents = []
    for item_json in milvus_list:
        doc = item_json.pop("doc")
        document = Document(page_content=doc, metadata=item_json)
        documents.append(document)
    return documents


def get_chunks_from_faiss():
    """Read every chunk (text + metadata) stored in the source Faiss index."""
    doc_dict = vector1.docstore._dict

    data = []
    for _index, _id in vector1.index_to_docstore_id.items():
        # The old embedding could be recovered with vector1.index.reconstruct(_index),
        # but it is not needed here because every chunk is re-embedded with the new model.
        doc = doc_dict[_id]
        doc_source = doc.metadata.get("source", "").split("/")[-1]
        page_content = doc.page_content
        page_index = doc.metadata.get("page_index", "")
        ref = doc.metadata.get("ref", "")
        index = doc.metadata.get("index", "")
        next_paragraph = doc.metadata.get("next_paragraph", "")
        pre_paragraph = doc.metadata.get("pre_paragraph", "")
        paragraph = doc.metadata.get("paragraph", "")
        qaId = doc.metadata.get("qaId", "")
        show_flag = doc.metadata.get("show_flag", True)
        timestamp = str(time.time())
        documentId = str(uuid.uuid3(uuid.NAMESPACE_DNS, doc_source + timestamp))
        row = {
            # Extra fields that can be carried over if needed:
            # 'documentId': documentId,
            # 'store_name': 'jiang0506',
            # 'type': 'bge',
            # 'page': page_index,
            # 'index': index,
            # 'ref': ref,
            # 'show_flag': show_flag,
            # 'qaId': qaId,
            'doc': page_content,
            'source': doc_source,
            'paragraph': paragraph,
            'pre_paragraph': pre_paragraph,
            'next_paragraph': next_paragraph,
        }
        data.append(row)

    print(len(data))
    return data
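
# Each row returned by get_chunks_from_faiss() looks roughly like this
# (illustrative placeholder values, not real data):
# {
#     'doc': 'chunk text ...',
#     'source': 'some_document.pdf',
#     'paragraph': '...',
#     'pre_paragraph': '...',
#     'next_paragraph': '...',
# }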




    
def put_txt_to_faiss():
    # First fetch the chunks from the source store and convert them to Documents
    raw_list = get_chunks_from_faiss()
    documents = convert_chunks2_documents(raw_list)
    print(len(documents))
    db_filepath = "/home/jovyan/storage/wangyongpeng/FlagEmbedding/tac_faiss_db_target"
    # Build a new index by re-embedding the documents with the new model, then save it
    vector_store = FAISS.from_documents(documents, embedding_function)
    vector_store.save_local(db_filepath)


if __name__ == "__main__":
    put_txt_to_faiss()
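
A minimal sketch of how the migrated store could be verified, assuming the script above has already written tac_faiss_db_target; the query string is only a placeholder and not part of the original post:

# verify_reencode.py -- illustrative check of the re-encoded store
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

new_embedder = HuggingFaceEmbeddings(
    model_name='/home/jovyan/storage/wangyongpeng/FlagEmbedding/model_output_V1_release',
    model_kwargs={'device': 'cpu'},
)
store = FAISS.load_local(
    "/home/jovyan/storage/wangyongpeng/FlagEmbedding/tac_faiss_db_target",
    new_embedder,
    allow_dangerous_deserialization=True,
)
print("vectors in target store:", store.index.ntotal)

# Run a quick similarity search; replace the query with text from your own corpus.
hits = store.similarity_search("example query", k=3)
for doc in hits:
    print(doc.metadata.get("source"), doc.page_content[:80])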
