"search_analyzer": "ik_smart"
},
"answer": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart"
}
}
}
}
其中 dims 为向量的长度,需要与嵌入模型输出向量的维度保持一致。
查看创建的索引:
GET http://127.0.0.1:9200/medical_index
数据存入 ElasticSearch
安装 Elasticsearch 的 Python 客户端依赖库:
pip install elasticsearch -i https://pypi.tuna.tsinghua.edu.cn/simple
from elasticsearch import Elasticsearch
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd
def embeddings_doc(doc, tokenizer, model, max_length=300):
    """Embed a piece of text as the last-layer [CLS] vector of ``model``.

    Args:
        doc: The text to embed.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``);
            it is asked to pad/truncate to ``max_length`` and to return
            PyTorch tensors.
        model: Callable invoked as ``model(input_ids, attention_mask=...)``
            whose result exposes ``last_hidden_state`` (e.g. a ``BertModel``).
        max_length: Sequence length every input is padded or truncated to.

    Returns:
        The vector at sequence position 0 of the first (only) batch item of
        ``last_hidden_state``, i.e. ``last_hidden_state[0, 0, :]``.
    """
    encoded_dict = tokenizer.encode_plus(
        doc,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']

    # Forward pass; gradients are not needed for inference.
    with torch.no_grad():
        outputs = model(input_id, attention_mask=attention_mask)

    # Use the last layer's [CLS] vector (sequence position 0) as the
    # representation of the whole text.
    last_hidden_state = outputs.last_hidden_state
    cls_embeddings = last_hidden_state[:, 0, :]
    return cls_embeddings[0]
def add_doc(index_name, id, embedding_ask, ask, answer, es):
    """Create one question/answer document, with its question vector, in ES.

    Args:
        index_name: Name of the target index.
        id: Document id passed to ``es.create``.
        embedding_ask: Embedding of the question; must expose ``tolist()``
            (e.g. a tensor) so it can be serialised as a JSON array.
        ask: Question text.
        answer: Answer text.
        es: Elasticsearch client exposing ``create``.

    Returns:
        Whatever ``es.create`` returns.
    """
    body = {
        "ask_vector": embedding_ask.tolist(),
        "ask": ask,
        "answer": answer
    }
    # ``doc_type`` is intentionally not passed: it is deprecated in the 7.x
    # client (where the type already defaults to ``_doc``) and is rejected as
    # an unknown keyword by the 8.x client.
    result = es.create(index=index_name, id=id, body=body)
    return result
def main():
模型下载的地址
model_name = ‘D:\AIGC\model\chinese-roberta-wwm-ext-large’
ES 信息
es_host = “http://127.0.0.1”
es_port = 9200
es_user = “elastic”
es_password = “elastic”
index_name = “medical_index”
数据地址
path = “D:\AIGC\dataset\Chinese-medical-dialogue-data\Chinese-medical-dialogue-data\Data_数据\IM_内科\内科5000-33000.csv”
分词器和模型
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
ES 连接
es = Elasticsearch(
[es_host],
port=es_port,
http_auth=(es_user, es_password)
)
读取数据写入ES
data = pd.read_csv(path, encoding=‘ANSI’)
f