一、bertService安装(tensorflow安装--对应版本)
安装方法可以自行搜索,网上的方案很多,但坑也不少;下面的安装步骤供大家参考:
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple tensorflow==1.14.0
#解决安装包问题
pip install -i https://pypi.douban.com/simple/ " pytest-cov>=2.0 "
pip install -i https://pypi.douban.com/simple/ " pytest-filter-subpackage>=0.1 "
#解决pip问题
python -m pip install --upgrade setuptools
python -m pip install --upgrade pip
#解决spyder问题
pip install -i https://pypi.douban.com/simple/ " pyqt5<5.13"
#解决protobuf版本太高问题
pip install protobuf==3.19.0
二、bertService启动
# -*- coding: utf-8 -*-
from bert_serving.server import BertServer
from bert_serving.server.helper import get_args_parser
def main():
    """Start a bert-as-service server for the fine-tuned model.

    Parses a fixed argument list with bert-serving's own argument parser,
    builds a ``BertServer`` from the result and starts it.
    """
    args = get_args_parser().parse_args([
        # Directory that holds the model; change it to your local model path.
        '-model_dir', '/Users/alibaba/Downloads/fintune_model',
        # NOTE(review): 86500/86501 are above 65535, the largest valid TCP
        # port number -- confirm these values. The client script in this file
        # connects to the same pair, so change both sides together.
        '-port', '86500',
        '-port_out', '86501',
        '-max_seq_len', '512',
        # Full option name (previously the abbreviation '-num_work'), matching
        # the command-line example that uses -num_worker.
        '-num_worker', '2',
        '-cpu',
    ])
    server = BertServer(args)
    server.start()


if __name__ == "__main__":
    main()
注意事项:-model_dir 需要填写你本地模型所在的目录;服务启动成功后,终端会显示如下信息
也可以通过命令行脚本启动,参数配置参考上面的 Python 代码:
bert-serving-start -model_dir /Users/alibaba/Downloads/fintune_model -num_worker=2
三、bertService二次排序
# 导入bert客户端
from bert_serving.client import BertClient
import numpy as np
import pandas as pd
class SimilarModel:
    """Sentence-similarity helper backed by a bert-as-service client."""

    def __init__(self, ip='localhost', port=86500, port_out=86501,
                 timeout=1000000):
        """Connect to a running bert-as-service server.

        :param ip: host of the BERT server; keep the default for a local
            server, or pass the server's address when it runs elsewhere.
        :param port: port the client pushes requests to.
        :param port_out: port the client receives results from.
        :param timeout: receive timeout handed to ``BertClient``.
        """
        # NOTE(review): the default ports 86500/86501 are above 65535, the
        # largest valid TCP port number -- confirm them against the server.
        self.bert_client = BertClient(ip=ip, port=port, port_out=port_out,
                                      show_server_config=True, timeout=timeout)

    def close_bert(self):
        """Close the underlying BERT client connection."""
        self.bert_client.close()

    def get_sentence_vec(self, sentence):
        """Return the BERT vector of a single sentence.

        :param sentence: the sentence text to encode.
        :return: the first (and only) row of the client's ``encode`` result.
        """
        return self.bert_client.encode([sentence])[0]

    def cos_similar(self, sen_a_vec, sen_b_vec):
        """Compute the cosine similarity of two sentence vectors.

        :param sen_a_vec: vector of the first sentence (any array-like).
        :param sen_b_vec: vector of the second sentence, same length.
        :return: cosine similarity as a Python float; ``0.0`` when either
            vector has zero norm, instead of dividing by zero.
        :raises ValueError: if the two vectors differ in length.
        """
        # Plain 1-D arrays instead of np.mat, which was removed in NumPy 2.0.
        vec_a = np.asarray(sen_a_vec, dtype=float).ravel()
        vec_b = np.asarray(sen_b_vec, dtype=float).ravel()
        denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
        if denom == 0:
            return 0.0
        return float(np.dot(vec_a, vec_b) / denom)
if __name__=='__main__':
    # Re-rank the first-stage recall candidates against the first query:
    # every candidate title gets a BERT cosine similarity next to its BM25
    # score, then the top rows are written to a CSV file.
    with open("/Users/alibaba/Desktop/query.txt", 'r', encoding="utf-8") as query_file:
        query = query_file.readlines()
    with open("/Users/alibaba/Desktop/recall.txt", 'r', encoding="utf-8") as recall_file:
        # Skip blank lines: they have no title/score fields to split out.
        condinates = [line for line in recall_file if line.strip()]

    # Drop the trailing newline so it does not end up in the "query" column.
    sentence_a = query[0].rstrip("\n")

    bert_result = []
    bm25_score = []
    fir_recall_title_list = []
    query_list = []

    similar_model = SimilarModel()
    try:
        # The query is the same for every candidate, so encode it only once
        # instead of calling the BERT service again on each iteration.
        sentence_a_vec = similar_model.get_sentence_vec(sentence_a)
        for i, sentence_b in enumerate(condinates, start=1):
            print("计算第 " + str(i) + " 个文本相似度")
            # Each recall line is space-separated: field 1 is the recalled
            # title and field 2 is its BM25 score.
            fields = sentence_b.split(" ")
            fir_recall_title = fields[1]
            query_list.append(sentence_a)
            fir_recall_title_list.append(fir_recall_title)
            bm25_score.append(fields[2].replace("\n", ""))
            fir_recall_title_vec = similar_model.get_sentence_vec(fir_recall_title)
            cos_sim = similar_model.cos_similar(sentence_a_vec, fir_recall_title_vec)
            bert_result.append(round(cos_sim, 3))
    finally:
        # Always release the client connection, even if encoding fails.
        similar_model.close_bert()

    second_sort_res = pd.DataFrame(
        {
            "query": query_list,
            "reacll": fir_recall_title_list,
            "bm25_score": bm25_score,
            "bert_result": bert_result,
        }
    )
    second_sort_res['bm25_score'] = second_sort_res['bm25_score'].astype(float)
    second_sort_res['bert_result'] = second_sort_res['bert_result'].astype(float)

    # Queries mentioning '停车' are ranked by BERT similarity, all others by
    # BM25 score. Testing the query string directly also works when there
    # are no candidate rows.
    sort_column = "bert_result" if '停车' in sentence_a else "bm25_score"
    second_sort_res = second_sort_res.sort_values(by=sort_column, ascending=False)
    second_sort_res = second_sort_res.drop_duplicates(inplace=False)

    top_rows = second_sort_res.iloc[0:50, :]
    top_rows.to_csv("/Users/alibaba/Desktop/second_sort_resut.csv", index=False)
    # Report the number of rows actually written (at most 50 after
    # de-duplication) rather than the size of the raw candidate list.
    print("二次排序完成, 召回排名前 " + str(len(top_rows)) + " 工单")
四、粗排参考博文
Lucene bm25 结合 jieba中文分词搜索_深挖技术点滴-CSDN博客_jieba lucene
五、思考(bert是否适合做相似度计算)
- BERT 模型不太适合直接做搜索召回:它的微调(fine-tune)目标是基于类别的,即让相似样本的距离最小、不相似样本的距离最大;用这类方法微调之后,各候选结果的相似度得分差距不明显,难以区分
- 网上有资料介绍,可以用 BERT 各层的 pooling 输出作为句向量来计算相似度,bert-as-service 采用的就是这种方式
- 可以考虑使用双塔模型做搜索召回