本文主要想做的事情,就是尽可能地理解文本匹配,内容如下:
1. 首先进行数据预处理:去除特殊符号(可以下载中文特殊符号表 stop_words.txt),处理后得到 mat.xlsx。
import pandas as pd
# Load the material (product) table. mat.xlsx is not published with this
# article; comparable data can be scraped from Taobao or JD.
data = pd.read_excel('mat.xlsx')

# Collect the distinct product titles.
dd = set()
dd.update(data['product_title'].tolist())
2. BM25 算法模型结构。这里需要注意参数的设置与调整:我这边针对短文本场景调试过一版,还可以继续调整。
import math
from six import iteritems
from six.moves import xrange
# BM25 parameters.
# According to the accompanying notes, these values were hand-tuned for a
# short-text scenario and can be adjusted further.
# NOTE(review): the code that consumes these constants is not visible in this
# part of the file; the names suggest the usual BM25 k1 / b constants and an
# epsilon used for IDF handling -- confirm against the scoring code.
PARAM_K1 = 10
PARAM_B = 0.9
EPSILON = 0.25
class BM25(object):
def __init__(self, corpus):
    """Record corpus statistics and build the index via ``initialize()``.

    Args:
        corpus: A sized collection of documents. ``len()`` is taken of the
            corpus and of every document; ``initialize()`` walks each
            document word by word, so documents are presumably token
            lists -- confirm against the caller.

    Raises:
        ZeroDivisionError: If ``corpus`` is empty, because the average
            document length is divided by the number of documents.
    """
    self.corpus_size = len(corpus)

    # Average document length: summed per-document lengths (as floats)
    # divided by the document count.
    total_length = sum(float(len(document)) for document in corpus)
    self.avgdl = total_length / self.corpus_size

    self.corpus = corpus

    # Containers start empty and are populated by initialize():
    #   f   -- one {word: count} dict per document, in corpus order
    #   df  -- looked up by word in initialize()
    #   idf -- NOTE(review): presumably per-word IDF weights; where it is
    #          filled is not visible from here
    self.f = []
    self.df = {}
    self.idf = {}

    self.initialize()
def initialize(self):
for document in self.corpus:
frequencies = {}
for word in document:
if word not in frequencies:
frequencies[word] = 0
frequencies[word] += 1
self.f.append(frequencies)
for word, freq in iteritems(frequencies):
if word not in self.df: