sklearn's built-in CountVectorizer cannot tokenize on a user-defined separator; by default it splits on whitespace and punctuation, yet Chinese text inevitably contains English names, entities and the like that are themselves space-separated.
Below is a custom CountVectorizer that supports a user-defined separator and can quickly count how many times each token occurs in each document.
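To illustrate the problem, here is a minimal comparison sketch (it assumes scikit-learn is installed and reuses the sample strings from the example at the end of this post): sklearn's default analyzer lowercases the text and tokenizes on word boundaries, so multi-word and hyphenated entities fall apart, while splitting on an explicit separator keeps them whole.

from sklearn.feature_extraction.text import CountVectorizer as SkCountVectorizer

docs = ['纽约市/Surge-Energy', '初步/白糖交易商Czarnikow/IDDCTM系/Mac 口红']

# sklearn's default analyzer (token_pattern r"(?u)\b\w\w+\b", lowercase=True)
sk = SkCountVectorizer()
sk.fit(docs)
# get_feature_names_out() on sklearn >= 1.0, get_feature_names() on older versions
print(sk.get_feature_names_out())
# entities such as 'Surge-Energy' and 'Mac 口红' end up as 'surge', 'energy', 'mac', '口红'

# splitting on the separator emitted by the upstream segmenter keeps entities intact
print([doc.split('/') for doc in docs])
# -> [['纽约市', 'Surge-Energy'], ['初步', '白糖交易商Czarnikow', 'IDDCTM系', 'Mac 口红']]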
My requirement is to use paddle segmentation to split entities out accurately, then use CountVectorizer to compute the IDF values and persist them. When the entity-recognition service starts up and I have extracted the entities in a document, I compute their TF and multiply it by the cached IDF to quickly obtain each entity's weight.
If you know a better way to obtain entity weights, please leave a comment; this clumsy approach is the only one I could think of.
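The serving-time step described above is essentially a dictionary lookup. A minimal sketch of that step (the function name entity_weights, the normalized TF, and the default IDF for entities unseen in the corpus are my own assumptions, not part of the service code):

from collections import Counter

def entity_weights(entities, idf_dic, default_idf=0.5):
    # entities: entity strings recognized in one document
    # idf_dic: the cached {entity: idf} dict produced by calc_idf below
    tf = Counter(entities)
    total = sum(tf.values()) or 1
    # weight = tf * idf; fall back to a default idf for entities the corpus never saw
    return {e: (c / total) * idf_dic.get(e, default_idf) for e, c in tf.items()}

# usage sketch: weights = entity_weights(['曼哈顿', '爆炸事件', '曼哈顿'], cached_idf)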
import numpy as np
from scipy import sparse


class CountVectorizer():
    def __init__(self, pass_stop=True):
        self.pass_stop = pass_stop  # drop single-character tokens as a simple form of stop-word removal; set False to disable
        self.cut_data = []
        self.dict_ = {}

    def fit(self, cut_data):
        try:
            self.elements_ = set()
            for line in cut_data:
                for x in line:
                    if self.pass_stop:
                        if len(x) == 1:
                            continue
                    self.elements_.add(x)
            # vocabulary: sorted unique tokens
            self.elements_ = np.sort(list(self.elements_))
            # integer codes for the tokens
            self.labels_ = np.arange(len(self.elements_)).astype(int)
            # build the token -> column-index dictionary
            for i in range(len(self.elements_)):
                self.dict_[str(self.elements_[i])] = self.labels_[i]
        except Exception as e:
            return []

    def fit_transform(self, data, sep=' '):
        try:
            rows = []
            cols = []
            cut_data = [doc.split(sep) for doc in data]
            self.fit(cut_data)
            # print("cut_data: ", cut_data)
            for i in range(len(cut_data)):
                for x in cut_data[i]:
                    if self.pass_stop:
                        if len(x) == 1:
                            continue
                    rows.append(i)
                    cols.append(self.dict_[x])
            # duplicate (row, col) pairs are summed by csr_matrix, giving per-document counts
            vals = np.ones((len(rows),)).astype(int)
            return sparse.csr_matrix((vals, (rows, cols)), shape=(len(data), len(self.labels_))).toarray()
        except Exception as e:
            return None

    def get_feature_names(self):
        try:
            return self.elements_
        except Exception as e:
            return []
Code for computing the IDF
# inside the entity-weight service class (pd, math, traceback and logger are assumed to be imported / configured elsewhere)
self.vectorizer = CountVectorizer()

def calc_idf(self, corpus):
    idf_dic = {}
    try:
        # count how many times each token appears in each row (document)
        counts = self.vectorizer.fit_transform(corpus, self.sep)
        # all tokens in the bag-of-words vocabulary
        word_list = self.vectorizer.get_feature_names()
        # array -> DataFrame
        df = pd.DataFrame(data=counts, columns=word_list)
        all_doc = len(counts)
        print("total number of documents: ", all_doc)
        # clip counts greater than zero to 1; summing the 1s gives the number of documents containing the token
        # IDF = log[total documents / (documents containing the token + 1)]
        for w in word_list:
            df[w] = df[w].apply(lambda x: 1 if x > 0 else 0)
            has_token_count = df[w].sum()
            idf = max(round(math.log(all_doc / (has_token_count + 1)), 4), 0)
            idf_dic[w] = idf
    except Exception as e:
        logger.error(f"failed to compute idf {traceback.format_exc()}")
    return idf_dic
Example code
import pandas as pd
import math

vectorizer = CountVectorizer()
data = ['纽约市/Surge-Energy',
        '初步/白糖交易商Czarnikow/IDDCTM系/Mac 口红',
        '明确/证据/曼哈顿/爆炸事件']
# count how many times each token appears in each row (document)
counts = vectorizer.fit_transform(data, sep='/')
print(counts)
# all tokens in the bag-of-words vocabulary
word_list = vectorizer.get_feature_names()
print(word_list)
# array -> DataFrame
df = pd.DataFrame(data=counts, columns=word_list)
all_doc = len(counts)
print("total number of documents: ", all_doc)
idf_dic = {}
# clip counts greater than zero to 1; summing the 1s gives the number of documents containing the token
# IDF = log[total documents / (documents containing the token + 1)]
for w in word_list:
    df[w] = df[w].apply(lambda x: 1 if x > 0 else 0)
    has_token_count = df[w].sum()
    idf = max(round(math.log(all_doc / (has_token_count + 1)), 4), 0)
    idf_dic[w] = idf
print(idf_dic)
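Since every token in this toy corpus appears in exactly one of the three documents, each IDF should come out as max(round(log(3/2), 4), 0) = 0.4055. Note that the +1 smoothing in the denominator means a token present in all documents would give log(3/4) < 0, which the max(..., 0) clamps to zero.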