import pandas as pd
import numpy as np
# Load the dataset and initialise it
data = pd.read_csv('ner_dataset.csv', encoding='latin1')
# Forward-fill missing cells from the row above. DataFrame.ffill() is the
# supported spelling; fillna(method='ffill') is deprecated in recent pandas.
data = data.ffill()
# Bare expression: shows the last 10 rows when run as a notebook cell.
data.tail(10)
# Output
# 2 Predefined data structures
# Predefine data structures: the vocabulary of distinct words
words = list(set(data['Word'].values))  # vocabulary (order of a set is arbitrary)
print(words[:50])
n_words = len(words)  # total number of distinct words
# Bare expression: shows the count when run as a notebook cell.
n_words
# Output
# 3 Majority-voting model code
from sklearn.base import BaseEstimator, TransformerMixin
class MajorityVotingTagger(BaseEstimator, TransformerMixin):
    """Baseline tagger that gives each word its most frequent training tag.

    ``fit`` counts how often every tag was seen for every word and stores the
    winning tag per word in ``self.mjvote``. ``predict`` looks each word up in
    that table and falls back to ``'O'`` for words that were never seen.
    """

    def fit(self, X, y):
        """Learn the majority tag of every word.

        Args:
            X: iterable of words.
            y: iterable of tags, aligned element-wise with ``X``.

        Returns:
            This estimator, so calls can be chained.
        """
        # word -> {tag: count}, e.g. Indian: {B_gpe: 4, B_geo: 1, ...}
        word2cnt = {}
        for x, t in zip(X, y):
            tag_counts = word2cnt.setdefault(x, {})
            tag_counts[t] = tag_counts.get(t, 0) + 1

        # Keep only the most frequent tag per word. On a tie, max() returns
        # the tag that was encountered first for that word.
        self.mjvote = {
            word: max(counts, key=counts.get)
            for word, counts in word2cnt.items()
        }
        return self

    def predict(self, X, y=None):
        """Return the memorised tag of each word in ``X``.

        Words that were not seen during ``fit`` are tagged ``'O'``.
        ``y`` is accepted but not used.
        """
        return [self.mjvote.get(x, 'O') for x in X]
# 4 Data preprocessing
# Flatten the 'Word' and 'Tag' columns into plain, aligned Python lists.
words = data['Word'].values.tolist()
tags = data['Tag'].values.tolist()
# Preview the first 10 words and their tags, one list per line.
print(words[:10], tags[:10], sep='\n')
# Output
# 5 Model prediction
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
# Cross-validation: collect predictions for every token using 5 folds
pred = cross_val_predict(estimator=MajorityVotingTagger(), X=words, y=tags, cv=5)
# Compute and print the validation report
report = classification_report(y_pred=pred, y_true=tags)
print(report)