FoolNLTK是一个使用双向 LSTM 构建的中文处理工具包,该工具不仅可以实现分词、词性标注和命名实体识别,同时还能使用用户自定义字典加强分词的效果。
中文处理工具包 GitHub 地址:https://github.com/rockyzhengwu/FoolNLTK
本文应用 FoolNLTK 对《三体》第一部中的命名实体进行了分析,统计了相关人物、相关企业和相关组织在文中出现的次数占比(此处的占比指某个实体的出现次数除以该类别中不同实体的个数,因此各项之和不等于 1)。
贴上代码:
import fool
# Per-category tallies: entity name -> number of times it was recognised.
# Three separate dict objects are created so the categories never alias.
person_dict, company_dict, org_dict = {}, {}, {}
# Per-category counters; the analysis loop bumps one each time a new
# (previously unseen) entity name is added to the matching dictionary.
person_count = company_count = org_count = 0
def dict2list(dic: dict) -> list:
    """Convert a dictionary into a list of ``(key, value)`` tuples.

    Args:
        dic: The dictionary to convert.

    Returns:
        A new list with one ``(key, value)`` tuple per entry, in the
        dictionary's iteration order. An empty dictionary yields ``[]``.
    """
    # dict.items() already pairs each key with its value, so there is no
    # need to zip keys() and values() by hand.
    return list(dic.items())
# Route each entity type handled by this script to the dictionary that
# tallies it; entity types that are not listed here are ignored.
tallies = {
    'person': person_dict,
    'company': company_dict,
    'org': org_dict,
}

# Characters removed from every line before analysis: tab, newline,
# carriage return, ASCII space, '=' and the ideographic space U+3000.
noise = str.maketrans('', '', '\t\n\r =\u3000')

# NOTE(review): no encoding is passed to open(), so the platform default
# is used, exactly as before -- confirm it matches the text file.
# The `with` block guarantees the file handle is closed, and iterating the
# handle streams the novel line by line instead of loading it whole.
with open('E:\\santi\\santi1.txt') as f:
    for line in f:
        line = line.translate(noise)
        if not line:
            # Nothing is left after cleaning, so there is nothing to analyse.
            continue
        _, ners = fool.analysis(line)
        for ner in ners:
            # Index 2 is read as the entity type and index 3 as its text.
            ner_type = ner[2]
            ner_name = ner[3]
            # Drop named entities that are only one character long.
            if len(ner_name) == 1:
                continue
            tally = tallies.get(ner_type)
            if tally is None:
                continue
            tally[ner_name] = tally.get(ner_name, 0) + 1

# Each counter is the number of distinct entity names in its category
# (it used to be incremented once per newly seen name).
person_count = len(person_dict)
company_count = len(company_dict)
org_count = len(org_dict)

# NOTE(review): every tally is divided by the number of *distinct* names in
# its category, not by the total number of mentions, so the resulting
# values do not sum to 1. Kept as-is so the printed results are unchanged.
for tally, distinct in (
    (person_dict, person_count),
    (company_dict, company_count),
    (org_dict, org_count),
):
    for name in tally:
        tally[name] /= distinct

# Print each category as (name, value) pairs, largest value first.
print(
    "相关人物:", sorted(dict2list(person_dict), key=lambda x: x[1], reverse=True),
    "相关企业:", sorted(dict2list(company_dict), key=lambda x: x[1], reverse=True),
    "相关组织:", sorted(dict2list(org_dict), key=lambda x: x[1], reverse=True),
)
结果如下:
person =[('汪淼', 0.7758007117437722), ('叶文洁', 0.48398576512455516), ('杨卫宁', 0.18505338078291814), ('周文王', 0.1601423487544484), ('审问者', 0.15658362989323843), ('大史', 0.1494661921708185), ('秦始皇', 0.12455516014234876), ('丁仪', 0.099644128113879), ('冯·诺伊曼', 0.07473309608540925), ('史强', 0.05693950177935943), ('杨母', 0.05693950177935943), ('伊文斯', 0.05693950177935943), ('常伟思', 0.0498220640569395), ('魏成', 0.0498220640569395), ('沙瑞山', 0.046263345195729534), ('墨子', 0.046263345195729534), ('爱因斯坦', 0.042704626334519574), ('白沐霖', 0.042704626334519574), ('雷志成', 0.042704626334519574), ('雷政委', 0.042704626334519574), ('潘寒', 0.042704626334519574), ('申玉菲', 0.03558718861209965), ('绍琳', 0.03202846975088968), ('杨总', 0.03202846975088968), ('叶老师', 0.03202846975088968), ('牛顿', 0.03202846975088968), ('斯坦顿', 0.03202846975088968), ('叶哲泰', 0.028469750889679714), ('红岸', 0.02491103202846975), ('纣王', 0.02491103202846975), ('小汪', 0.02491103202846975), ('监听员', 0.02491103202846975), ('冬冬', 0.021352313167259787), ('红卫兵', 0.017793594306049824), ('张主任', 0.017793594306049824), ('指指', 0.017793594306049824), ('杨冬', 0.017793594306049824), ('姬昌', 0.017793594306049824), ('麦克', 0.017793594306049824), ('叶文治', 0.017793594306049824), ('执政官', 0.017793594306049824), ('斯坦', 0.014234875444839857), ('程丽华', 0.014234875444839857), ('汪教授', 0.014234875444839857), ('伏羲', 0.014234875444839857), ('口口口', 0.014234875444839857), ('教皇', 0.014234875444839857), ('伽利略', 0.014234875444839857), ('上校', 0.014234875444839857), ('智子', 0.014234875444839857), ('阮老师', 0.010676156583629894), ('程代表', 0.010676156583629894), ('这人', 0.010676156583629894), ('常将军', 0.010676156583629894), ('丁博士', 0.010676156583629894), ('汪森', 0.010676156583629894), ('孔子', 0.010676156583629894), ('徐冰冰', 0.010676156583629894), (