“`
# -*- coding: utf-8 -*-
import os,sys
import re
import jieba,codecs,math
import jieba.posseg as pseg
import string
from zhon.hanzi import punctuation
# Name dictionary: each key is a character's name and each value is the number
# of times that character appears in the whole text.
names = {}

# Relationship dictionary holding the directed edges between characters.
# Each key is the start node of an edge; each value is a dict `edge` whose
# keys are end nodes and whose values are the edge weights, representing how
# closely the two characters are connected.
relationships = {}

# Per-episode character lists, filled by segmenting each paragraph of the
# input: lineNames[i] is a list of the character names that appeared in
# episode i.
lineNames = []

# Load the custom user dictionary for jieba. The file name must be delimited
# by plain ASCII quotes; typographic quotes are a syntax error.
jieba.load_userdict("dict.txt")
with open(“introduction.txt”,”r”) as f:
for line in f.readlines():
line = line.decode(‘GB2312’)
line = line.encode(‘utf-8’)
line = re.sub(ur”[%s]+” % punctuation, “”, line.decode(“utf-8”)) # 去标点
poss = pseg.cut(line)
lineNames.append([])
for w in poss:
if w.flag != “nr” or len(w.word)&l