结巴(jieba)是一个开源的中文分词库,使用简单,并且支持添加用户自定义词典。
https://github.com/fxsjy/jieba
使用示例
import re
import sys
from xml.etree import ElementTree

import jieba
import jieba.posseg as pseg
# Usage example: print the person names that jieba finds in the <dis> text of
# every <person> element of the crawled XML file.

# Load the user dictionary before segmenting so its entries are available to
# the tokenizer.
jieba.load_userdict("dict.txt")

# Input tree: parse the source XML and collect its <person> elements.
root = ElementTree.parse(r"爬虫.xml")
persons = root.findall("person")

# Output tree: ``root`` is rebound to a fresh <documents> element. It is not
# populated in this section.
root = ElementTree.Element("documents")

# Matches one 《...》 work title at a time. The non-greedy ``.*?`` matters: a
# greedy ``.*`` would delete everything between the first 《 and the last 》 on
# a line, including any person names mentioned between two titles.
_title_re = re.compile(r"《.*?》")

for person in persons:
    name = person.find("name")  # looked up but not used in this section
    dis = person.find("dis")

    # Nothing to segment when <dis> is missing or has no text. Calling
    # str() on a missing text would otherwise segment the literal "None".
    if dis is None or not dis.text:
        continue

    # Drop work titles so that names inside them are not reported.
    text = _title_re.sub("", dis.text)

    try:
        for w in pseg.cut(text):
            # Skip single-character tokens.
            if len(w.word) == 1:
                continue
            # "nr" is the part-of-speech flag for person names.
            if str(w.flag) == "nr":
                print(w.word)
    except Exception as exc:
        # Stay best-effort per person, but report the failure instead of
        # hiding it. KeyboardInterrupt and SystemExit are no longer swallowed.
        print("skipping person: %s" % exc, file=sys.stderr)