词语消歧工作即根据一句话或一段话中的上下文,对某些存在多个意思的词语,自动地选择出适合当前语境的词义。例如:打人、打车、打基础,均属于"打"字的不同意义。
基础阶段采用的思路是:爬虫 → 去停用词 → 分词 → 建立字典 → 抓取文本中关键词 → 参照字典对比 → 返回标签。以下代码可以实现对"打"字的区分。分词工具用的是结巴(jieba);字典仅采用了结巴分词自带的字典,尚未完全建立。
import xlrd
from xlutils.copy import copy
import jieba
import sys
reload(sys)
sys.setdefaultencoding('utf8')
jieba.load_userdict("user.dict")
def read_excel(row, col, name, root="D:\\pycharm\\wordcut\\"):
    """Read one column of the first sheet of an .xls workbook.

    Args:
        row: first 0-based row to read; earlier rows (e.g. a header) are skipped.
        col: 0-based column index to read.
        name: workbook file name, joined onto *root*.
        root: directory containing the workbook.  Parameterized from the
            original hard-coded constant; the default keeps old behavior.

    Returns:
        List of cell values from sheet 0, rows [row, nrows).
    """
    path = root + name  # renamed from ``dir``, which shadowed the builtin
    print(path)
    sheet = xlrd.open_workbook(path).sheet_by_index(0)
    # ``ncols`` was computed but never used in the original; dropped.
    return [sheet.cell(r, col).value for r in range(row, sheet.nrows)]
def savedata(list, name, root="D:\\pycharm\\wordcut\\"):
    """Write each string in *list* into column 0 of an existing .xls file.

    The workbook ``root + name`` must already exist: it is opened with xlrd,
    copied with xlutils (xlrd workbooks are read-only), and saved in place.

    Args:
        list: sequence of utf-8 byte strings, one per row.  (The parameter
            name shadows the builtin; kept unchanged for interface
            compatibility with existing callers.)
        name: workbook file name.
        root: directory containing the workbook.  Parameterized from the
            original hard-coded constant; the default keeps old behavior.
    """
    path = root + name
    print(path)
    wb = copy(xlrd.open_workbook(path))
    ws = wb.get_sheet(0)
    for i, text in enumerate(list):  # enumerate replaces range(len(...))
        # .decode('utf-8') assumes Python 2 byte strings (see module setup).
        ws.write(i, 0, text.decode('utf-8'))
    wb.save(path)
    print(u'数据保存成功')
def splitSentence(list1, outputFile):
    """Segment each sentence with jieba and write one line per sentence.

    Each output line contains the sentence's tokens joined by single
    spaces, utf-8 encoded.

    Args:
        list1: sequence of utf-8 byte strings (Python 2).
        outputFile: path of the text file to (over)write.
    """
    # ``with`` guarantees the file is closed even if jieba raises.
    with open(outputFile, 'w') as fout:
        for sentence in list1:
            tokens = jieba.cut(sentence.decode('utf-8'))
            # join() replaces the original quadratic ``+=`` accumulation
            # followed by strip(); the result is identical.
            fout.write(' '.join(tokens).encode('utf-8') + '\n')
def findchar(word,inputFile, outputFile1,outputFile2):
    """Scan a space-segmented text file for *word* and route hits to two files.

    For each input line containing *word*:
      - if the hit is surrounded by spaces (i.e. it was segmented as a
        standalone token), a fixed window of characters around it goes to
        *outputFile2*;
      - otherwise the whole enclosing token (previous space to next space)
        plus the 1-based line number goes to *outputFile1*.

    Args:
        word: utf-8 byte string to search for.
        inputFile: segmented text (tokens separated by spaces), utf-8.
        outputFile1 / outputFile2: destination paths, overwritten.
    """
    fin = open(inputFile, 'r')
    fout1 = open(outputFile1, 'w')
    fout2 = open(outputFile2, 'w')
    linecount = 0
    word = word  # no-op in the original; kept byte-identical
    for eachLine in fin:
        linecount = linecount+1
        line = eachLine.strip().decode('utf-8', 'ignore')
        word_a = word.decode('utf-8','ignore')
        if line.find(word_a) >=0 :
            pos = line.find(word_a)
            # Standalone-token case.
            # NOTE(review): unchecked boundaries — line[pos-1] wraps to the
            # END of the line when pos == 0, line[pos+1] and the window
            # [pos-3, pos+4] can raise IndexError on short lines.
            if line[pos+1]==' 'and line[pos-1]==' ':
                i = pos-3
                while i<=pos+4 :
                    fout2.write(line[i].encode('utf-8'))
                    i=i+1
                fout2.write('\n')
            else:
                # Embedded-token case: copy from the character after the
                # previous space through the next space (inclusive, so a
                # trailing space is written), then the line number.
                # NOTE(review): find(' ', pos, -1) excludes the last char,
                # so a token at end-of-line yields npos == -1 and nothing
                # is copied — confirm whether that is intended.
                tpos = line.rfind(' ',0,pos)
                npos = line.find(' ',pos,-1)
                i=tpos+1
                while i<=npos :
                    fout1.write(line[i].encode('utf-8'))
                    i=i+1
                fout1.write(str(linecount))
                fout1.write('\n')
    fin.close()
    fout1.close()
    fout2.close()
def findcharindic(inputFile,outputFile) :
    """Look up each token of *inputFile* in the 'beat.txt' dictionary.

    Input lines are presumably of the form "<token> <lineno>" as produced
    by findchar's outputFile1 (verify against that writer).  Each output
    line is "token lineno flag", where flag is 1 when dealdic finds the
    token in 'beat.txt' and 0 otherwise.
    """
    fin = open(inputFile, 'r')
    fout = open(outputFile, 'w')
    for eachLine in fin:
        linecount = 0
        str1 = ''
        line = eachLine.strip().decode('utf-8', 'ignore')
        i = 0
        # Collect characters up to the first space: the token itself.
        # NOTE(review): raises IndexError if a line contains no space.
        while line[i] != ' ':
            str1 = str1+line[i]
            i = i+1
        # NOTE(review): takes only the single character after the space,
        # so line numbers >= 10 are truncated to their first digit.
        linecount=line[i+1]
        index = dealdic('beat.txt',str1)
        # str1 is already unicode here; .decode works only because of the
        # sys.setdefaultencoding('utf8') hack at the top of the file.
        fout.write(str1.decode('utf-8')+' ')
        fout.write(str(linecount)+' ')
        fout.write(str(index))
        fout.write('\n')
    fin.close()
    fout.close()
def dealdic(inputFile, str1):
    """Check whether *str1* is an entry of the dictionary file *inputFile*.

    Each dictionary line is expected to start with a space-delimited word
    (jieba user-dict style: "word freq pos").

    Args:
        inputFile: path of the dictionary text file, utf-8.
        str1: word to look up (unicode, or utf-8 bytes under Python 2).

    Returns:
        1 if *str1* equals the first field of any line, else 0.
    """
    # ``with`` guarantees the file handle is released even on error.
    with open(inputFile, 'r') as fin:
        target = str1.decode('utf-8')  # relies on the module's Py2 setup
        for eachLine in fin:
            line = eachLine.strip().decode('utf-8', 'ignore')
            # split() replaces the original char-by-char scan, which raised
            # IndexError on lines containing no space.
            entry = line.split(' ', 1)[0]
            if target == entry:
                return 1  # early exit: the original kept scanning the file
    return 0
def stopword(list):
    """Segment each text with jieba and drop stop-word tokens.

    Args:
        list: sequence of utf-8 byte strings.  (The parameter name shadows
            the builtin; kept unchanged for interface compatibility.)

    Returns:
        New list of utf-8 byte strings: for each input text, the kept
        tokens concatenated with no separator.
    """
    # A set is the idiomatic (O(1) membership) replacement for the original
    # {}.fromkeys(...) dict; these are utf-8 byte strings under Python 2.
    stopwords = set(['的', '包括', '等', '是', '@'])
    listout = []
    for text in list:  # direct iteration replaces range(len(...))
        kept = []
        for seg in jieba.cut(text.decode('utf-8'), cut_all=False):
            seg = seg.encode('utf-8')
            if seg not in stopwords:
                kept.append(seg)
        # join() replaces the original quadratic ``+=`` accumulation.
        listout.append(''.join(kept))
    return listout
def oneword(word) :
    """Pipeline driver: read training sentences, strip stop words, segment.

    Reads column 5 of 'traindata.xls' (skipping row 0), removes stop words,
    and writes the space-segmented result to 'wordcut-done.txt'.

    NOTE(review): the *word* argument is never used anywhere in this body —
    presumably findchar(word, ...) was meant to run afterwards; confirm
    against the intended workflow before relying on it.
    """
    list = read_excel(1, 5, 'traindata.xls')
    listout = stopword(list)
    splitSentence(listout, 'wordcut-done.txt')
def main():
    """Script entry point: run the disambiguation pipeline for '打'."""
    oneword('打')


if __name__ == '__main__':
    main()
因不断在测试生成的效果,中间结果都进行了保存,后面可以将过程改得更简练一些。
打王者荣耀 1 0
打几把 3 0
打电话 4 1
打游戏 5 1
棒打 6 1
打打榜 7 0
一打 8 1
打榜 9 0
打个水漂 1 0
打伏笔 1 0
打电话 1 1
打榜 1 0
结果格式如上,分别为词,序号,是否存在于词典
后期需要更新词典。这种方法依赖词典,效果不完全,自动化程度不够高。改进方向:SVM;Hownet; word2vec;依存句法分析