词语消歧工作即根据一句话或一段话中的上下文,对某些存在多个意思的词语,自动地选择出适合当前语境的词义。例如:打人、打车、打基础,均属于"打"字的不同意义。
基础阶段采用的思路是:爬虫 → 去停用词 → 分词 → 建立字典 → 抓取文本中关键词 → 参照字典对比 → 返回标签。以下代码可以实现对"打"字的区分。分词工具用的是结巴(jieba);字典仅采用了结巴分词自带的字典,尚未完全建立。
import xlrd
from xlutils.copy import copy
import jieba
import sys
reload(sys)
sys.setdefaultencoding('utf8')
jieba.load_userdict("user.dict")
def read_excel(row, col, name, root="D:\\pycharm\\wordcut\\"):
    """Read one column of the first sheet of an .xls workbook.

    Args:
        row: first 0-based row to read; earlier rows (e.g. a header) are skipped.
        col: 0-based column index to read.
        name: workbook file name, joined onto *root*.
        root: directory containing the workbook.  Parameterized from the
            original hard-coded constant; the default keeps old behavior.

    Returns:
        List of cell values from sheet 0, rows [row, nrows).
    """
    path = root + name  # renamed from ``dir``, which shadowed the builtin
    print(path)
    sheet = xlrd.open_workbook(path).sheet_by_index(0)
    # ``ncols`` was computed but never used in the original; dropped.
    return [sheet.cell(r, col).value for r in range(row, sheet.nrows)]
def savedata(list, name, root="D:\\pycharm\\wordcut\\"):
    """Write each string in *list* into column 0 of an existing .xls file.

    The workbook ``root + name`` must already exist: it is opened with xlrd,
    copied with xlutils (xlrd workbooks are read-only), and saved in place.

    Args:
        list: sequence of utf-8 byte strings, one per row.  (The parameter
            name shadows the builtin; kept unchanged for interface
            compatibility with existing callers.)
        name: workbook file name.
        root: directory containing the workbook.  Parameterized from the
            original hard-coded constant; the default keeps old behavior.
    """
    path = root + name
    print(path)
    wb = copy(xlrd.open_workbook(path))
    ws = wb.get_sheet(0)
    for i, text in enumerate(list):  # enumerate replaces range(len(...))
        # .decode('utf-8') assumes Python 2 byte strings (see module setup).
        ws.write(i, 0, text.decode('utf-8'))
    wb.save(path)
    print(u'数据保存成功')
def splitSentence(list1, outputFile):
    """Segment each sentence with jieba and write one line per sentence.

    Each output line contains the sentence's tokens joined by single
    spaces, utf-8 encoded.

    Args:
        list1: sequence of utf-8 byte strings (Python 2).
        outputFile: path of the text file to (over)write.
    """
    # ``with`` guarantees the file is closed even if jieba raises.
    with open(outputFile, 'w') as fout:
        for sentence in list1:
            tokens = jieba.cut(sentence.decode('utf-8'))
            # join() replaces the original quadratic ``+=`` accumulation
            # followed by strip(); the result is identical.
            fout.write(' '.join(tokens).encode('utf-8') + '\n')
def findchar(word,inputFile, outputFile1,outputFile2):
    """Scan a space-segmented text file for *word* and route hits to two files.

    For each input line containing *word*:
      - if the hit is surrounded by spaces (i.e. it was segmented as a
        standalone token), a fixed window of characters around it goes to
        *outputFile2*;
      - otherwise the whole enclosing token (previous space to next space)
        plus the 1-based line number goes to *outputFile1*.

    Args:
        word: utf-8 byte string to search for.
        inputFile: segmented text (tokens separated by spaces), utf-8.
        outputFile1 / outputFile2: destination paths, overwritten.
    """
    fin = open(inputFile, 'r')
    fout1 = open(outputFile1, 'w')
    fout2 = open(outputFile2, 'w')
    linecount = 0
    word = word  # no-op in the original; kept byte-identical
    for eachLine in fin:
        linecount = linecount+1
        line = eachLine.strip().decode('utf-8', 'ignore')
        word_a = word.decode('utf-8','ignore')
        if line.find(word_a) >=0 :
            pos = line.find(word_a)
            # Standalone-token case.
            # NOTE(review): unchecked boundaries — line[pos-1] wraps to the
            # END of the line when pos == 0, line[pos+1] and the window
            # [pos-3, pos+4] can raise IndexError on short lines.
            if line[pos+1]==' 'and line[pos-1]==' ':
                i = pos-3
                while i<=pos+4 :
                    fout2.write(line[i].encode('utf-8'))
                    i=i+1
                fout2.write('\n')
            else:
                # Embedded-token case: copy from the character after the
                # previous space through the next space (inclusive, so a
                # trailing space is written), then the line number.
                # NOTE(review): find(' ', pos, -1) excludes the last char,
                # so a token at end-of-line yields npos == -1 and nothing
                # is copied — confirm whether that is intended.
                tpos = line.rfind(' ',0,pos)
                npos = line.find(' ',pos,-1)
                i=tpos+1
                while i<=npos :
                    fout1.write(line[i].encode('utf-8'))
                    i=i+1
                fout1.write(str(linecount))
                fout1.write('\n')
    fin.close()
    fout1.close()
    fout2.close()
def findcharindic(inputFile,outputFile) :
    """Look up each token of *inputFile* in the 'beat.txt' dictionary.

    Input lines are presumably of the form "<token> <lineno>" as produced
    by findchar's outputFile1 (verify against that writer).  Each output
    line is "token lineno flag", where flag is 1 when dealdic finds the
    token in 'beat.txt' and 0 otherwise.
    """
    fin = open(inputFile, 'r')
    fout = open(outputFile, 'w')
    for eachLine in fin:
        linecount = 0
        str1 = ''
        line = eachLine.strip().decode('utf-8', 'ignore')
        i = 0
        # Collect characters up to the first space: the token itself.
        # NOTE(review): raises IndexError if a line contains no space.
        while line[i] != ' ':
            str1 = str1+line[i]
            i = i+1
        # NOTE(review): takes only the single character after the space,
        # so line numbers >= 10 are truncated to their first digit.
        linecount=line[i+1]
        index = dealdic('beat.txt',str1)
        # str1 is already unicode here; .decode works only because of the
        # sys.setdefaultencoding('utf8') hack at the top of the file.
        fout.write(str1.decode('utf-8')+' ')
        fout.write(str(linecount)+' ')
        fout.write(str(index))
        fout.write('\n')
    fin.close()
    fout.close()
def dealdic(inputFile, str1):
    """Check whether *str1* is an entry of the dictionary file *inputFile*.

    Each dictionary line is expected to start with a space-delimited word
    (jieba user-dict style: "word freq pos").

    Args:
        inputFile: path of the dictionary text file, utf-8.
        str1: word to look up (unicode, or utf-8 bytes under Python 2).

    Returns:
        1 if *str1* equals the first field of any line, else 0.
    """
    # ``with`` guarantees the file handle is released even on error.
    with open(inputFile, 'r') as fin:
        target = str1.decode('utf-8')  # relies on the module's Py2 setup
        for eachLine in fin:
            line = eachLine.strip().decode('utf-8', 'ignore')
            # split() replaces the original char-by-char scan, which raised
            # IndexError on lines containing no space.
            entry = line.split(' ', 1)[0]
            if target == entry:
                return 1  # early exit: the original kept scanning the file
    return 0
def stopword(list):
    """Segment each text with jieba and drop stop-word tokens.

    Args:
        list: sequence of utf-8 byte strings.  (The parameter name shadows
            the builtin; kept unchanged for interface compatibility.)

    Returns:
        New list of utf-8 byte strings: for each input text, the kept
        tokens concatenated with no separator.
    """
    # A set is the idiomatic (O(1) membership) replacement for the original
    # {}.fromkeys(...) dict; these are utf-8 byte strings under Python 2.
    stopwords = set(['的', '包括', '等', '是', '@'])
    listout = []
    for text in list:  # direct iteration replaces range(len(...))
        kept = []
        for seg in jieba.cut(text.decode('utf-8'), cut_all=False):
            seg = seg.encode('utf-8')
            if seg not in stopwords:
                kept.append(seg)
        # join() replaces the original quadratic ``+=`` accumulation.
        listout.append(''.join(kept))
    return listout
def oneword(word) :
    """Pipeline driver: read training sentences, strip stop words, segment.

    Reads column 5 of 'traindata.xls' (skipping row 0), removes stop words,
    and writes the space-segmented result to 'wordcut-done.txt'.

    NOTE(review): the *word* argument is never used anywhere in this body —
    presumably findchar(word, ...) was meant to run afterwards; confirm
    against the intended workflow before relying on it.
    """
    list = read_excel(1, 5, 'traindata.xls')
    listout = stopword(list)
    splitSentence(listout, 'wordcut-done.txt')
def main():
    """Script entry point: run the disambiguation pipeline for '打'."""
    oneword('打')


if __name__ == '__main__':
    main()
因不断在测试生成的效果,中间结果都进行了保存,后面可以将过程改得更简练一些。
打王者荣耀 1 0
打几把 3 0
打电话 4 1
打游戏 5 1
棒打 6 1
打打榜 7 0
一打 8 1
打榜 9 0
打个水漂 1 0
打伏笔 1 0
打电话 1 1
打榜 1 0
结果格式如上,分别为词,序号,是否存在于词典
后期需要更新词典。这种方法依赖词典,效果不完全,自动化程度不够高。改进方向:SVM;Hownet; word2vec;依存句法分析