RAKE Chinese Word Segmentation and Keyword Extraction

import jieba.posseg as pseg
import operator
import json


# Data structure holding per-word statistics: freq (occurrence count)
# and deg (co-occurrence degree)
class Word:
    def __init__(self, char, freq=0, deg=0):
        self.freq = freq
        self.deg = deg
        self.char = char

    def returnScore(self):
        # RAKE word score: degree divided by frequency
        return self.deg / self.freq

    def updateOccur(self, phraseLength):
        # Each occurrence inside a phrase of length n adds n to the degree
        self.freq += 1
        self.deg += phraseLength

    def getChar(self):
        return self.char

    def updateFreq(self):
        self.freq += 1

    def getFreq(self):
        return self.freq

# Return True if the string contains no ASCII letters or digits
def notNumStr(instr):
    for item in instr:
        if '\u0041' <= item <= '\u005a' or '\u0061' <= item <= '\u007a' or item.isdigit():
            return False
    return True

# Read a test case from a JSON file and concatenate its text fields
def readSingleTestCases(testFile):
    with open(testFile, encoding='utf-8') as json_data:
        try:
            testData = json.load(json_data)
        except ValueError:
            # Deal with malformed JSON that uses ' instead of "
            json_data.seek(0)
            data = json_data.read().replace("'", '"')
            try:
                testData = json.loads(data)
            except ValueError:
                # Empty or unrecoverable transcript file
                return ""
    returnString = ""
    for item in testData:
        try:
            returnString += item['text']
        except KeyError:
            returnString += item['statement']
    return returnString

def run(rawText):
    # Load the stopword list (one word per line)
    with open(r"sp.txt", 'r', encoding='utf-8') as f:
        swLibList = [line.rstrip('\n') for line in f]
    # Load the phrase-delimiter word list
    with open(r"spw.txt", 'r', encoding='utf-8') as f:
        conjLibList = [line.rstrip('\n') for line in f]

    # Segment the text with jieba, keeping part-of-speech tags
    rawtextList = pseg.cut(rawText)

    # Build a flat token list in which '|' marks phrase boundaries
    textList = []
    listofSingleWord = dict()
    lastWord = ''
    # jieba POS tags treated as delimiters (numerals, non-words, auxiliaries,
    # verbs, locatives, ...)
    poSPrty = ['m', 'x', 'uj', 'ul', 'mq', 'u', 'v', 'f']
    meaningfulCount = 0
    for eachWord, flag in rawtextList:
        # Delimiter words, alphanumeric tokens, stopwords, filtered POS tags
        # and newlines all terminate the current phrase
        if eachWord in conjLibList or not notNumStr(eachWord) or eachWord in swLibList or flag in poSPrty or eachWord == '\n':
            if lastWord != '|':
                textList.append("|")
                lastWord = "|"
        else:
            textList.append(eachWord)
            meaningfulCount += 1
            if eachWord not in listofSingleWord:
                listofSingleWord[eachWord] = Word(eachWord)
            lastWord = ''

    # Group consecutive tokens between '|' markers into phrases
    newList = []
    tempList = []
    for everyWord in textList:
        if everyWord != '|':
            tempList.append(everyWord)
        elif tempList:
            newList.append(tempList)
            tempList = []
    if tempList:
        # Flush the final phrase if the text did not end on a delimiter
        newList.append(tempList)

    # Pre-register every phrase key ('word1|word2|...') so its frequency is
    # counted from the first occurrence onwards
    tempStr = ''
    for everyWord in textList:
        if everyWord != '|':
            tempStr += everyWord + '|'
        else:
            if tempStr and tempStr[:-1] not in listofSingleWord:
                listofSingleWord[tempStr[:-1]] = Word(tempStr[:-1])
            tempStr = ''

    # Accumulate each word's freq/degree and each phrase's occurrence count
    for everyPhrase in newList:
        res = ''
        for everyWord in everyPhrase:
            listofSingleWord[everyWord].updateOccur(len(everyPhrase))
            res += everyWord + '|'
        phraseKey = res[:-1]
        if phraseKey not in listofSingleWord:
            listofSingleWord[phraseKey] = Word(phraseKey)
        else:
            listofSingleWord[phraseKey].updateFreq()

    # Score each phrase as the sum of its words' deg/freq scores
    outputList = dict()
    for everyPhrase in newList:
        # Ignore overly long phrases
        if len(everyPhrase) > 5:
            continue
        score = 0
        phraseString = ''
        outStr = ''
        for everyWord in everyPhrase:
            score += listofSingleWord[everyWord].returnScore()
            phraseString += everyWord + '|'
            outStr += everyWord
        phraseKey = phraseString[:-1]
        freq = listofSingleWord[phraseKey].getFreq()
        # Drop phrases that are rare both relatively and absolutely
        if freq / meaningfulCount < 0.01 and freq < 3:
            continue
        outputList[outStr] = score

    # Return the ten highest-scoring phrases
    sorted_list = sorted(outputList.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:10]

if __name__ == '__main__':
    # Extract keywords from the first 100 lines of a sample document
    with open(r'E:\xkkAI\dazhao\industry\行业_34_.txt', 'r', encoding='utf-8') as fp:
        text = ''
        for i in range(100):
            text += fp.readline()
        print(text)
        result = run(text)
        print(result)

Introduction to RAKE (Rapid Automatic Keyword Extraction)

The Idea of the RAKE Algorithm

  The RAKE algorithm is used for keyword extraction. What it actually extracts are key phrases, with a preference for longer phrases. In English, keywords usually consist of several words, but they rarely contain punctuation or stopwords such as "and", "the", and "of", or other words that carry no semantic information.

  RAKE first uses punctuation marks (half-width periods, question marks, exclamation marks, commas, and so on) to split a document into sentences. Then, for each sentence, it uses stopwords as delimiters to split the sentence into phrases, and these phrases become the candidates for the final extracted keywords.
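  As a quick illustration of this candidate-generation step, here is a minimal sketch for English text (the sentence, punctuation set, and stopword list are invented for the example; the implementation above uses jieba and stopword files instead):

import re

def candidate_phrases(text, stopwords):
    # Split into sentences on punctuation, then into phrases on stopwords
    phrases = []
    for sentence in re.split(r'[.,!?;]', text.lower()):
        current = []
        for word in sentence.split():
            if word in stopwords:
                if current:
                    phrases.append(' '.join(current))
                current = []
            else:
                current.append(word)
        if current:
            phrases.append(' '.join(current))
    return phrases

stop = {'and', 'the', 'of', 'is', 'a'}
print(candidate_phrases('Keyword extraction is the task of finding key phrases', stop))
# ['keyword extraction', 'task', 'finding key phrases']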

  Note that each phrase can be further split into words on whitespace. By assigning each word a score, the score of a phrase can be obtained by summing the scores of its words. A key point is to take into account the co-occurrence relations among the words within a phrase. The final formula is:

  • wordScore = wordDegree(w) / wordFrequency(w)

That is, the score of a word w is its degree (a concept from graph theory: the degree increases by 1 for every word it co-occurs with in a phrase, including the word itself) divided by its word frequency (the total number of times w occurs in the document).
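  As a toy example of the formula (with invented candidate phrases), suppose the candidates are "deep learning" and "deep learning rate". Then "rate" appears once inside a three-word phrase, so its degree 3 exceeds its frequency 1:

# Toy degree/frequency computation over invented candidate phrases
phrases = [['deep', 'learning'], ['deep', 'learning', 'rate']]
freq, deg = {}, {}
for p in phrases:
    for w in p:
        freq[w] = freq.get(w, 0) + 1
        deg[w] = deg.get(w, 0) + len(p)   # each occurrence adds the phrase length
score = {w: deg[w] / freq[w] for w in freq}
print(score)   # {'deep': 2.5, 'learning': 2.5, 'rate': 3.0}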

  Then, for each candidate key phrase, the scores of its words are summed and the phrases are ranked; RAKE takes the top third of all candidate phrases as the extracted keywords.
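  The ranking step can be sketched in the same spirit (a standalone toy example; the phrase lists are invented):

# Rank candidate phrases by summed word scores and keep the top third
def rake_rank(phrases):
    freq, deg = {}, {}
    for p in phrases:
        for w in p:
            freq[w] = freq.get(w, 0) + 1
            deg[w] = deg.get(w, 0) + len(p)
    scored = {' '.join(p): sum(deg[w] / freq[w] for w in p) for p in phrases}
    ranked = sorted(scored.items(), key=lambda kv: kv[1], reverse=True)
    return ranked[:max(1, len(ranked) // 3)]

print(rake_rank([['deep', 'learning'], ['deep', 'learning', 'rate'], ['batch', 'size']]))
# [('deep learning rate', 8.0)]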

  It is also worth noting that, in the score computation, wordDegree(w) is actually equal to the number of times w co-occurs with the words of every phrase, plus the frequency of w itself. For the details of the algorithm, see the attached paper, "Automatic Keyword Extraction from Individual Documents".

 
