jieba源碼研讀筆記（十六） - 關鍵詞提取之tfidf.py檔初探

最新推荐文章于 2022-04-19 13:26:00 发布

keineahnung2345

最新推荐文章于 2022-04-19 13:26:00 发布

阅读量956

点赞数

分类专栏：機器學習、NLP、jieba源碼研讀筆記　文章标签：jieba、nlp

本文链接：https://blog.csdn.net/keineahnung2345/article/details/88125671

版权

機器學習同时被 3 个专栏收录

23 篇文章 0 订阅

订阅专栏

NLP

18 篇文章 0 订阅

订阅专栏

jieba源碼研讀筆記

18 篇文章 2 订阅

订阅专栏

前言

jieba支持使用兩種算法做關鍵詞提取，包括TF-IDF及TextRank。
其中TF-IDF算法主要是在jieba/analyse/tfidf.py這個檔案中完成。
本篇將會介紹tfidf.py這個檔案的架構。

定義全局變數

# encoding=utf-8
from __future__ import absolute_import
import os
import jieba
import jieba.posseg
from operator import itemgetter

# Similar to get_module_res in _compat.py, except that get_module_res
# returns an opened file whereas _get_module_path returns only the path.
def _get_module_path(path):
    """Return the normalized location of *path* relative to this module.

    The current working directory, this module's directory and *path* are
    joined in that order and the result is passed through os.path.normpath.
    """
    module_dir = os.path.dirname(__file__)
    joined = os.path.join(os.getcwd(), module_dir, path)
    return os.path.normpath(joined)
# Local alias for jieba._get_abs_path, used below when loading stop words.
_get_abs_path = jieba._get_abs_path

# Path of the default IDF table: "idf.txt" resolved through _get_module_path,
# i.e. relative to the directory that contains this module.
DEFAULT_IDF = _get_module_path("idf.txt")

測試_get_module_path：

# Show the resolved path of the default IDF file (sample output on the right).
print(DEFAULT_IDF) # D:\xxx\xxx\jieba\jieba\analyse\idf.txt

KeywordExtractor類別

class KeywordExtractor(object):
    """Base class for keyword extractors: shared stop words plus the API stub.

    Subclasses are expected to implement extract_tags.

    NOTE(review): set_stop_words mutates ``self.stop_words``, which this class
    never creates; presumably each subclass initialises it (e.g. as a copy of
    STOP_WORDS) in its own __init__ -- confirm against the subclasses.
    """

    # Built-in English stop words.
    STOP_WORDS = set((
        "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
        "by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
        "this", "then", "at", "have", "all", "not", "one", "has", "or"
    ))

    def set_stop_words(self, stop_words_path):
        """Add every line of a UTF-8 stop-word file to ``self.stop_words``.

        Args:
            stop_words_path: path of the stop-word file, one entry per line;
                it is passed through _get_abs_path before being opened.

        Raises:
            Exception: if the resolved path is not an existing file.
        """
        abs_path = _get_abs_path(stop_words_path)
        if not os.path.isfile(abs_path):
            raise Exception("jieba: file does not exist: " + abs_path)
        # The context manager closes the handle even when reading or decoding
        # fails; the previous bare open(...).read() left that to the GC.
        with open(abs_path, 'rb') as stop_words_file:
            content = stop_words_file.read().decode('utf-8')
        self.stop_words.update(content.splitlines())

    def extract_tags(self, *args, **kwargs):
        """Extract keywords; must be overridden by subclasses."""
        raise NotImplementedError

測試set_stop_words：

import jieba.analyse

# jieba.analyse.set_stop_words calls
# jieba.analyse.default_tfidf.set_stop_words. default_tfidf is a TFIDF
# instance and TFIDF is a subclass of KeywordExtractor, so the method that
# ends up running is KeywordExtractor.set_stop_words.

default_tfidf = jieba.analyse.default_tfidf
# Take snapshots with set(...): a plain reference would keep pointing at the
# live default_tfidf.stop_words set and change together with it.
sw_before = set(default_tfidf.stop_words)
jieba.analyse.set_stop_words('./extra_dict/stop_words.txt')
sw_after = set(default_tfidf.stop_words)
print(sw_before) # by default this is KeywordExtractor.STOP_WORDS
# {'all', 'in', 'have', 'we', 'then', 'for', 'an', 'you', 'with', 'it', 'on', 'this', 'can', 'and', 'or', 'the', 'not', 'to', 'of', 'be', 'by', 'has', 'are', 'which', 'from', 'as', 'one', 'that', 'if', 'is', 'at'}
newly_added = sw_after - sw_before
print(newly_added)
# {'了', '你們', '都', '我們', '與', '沒有', '他們', '著', '她們', '是', '妳們', '的', '或', '是否', '和', '就', '一個', '及', '而'}

IDFLoader類別

class IDFLoader(object):
    """Load a word -> IDF table from a text file and keep its median IDF.

    The file is read as UTF-8 with one "<word> <idf>" pair per line, the two
    fields separated by a single space.
    """

    def __init__(self, idf_path=None):
        """Create an empty loader and, if *idf_path* is given, load it.

        Args:
            idf_path: optional path of the IDF file to load immediately.
        """
        # Path of the table currently loaded ("" until a load succeeds).
        self.path = ""
        # Maps each word to its IDF value (not to a raw frequency).
        self.idf_freq = {}
        # Median of the loaded IDF values; 0.0 while nothing is loaded.
        self.median_idf = 0.0
        if idf_path:
            self.set_new_path(idf_path)

    def set_new_path(self, new_idf_path):
        """Load the IDF table at *new_idf_path* unless it is already loaded.

        Blank lines are skipped. The loader's state is replaced only after
        the whole file has been parsed, so a failed load leaves the previous
        table in place and can simply be retried with the same path.

        Args:
            new_idf_path: path of the IDF file to load.

        Raises:
            IOError/OSError: if the file cannot be opened.
            ValueError: if a non-blank line is not "<word> <float>".
        """
        if self.path == new_idf_path:
            return
        # The context manager guarantees the handle is closed.
        with open(new_idf_path, 'rb') as idf_file:
            content = idf_file.read().decode('utf-8')
        idf_freq = {}
        for line in content.splitlines():
            entry = line.strip()
            if not entry:
                # Tolerate empty lines instead of failing on the unpack below.
                continue
            word, freq = entry.split(' ')
            idf_freq[word] = float(freq)
        if idf_freq:
            # Upper median: the element at index len // 2 of the sorted values.
            median_idf = sorted(idf_freq.values())[len(idf_freq) // 2]
        else:
            # An empty table has no median; fall back to the initial value.
            median_idf = 0.0
        self.idf_freq = idf_freq
        self.median_idf = median_idf
        # Record the path last so that an exception above does not turn the
        # next call with the same path into a no-op.
        self.path = new_idf_path

    def get_idf(self):
        """Return the tuple ``(idf_freq, median_idf)``."""
        return self.idf_freq, self.median_idf

測試set_new_path：

import jieba.analyse

# jieba.analyse.set_idf_path refers to default_tfidf.set_idf_path, where
# default_tfidf is an instance of the TFIDF class; that method in turn calls
# self.idf_loader.set_new_path.
jieba.analyse.set_idf_path('./jieba/analyse/idf.txt')
default_tfidf = jieba.analyse.default_tfidf
median_idf = default_tfidf.median_idf
print(median_idf) #11.9547675029

idf_freq = default_tfidf.idf_freq
lowest_idf = sorted(idf_freq.items(), key=lambda pair: pair[1])[:10]
print(lowest_idf)
# Lowest IDF, i.e. the words that appear in the most documents
"""
[('的', 0.88474202619),
 ('在', 1.51197516384),
 ('了', 1.51982850882),
 ('是', 1.69402517458),
 ('和', 1.82606426172),
 ('也', 2.08395811277),
 ('有', 2.21471526602),
 ('他', 2.3719182087),
 ('为', 2.44163831356),
 ('就', 2.50115607514)]
"""

TFIDF類別

TFIDF類包含了extract_tags這個負責實現核心算法的函數。
這個類別與上述兩個類別有何關聯呢?

TFIDF類是KeywordExtractor的子類別。
TFIDF有一個名為idf_loader的屬性，它是一個IDFLoader類的物件。

以下是它所擁有的函數：

class TFIDF(KeywordExtractor):
    # Outline only: every method body is elided ("#...") in this overview.
    # According to the article text, a TFIDF object keeps an IDFLoader
    # instance in its idf_loader attribute.

    def __init__(self, idf_path=None):
        #...

    def set_idf_path(self, idf_path):
        # According to the article text, this calls
        # self.idf_loader.set_new_path (body not shown).
        #...

    def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
        # Holds the core keyword-extraction algorithm (body not shown).
        #...