TF-IDF关键词抽取Python实现

最新推荐文章于 2024-01-22 12:30:03 发布

小基基o_O

最新推荐文章于 2024-01-22 12:30:03 发布

阅读量2.7k

点赞数 3

分类专栏：自然语言处理　文章标签：python、sklearn、tf-idf

本文链接：https://blog.csdn.net/Yellow_python/article/details/103480098

版权

自然语言处理专栏收录该内容

25 篇文章 42 订阅

订阅专栏

文章目录

原理
调用jieba（免训练）
Python手写
sklearn
gensim
手写、sklearn、gensim结果比较

原理

TF（Term Frequency）：词频

$\mathrm{TF} = \frac{\text{该词在文档中出现的次数}}{\text{文档词语总数}}$

IDF（Inverse Document Frequency）：逆文本频率指数

$\mathrm{IDF} = \log\left(\frac{\text{文档总数}}{\text{出现该词的文档数} + 1}\right)$（分母加 1 用于平滑，避免该词未出现在任何文档时除以零）

调用jieba（免训练）

from jieba.analyse import tfidf
# Sample sentence passed to jieba's built-in TF-IDF extractor.
sentence = '佛山市科技局发布关于发展佛山市人工智能项目的通知'
# Part-of-speech tags shared by the two allowPOS calls below.
pos_filter = ('n', 'ns', 'v', 'vn')
print(tfidf(sentence))  # default call
print(tfidf(sentence, allowPOS=pos_filter))  # filter keywords by part of speech
print(tfidf(sentence, allowPOS=pos_filter, withFlag=True))  # also return the part of speech
print(tfidf(sentence, withWeight=True))  # also return the weight

打印结果

['佛山市', '科技局', '人工智能', '通知', '发布', '关于', '项目', '发展']
['佛山市', '科技局', '人工智能', '通知', '发布', '项目', '发展']
[pair('佛山市', 'ns'), pair('科技局', 'n'), pair('人工智能', 'n'), pair('通知', 'v'), pair('发布', 'v'), pair('项目', 'n'), pair('发展', 'vn')]
[('佛山市', 2.2638012411777777), ('科技局', 1.3454353536333334), ('人工智能', 1.0508918217211112), ('通知', 0.6714233436266667), ('发布', 0.5657954481322222), ('关于', 0.5532763439699999), ('项目', 0.5425367102355555), ('发展', 0.39722939449333333)]

Python手写

from collections import Counter
from math import log10
from re import split
from jieba.posseg import dt

# Part-of-speech flags accepted by `cut`; tokens carrying any other flag are skipped.
FLAGS = {
    'a', 'an', 'b', 'f', 'i', 'j', 'l', 'n', 'nr', 'nrfg', 'nrt', 'ns', 'nt',
    'nz', 's', 't', 'v', 'vi', 'vn', 'z', 'eng',
}

def cut(text):
    """Lazily yield keyword candidates found in ``text``.

    The stripped text is split on every run of characters that are not ASCII
    letters, digits, or in the range U+4E00..U+9FA5. Each piece is segmented
    with ``dt.cut``; a token is yielded only when its word is longer than one
    character and its flag is listed in ``FLAGS``.
    """
    pieces = split('[^a-zA-Z0-9\u4e00-\u9fa5]+', text.strip())
    for piece in pieces:
        yield from (
            token.word
            for token in dt.cut(piece)
            if len(token.word) > 1 and token.flag in FLAGS
        )

class TFIDF:
    """Hand-written TF-IDF keyword extractor.

    ``fit`` learns ``IDF(w) = log10(N / (df(w) + 1))`` from a corpus, where
    ``N`` is the number of documents and ``df(w)`` the number of documents
    containing ``w``. ``extract`` ranks the words of a text by the sum of
    their IDF over every occurrence (i.e. occurrence count times IDF).
    """

    def __init__(self):
        self.idf = None  # dict: word -> IDF, populated by fit()
        self.idf_max = None  # IDF used for words never seen in the corpus

    @classmethod
    def train(cls, texts):
        """Create a model and fit it on ``texts`` in one call.

        Offers the same ``TFIDF.train(texts)`` entry point as the sklearn-
        and gensim-based variants, so the comparison script can drive any of
        the three implementations.
        """
        return cls().fit(texts)

    def fit(self, texts):
        """Learn IDF values from an iterable of raw text documents.

        Args:
            texts: iterable of strings, one per document.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: if ``texts`` contains no documents.
        """
        # Each document is reduced to its set of distinct words, so a word is
        # counted at most once per document.
        docs = [set(cut(text)) for text in texts]
        total = len(docs)
        if not total:
            # log10(0) would otherwise fail with an opaque "math domain error".
            raise ValueError('cannot fit TF-IDF on an empty corpus')
        # Single pass over all documents instead of re-scanning every
        # document for every vocabulary word.
        doc_freq = Counter(word for doc in docs for word in doc)
        self.idf = {
            word: log10(total / (df + 1)) for word, df in doc_freq.items()
        }
        # Same formula with df == 0: the IDF of a word absent from the corpus.
        self.idf_max = log10(total)
        return self

    def get_idf(self, word):
        """Return the learned IDF of ``word``, or ``idf_max`` if it is unknown."""
        return self.idf.get(word, self.idf_max)

    def extract(self, text, top_n=10):
        """Return up to ``top_n`` keywords of ``text``, highest score first.

        Args:
            text: raw text to extract keywords from.
            top_n: maximum number of keywords to return.
        """
        scores = Counter()
        for word in cut(text):
            # Adding the IDF once per occurrence accumulates count * IDF.
            scores[word] += self.get_idf(word)
        return [word for word, _ in scores.most_common(top_n)]

# Toy corpus: five short documents, repeated twice.
corpus = ['奶茶', '巧克力奶茶', '巧克力酸奶', '巧克力', '巧克力'] * 2
tfidf = TFIDF()
tfidf.fit(corpus)
print(tfidf.extract('酸奶巧克力奶茶'))

sklearn

from re import split
from jieba.posseg import dt
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# Part-of-speech flags accepted by `cut`; tokens carrying any other flag are skipped.
FLAGS = {
    'a', 'an', 'b', 'f', 'i', 'j', 'l', 'n', 'nr', 'nrfg', 'nrt', 'ns', 'nt',
    'nz', 's', 't', 'v', 'vi', 'vn', 'z', 'eng',
}

def cut(text):
    """Lazily yield keyword candidates found in ``text``.

    The stripped text is split on every run of characters that are not ASCII
    letters, digits, or in the range U+4E00..U+9FA5. Each piece is segmented
    with ``dt.cut``; a token is yielded only when its word is longer than two
    characters and its flag is listed in ``FLAGS``.
    """
    # NOTE(review): the hand-written variant of this tokenizer keeps words
    # longer than one character; here the threshold is two -- confirm the
    # difference is intended.
    pieces = split('[^a-zA-Z0-9\u4e00-\u9fa5]+', text.strip())
    for piece in pieces:
        for token in dt.cut(piece):
            if len(token.word) <= 2:
                continue
            if token.flag in FLAGS:
                yield token.word

class TFIDF:
    """TF-IDF keyword extractor whose IDF table is learned by scikit-learn.

    ``train`` fits a ``TfidfVectorizer`` and keeps only its word -> IDF
    mapping; ``extract`` ranks the words of a text by the sum of their IDF
    over every occurrence.
    """

    def __init__(self, idf):
        """Store the IDF table.

        Args:
            idf: mapping of word -> IDF value.
        """
        self.idf = idf
        # Fallback IDF for unknown words, computed once here rather than on
        # every lookup; 0.0 when the table is empty.
        self.idf_max = max(idf.values(), default=0.0)

    @classmethod
    def train(cls, texts):
        """Fit a ``TfidfVectorizer`` on ``texts`` and build a model from it.

        Args:
            texts: iterable of raw text documents.
        """
        model = TfidfVectorizer(
            tokenizer=cut,
            # The vectorizer lowercases input by default, while extract()
            # tokenizes the original text; disable it so vocabulary keys and
            # lookup keys are produced the same way.
            lowercase=False,
            # token_pattern is unused when a tokenizer is given; passing None
            # avoids the warning scikit-learn emits about that.
            token_pattern=None,
        )
        model.fit(texts)
        idf = {w: model.idf_[i] for w, i in model.vocabulary_.items()}
        return cls(idf)

    def get_idf(self, word):
        """Return the IDF of ``word``, or the largest known IDF if unseen."""
        return self.idf.get(word, self.idf_max)

    def extract(self, text, top_n=10):
        """Return up to ``top_n`` keywords of ``text``, highest score first.

        Args:
            text: raw text to extract keywords from.
            top_n: maximum number of keywords to return.
        """
        scores = Counter()
        for word in cut(text):
            # Adding the IDF once per occurrence accumulates count * IDF.
            scores[word] += self.get_idf(word)
        return [word for word, _ in scores.most_common(top_n)]

gensim

from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from re import split
from jieba.posseg import dt

# Part-of-speech flags accepted by `lcut`; tokens carrying any other flag are skipped.
FLAGS = {
    'a', 'an', 'b', 'f', 'i', 'j', 'l', 'n', 'nr', 'nrfg', 'nrt', 'ns', 'nt',
    'nz', 's', 't', 'v', 'vi', 'vn', 'z', 'eng',
}

def lcut(text):
    """Return the list of keyword candidates found in ``text``.

    The stripped text is split on every run of characters that are not ASCII
    letters, digits, or in the range U+4E00..U+9FA5. Each piece is segmented
    with ``dt.cut``; a token's word is kept only when it is longer than two
    characters and its flag is listed in ``FLAGS``.
    """
    words = []
    for piece in split('[^a-zA-Z0-9\u4e00-\u9fa5]+', text.strip()):
        for token in dt.cut(piece):
            if len(token.word) > 2 and token.flag in FLAGS:
                words.append(token.word)
    return words

class TFIDF:
    """TF-IDF keyword extractor backed by a gensim dictionary and model."""

    def __init__(self, dictionary, model):
        """Keep the model plus the lookups needed at extraction time.

        Args:
            dictionary: object exposing ``doc2bow`` and ``token2id``.
            model: object indexed with a bag-of-words vector to obtain
                ``(token_id, weight)`` pairs.
        """
        self.model = model
        self.doc2bow = dictionary.doc2bow
        # Invert token -> id so extraction results map back to words.
        id2word = {}
        for word, token_id in dictionary.token2id.items():
            id2word[token_id] = word
        self.id2word = id2word

    @classmethod
    def train(cls, texts):
        """Tokenize ``texts``, build a ``Dictionary`` and fit a ``TfidfModel``.

        Args:
            texts: iterable of raw text documents.
        """
        tokenized = [lcut(text) for text in texts]
        dictionary = Dictionary(tokenized)
        corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
        return cls(dictionary, TfidfModel(corpus))

    def extract(self, text, top_n=10):
        """Return up to ``top_n`` keywords of ``text``, highest weight first.

        Args:
            text: raw text to extract keywords from.
            top_n: maximum number of keywords to return.
        """
        bow = self.doc2bow(lcut(text))
        weighted = self.model[bow]
        ranked = sorted(weighted, key=lambda pair: pair[1], reverse=True)
        names = [self.id2word[token_id] for token_id, _ in ranked]
        return names[:top_n]

手写、sklearn、gensim结果比较

from time import time
t0 = time()  # start of the timed section
with open('policy.txt', encoding='utf-8') as fh:
    raw = fh.read()
# The file content is split into documents on '\n\n'.
_texts = raw.strip().split('\n\n')
tfidf = TFIDF.train(_texts)
for _text in _texts:
    keywords = tfidf.extract(_text)
    print(keywords)
# Elapsed seconds, covering file loading, training, extraction and printing.
print(time() - t0)