TF-IDF Keyword Extraction in Python

Principle

TF (Term Frequency): how often a term occurs within a document

TF = \frac{\text{occurrences of the term}}{\text{total terms in the document}}

IDF (Inverse Document Frequency): how rare a term is across the whole corpus

IDF = \log\left(\frac{\text{total number of documents}}{\text{documents containing the term} + 1}\right)
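To make the two formulas concrete, here is a minimal sketch that applies them to a made-up four-document corpus (the tokens and counts are purely illustrative):

from math import log10

# Toy corpus: 4 pre-tokenized documents (hypothetical data).
docs = [['奶茶'], ['巧克力', '奶茶'], ['巧克力', '酸奶'], ['巧克力']]

def tf(term, doc):
    # TF = occurrences of the term / total terms in the document
    return doc.count(term) / len(doc)

def idf(term, docs):
    # IDF = log(total documents / (documents containing the term + 1))
    df = sum(term in doc for doc in docs)
    return log10(len(docs) / (df + 1))

# '酸奶' occurs in 1 document and '巧克力' in 3, so '酸奶' scores higher in docs[2].
for term in ('巧克力', '酸奶'):
    print(term, tf(term, docs[2]) * idf(term, docs))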

Using jieba (no training required)

from jieba.analyse import tfidf
sentence = '佛山市科技局发布关于发展佛山市人工智能项目的通知'
print(tfidf(sentence))
print(tfidf(sentence, allowPOS=('n', 'ns', 'v', 'vn')))  # filter by part of speech
print(tfidf(sentence, allowPOS=('n', 'ns', 'v', 'vn'), withFlag=True))  # also return POS tags
print(tfidf(sentence, withWeight=True))  # also return weights

Output:

['佛山市', '科技局', '人工智能', '通知', '发布', '关于', '项目', '发展']
['佛山市', '科技局', '人工智能', '通知', '发布', '项目', '发展']
[pair('佛山市', 'ns'), pair('科技局', 'n'), pair('人工智能', 'n'), pair('通知', 'v'), pair('发布', 'v'), pair('项目', 'n'), pair('发展', 'vn')]
[('佛山市', 2.2638012411777777), ('科技局', 1.3454353536333334), ('人工智能', 1.0508918217211112), ('通知', 0.6714233436266667), ('发布', 0.5657954481322222), ('关于', 0.5532763439699999), ('项目', 0.5425367102355555), ('发展', 0.39722939449333333)]
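These weights come from jieba's bundled IDF table, which was built from a general-domain corpus. For domain-specific text, jieba can be pointed at your own IDF table and stop-word list; a minimal sketch, where both file paths are placeholders:

import jieba.analyse

sentence = '佛山市科技局发布关于发展佛山市人工智能项目的通知'
# Swap in a custom IDF table (one "word idf" pair per line) and stop words;
# 'my_idf.txt' and 'my_stopwords.txt' are placeholder paths.
jieba.analyse.set_idf_path('my_idf.txt')
jieba.analyse.set_stop_words('my_stopwords.txt')
print(jieba.analyse.tfidf(sentence, withWeight=True))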

Hand-written Python

from collections import Counter
from math import log10
from re import split
from jieba.posseg import dt

# POS tags worth keeping: nouns, verbs, place/organization names, English words, etc.
FLAGS = set('a an b f i j l n nr nrfg nrt ns nt nz s t v vi vn z eng'.split())

def cut(text):
    # Split on anything that is not a letter, digit or CJK character,
    # then keep words longer than 1 character whose POS tag is in FLAGS.
    for sentence in split('[^a-zA-Z0-9\u4e00-\u9fa5]+', text.strip()):
        for w in dt.cut(sentence):
            if len(w.word) > 1 and w.flag in FLAGS:
                yield w.word

class TFIDF:
    def __init__(self):
        self.idf = None
        self.idf_max = None

    def fit(self, texts):
        # Reduce each document to its set of distinct words.
        texts = [set(cut(text)) for text in texts]
        lent = len(texts)
        words = set(w for t in texts for w in t)
        # IDF = log10(total docs / (docs containing the word + 1))
        self.idf = {w: log10(lent / (sum(w in t for t in texts) + 1)) for w in words}
        # Fallback for out-of-vocabulary words: treat them as maximally rare.
        self.idf_max = log10(lent)
        return self

    def get_idf(self, word):
        return self.idf.get(word, self.idf_max)

    def extract(self, text, top_n=10):
        # Adding the IDF once per occurrence is equivalent to raw TF × IDF.
        counter = Counter()
        for w in cut(text):
            counter[w] += self.get_idf(w)
        return [i[0] for i in counter.most_common(top_n)]

# Fit on a toy corpus of 10 short documents, then extract from a new text.
tfidf = TFIDF().fit(['奶茶', '巧克力奶茶', '巧克力酸奶', '巧克力', '巧克力'] * 2)
print(tfidf.extract('酸奶巧克力奶茶'))
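Given the IDF table this corpus produces (酸奶 appears in 2 of the 10 documents, 奶茶 in 4, 巧克力 in 8), the rarest word should rank first; assuming jieba segments the query into exactly those three words, the expected output is ['酸奶', '奶茶', '巧克力'].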

sklearn

from collections import Counter
from re import split
from jieba.posseg import dt
from sklearn.feature_extraction.text import TfidfVectorizer

FLAGS = set('a an b f i j l n nr nrfg nrt ns nt nz s t v vi vn z eng'.split())

def cut(text):
    # Same tokenizer as above, except only words longer than 2 characters are kept.
    for sentence in split('[^a-zA-Z0-9\u4e00-\u9fa5]+', text.strip()):
        for w in dt.cut(sentence):
            if len(w.word) > 2 and w.flag in FLAGS:
                yield w.word

class TFIDF:
    def __init__(self, idf):
        self.idf = idf

    @classmethod
    def train(cls, texts):
        # Let sklearn build the IDF table; by default it computes
        # idf = ln((1 + n) / (1 + df)) + 1 (smooth_idf=True).
        model = TfidfVectorizer(tokenizer=cut)
        model.fit(texts)
        idf = {w: model.idf_[i] for w, i in model.vocabulary_.items()}
        return cls(idf)

    def get_idf(self, word):
        # Out-of-vocabulary words get the largest IDF seen in training.
        return self.idf.get(word, max(self.idf.values()))

    def extract(self, text, top_n=10):
        counter = Counter()
        for w in cut(text):
            counter[w] += self.get_idf(w)
        return [i[0] for i in counter.most_common(top_n)]
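A hypothetical usage sketch, mirroring the hand-written demo (the corpus below is made up, and this version's cut() drops words of 2 characters or fewer, such as 项目 and 通知):

# Hypothetical corpus: only words longer than 2 characters, e.g.
# '人工智能' and '科技局', survive the tokenizer.
tfidf = TFIDF.train(['发展人工智能', '人工智能项目', '科技局通知', '科技局项目'] * 2)
print(tfidf.extract('科技局发布人工智能项目的通知'))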

gensim

from re import split
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from jieba.posseg import dt

FLAGS = set('a an b f i j l n nr nrfg nrt ns nt nz s t v vi vn z eng'.split())

def lcut(text):
    # List-returning variant of the tokenizer (gensim consumes whole token lists).
    return [
        w.word for sentence in split('[^a-zA-Z0-9\u4e00-\u9fa5]+', text.strip())
        for w in dt.cut(sentence) if len(w.word) > 2 and w.flag in FLAGS]

class TFIDF:
    def __init__(self, dictionary, model):
        self.model = model
        self.doc2bow = dictionary.doc2bow
        self.id2word = {i: w for w, i in dictionary.token2id.items()}

    @classmethod
    def train(cls, texts):
        texts = [lcut(text) for text in texts]
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        # gensim's TfidfModel defaults to a log2-based IDF with L2 normalization.
        model = TfidfModel(corpus)
        return cls(dictionary, model)

    def extract(self, text, top_n=10):
        # Turn the text into bag-of-words, score it, then sort by weight.
        vector = self.doc2bow(lcut(text))
        key_words = sorted(self.model[vector], key=lambda x: x[1], reverse=True)
        return [self.id2word[i] for i, _ in key_words][:top_n]
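The same hypothetical sketch carries over unchanged, since train() and extract() share their signatures with the sklearn version:

# Same made-up corpus as the sklearn sketch above.
tfidf = TFIDF.train(['发展人工智能', '人工智能项目', '科技局通知', '科技局项目'] * 2)
print(tfidf.extract('科技局发布人工智能项目的通知'))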

Comparing the hand-written, sklearn and gensim versions

from time import time

t0 = time()
# One document per blank-line-separated block. For the hand-written class,
# use TFIDF().fit(_texts) instead of TFIDF.train(_texts).
with open('policy.txt', encoding='utf-8') as f:
    _texts = f.read().strip().split('\n\n')
tfidf = TFIDF.train(_texts)
for _text in _texts:
    print(tfidf.extract(_text))
print(time() - t0)

Results:

Speed: hand-written ≈ gensim > sklearn
Keywords: all three agree on the top-2 keywords; lower-ranked keywords differ somewhat
