TF-IDF (Term Frequency - Inverse Document Frequency) Algorithm in Python
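
TF-IDF scores a term t in a document d as tf(t, d) * idf(t): how often the term occurs in the document, weighted by how rare the term is across the corpus. The hand-rolled class below uses the smoothed form idf(t) = ln((1 + N) / (1 + df(t))), where N is the number of documents indexed and df(t) is the number of documents containing t; the same computation is then shown with scikit-learn, NLTK, and jieba.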

# -*- coding: utf-8 -*-
import math
import re
import codecs
from operator import itemgetter
import jieba
import jieba.posseg as pseg
import os
import sys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import feature_extraction
from nltk.corpus import stopwords  # may need a one-time nltk.download('stopwords')
stop_words = stopwords.words('english')
class TfIdf:
    def __init__(self, corpus_filename=None, stopword_filename=None,
                 DEFAULT_IDF=1.5):
        self.num_docs = 0
        self.term_num_docs = {}  # term : num_docs_containing_term
        # NLTK's English stopword list is used here regardless of stopword_filename
        self.stopwords = stopwords.words('english')
        self.idf_default = DEFAULT_IDF
        if corpus_filename:
            self.merge_corpus_document(corpus_filename)

    def get_tokens(self, text):
        # Lowercase the input and pull out HTML-like tags and word tokens
        # (letters, digits, underscore, apostrophe, @ and #).
        return re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", text.lower())

    def merge_corpus_document(self, corpus_filename):
        # Merge a saved corpus file: the first line holds the document count,
        # each following line is a "term: num_docs_containing_term" pair.
        with codecs.open(corpus_filename, "r", encoding='utf-8') as corpus_file:
            line = corpus_file.readline()
            self.num_docs += int(line.strip())
            for line in corpus_file:
                tokens = line.rsplit(":", 1)
                term = tokens[0].strip()
                try:
                    frequency = int(tokens[1].strip())
                except IndexError:
                    if not line.strip():
                        # skip blank lines
                        print("line is blank")
                        continue
                    else:
                        raise
                if term in self.term_num_docs:
                    self.term_num_docs[term] += frequency
                else:
                    self.term_num_docs[term] = frequency
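
    # Example of the corpus-file layout merge_corpus_document expects (the
    # numbers and terms below are illustrative; save_corpus_to_file writes
    # exactly this format):
    #     3
    #     i: 3
    #     bioinformatics: 1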

    def add_input_document(self, doc_text):
        # Add one in-memory document: each distinct token raises the document
        # frequency of that term by one.
        self.num_docs += 1
        words = set(self.get_tokens(doc_text))
        for word in words:
            if word in self.term_num_docs:
                self.term_num_docs[word] += 1
            else:
                self.term_num_docs[word] = 1

    def add_plaintext_document(self, plaintext_filename):
        # Add one document stored as a plain-text file on disk.
        self.num_docs += 1
        with codecs.open(plaintext_filename, "r", encoding='utf-8') as input_file:
            # Join lines with a space so words at line breaks are not glued together.
            allline = " ".join(line.strip() for line in input_file)
        tokens_set = set(self.get_tokens(allline.strip()))
        for word in tokens_set:
            if word in self.term_num_docs:
                self.term_num_docs[word] += 1
            else:
                self.term_num_docs[word] = 1

    def save_corpus_to_file(self, idf_filename, stopword_filename,
                            STOPWORD_PERCENTAGE_THRESHOLD=0.01):
        # Persist the corpus in the same format merge_corpus_document reads,
        # and also write the most frequent terms out as a stopword list.
        with codecs.open(idf_filename, "w", encoding='utf-8') as output_file:
            output_file.write(str(self.num_docs) + "\n")
            for term, num_docs in self.term_num_docs.items():
                output_file.write(term + ": " + str(num_docs) + "\n")

        sorted_terms = sorted(self.term_num_docs.items(), key=itemgetter(1),
                              reverse=True)
        with codecs.open(stopword_filename, "w", encoding='utf-8') as stopword_file:
            for term, num_docs in sorted_terms:
                if num_docs < STOPWORD_PERCENTAGE_THRESHOLD * self.num_docs:
                    break
                stopword_file.write(term + "\n")

    def get_num_docs(self):
        return self.num_docs

    def get_idf(self, term):
        # Smoothed inverse document frequency: ln((1 + N) / (1 + df(term))).
        # Stopwords get 0; terms never seen in the corpus get the default IDF.
        if term in self.stopwords:
            return 0
        if term not in self.term_num_docs:
            return self.idf_default
        return math.log(float(1 + self.get_num_docs()) /
                        (1 + self.term_num_docs[term]))
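    # Worked example of the smoothing above (illustrative numbers): with 4 documents
    # indexed and "i" appearing in 3 of them, get_idf("i") = ln((1 + 4) / (1 + 3)) ≈ 0.223.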

    def get_str_keywords(self, curr_doc):
        # Score every distinct token of curr_doc by TF*IDF and return the terms
        # sorted by descending weight. Note that TF here divides the raw count
        # by the number of distinct tokens rather than by the total token count.
        tfidf = {}
        tokens = self.get_tokens(curr_doc)
        tokens_set = set(tokens)
        for word in tokens_set:
            mytf = float(tokens.count(word)) / len(tokens_set)
            myidf = self.get_idf(word)
            tfidf[word] = mytf * myidf
        return sorted(tfidf.items(), key=itemgetter(1), reverse=True)

    def get_str_keywords2(self, curr_doc2):
        # Same scoring as get_str_keywords, but the document is read from a file.
        tfidf = {}
        with open(curr_doc2, 'r', encoding='utf-8') as file_to_read:
            string = file_to_read.read()  # read the whole file, not just the first line
        tokens = self.get_tokens(string)
        tokens_set = set(tokens)
        for word in tokens_set:
            mytf = float(tokens.count(word)) / len(tokens_set)
            myidf = self.get_idf(word)
            tfidf[word] = mytf * myidf
        return sorted(tfidf.items(), key=itemgetter(1), reverse=True)
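
# A minimal, file-free sketch of how the class above can be used: index two
# in-memory documents, then rank the tokens of a new sentence by TF*IDF.
# (The sentences here are illustrative; the file-based demo follows below.)
sketch = TfIdf()
sketch.add_input_document("bioinformatics is applied statistics")
sketch.add_input_document("statistics is everywhere")
print(sketch.get_str_keywords("i love bioinformatics and statistics"))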

print(" We are now building corpus.")
p = TfIdf(corpus_filename='corpus_initial.txt', stopword_filename='stopword.txt',
          DEFAULT_IDF=1.5)
print("We imported ", p.num_docs, " docs from 'corpus_initial.txt'.")
print("The Term_num_docs are", p.term_num_docs)
print('p.get_idf("i"):', p.get_idf("i"), 'p.get_idf("bioinformatics"):', p.get_idf("bioinformatics"))

print("We are trying to add a new corpus in the system.")
p.merge_corpus_document("corpus_added.txt")
print("We imported ", p.num_docs, " docs by adding 'corpus_added.txt'. ")
print("The Term_num_docs are", p.term_num_docs)
print('p.get_idf("i"):', p.get_idf("i"), 'p.get_idf("bioinformatics"):', p.get_idf("bioinformatics"))

print("We are trying to add an input sentence in the system.")
p.add_input_document("I also love statistics.")
print("We imported ", p.num_docs, " docs by adding an input sentence 'I also love statistics'. ")
print("The Term_num_docs are", p.term_num_docs)
print('p.get_idf("i"):', p.get_idf("i"), 'p.get_idf("bioinformatics"):', p.get_idf("bioinformatics"))

print("We are trying to add a local document in the system.")
p.add_plaintext_document('a.txt')
print("We imported ", p.num_docs, " docs by adding the plain text, 'plantext_added.txt'. ")
print("The Term_num_docs are", p.term_num_docs)
print('p.get_idf("i"):', p.get_idf("i"), 'p.get_idf("bioinformatics"):', p.get_idf("bioinformatics"))

print("The idf computation is over, and the corpus construction and management is over.")
print("We are showing the TF*IDF computation next.")
teststr = "I like math and chemistry so much."
print(" For the input sentence '", teststr, "' being a document, ")
out = p.get_str_keywords("I like math and chemistry so much.\n")
print("TFIDF is:", out)
out2 = p.get_str_keywords2('a.txt')
print("TFIDF2 is:", out2)
print("For the input plain text file 'plaintext_TFIDF_comp.txt', please compute the TFIDF.Please start coding from line 200.")
p.get_tokens("Trump will be abandoned by the public. Bio-NLP is cute. BRCA1/2")

# Corpus: each line of 'a.txt' is treated as one document.
string = []
with open('a.txt', 'r', encoding='utf-8') as f:
    string += f.readlines()  # read all lines
# print(string)
# The first fit_transform turns the text into a term-count matrix;
# the second fit_transform converts those counts into TF-IDF weights.
vectorizer = CountVectorizer()
# Count how often each word occurs in each document.
X = vectorizer.fit_transform(string)
# All feature words in the bag-of-words vocabulary
# (use get_feature_names() on scikit-learn older than 1.0).
word = vectorizer.get_feature_names_out()
# print(word)
# Inspect the raw term-count matrix.
print(X.toarray())
transformer = TfidfTransformer()
print(transformer)
# Turn the count matrix X into TF-IDF weights.
tfidf = transformer.fit_transform(X)
# tfidf[i][j] is the TF-IDF weight of word j in document i.
print(tfidf.toarray())
weight = tfidf.toarray()  # dense TF-IDF matrix; weight[i][j] is word j's weight in document i
# Print the TF-IDF weight of every word in every document: the outer loop walks
# the documents, the inner loop walks the vocabulary.
for i in range(len(weight)):
    for j in range(len(word)):
        print(word[j], weight[i][j])
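
# The two-step CountVectorizer + TfidfTransformer pipeline above can also be
# collapsed into a single TfidfVectorizer call; a minimal sketch on the same
# corpus (a convenience wrapper, not a different algorithm):
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer()
tfidf_direct = tfidf_vec.fit_transform(string)
print(tfidf_vec.get_feature_names_out())
print(tfidf_direct.toarray())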
       
from nltk.text import TextCollection
from nltk.tokenize import word_tokenize
# First, build the corpus (word_tokenize may need a one-time nltk.download('punkt')).
sents = ['this is sentence one', 'this is sentence two', 'this is sentence three']
sents = [word_tokenize(sent) for sent in sents]  # tokenize each sentence
print(sents)  # the tokenized sentences
corpus = TextCollection(sents)  # build the text collection
print(corpus)
# TF of "one" over the whole collection: 1 occurrence out of 12 tokens.
tf = corpus.tf('one', corpus)    # 1/12
print(tf)
# IDF of "one": it appears in 1 of the 3 sentences.
idf = corpus.idf('one')          # log(3/1)
print(idf)
# TF-IDF of "one" over the collection.
tf_idf = corpus.tf_idf('one', corpus)
print(tf_idf)
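
# A quick manual check of the NLTK numbers above (based on how TextCollection
# defines them): tf is the raw count divided by the length of the text passed in,
# and idf is log(number of texts / number of texts containing the term).
print(corpus.count('one') / len(corpus))  # should equal corpus.tf('one', corpus)
print(math.log(3 / 1))                    # should equal corpus.idf('one')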

import jieba.analyse
# Keyword extraction with jieba's built-in TF-IDF on a short Chinese passage
# (the passage itself explains what keywords and keyword extraction are).
text = '关键词是能够表达文档中心内容的词语,常用于计算机系统标引论文内容特征、 \
信息检索、系统汇集以供读者检阅。关键词提取是文本挖掘领域的一个分支,是文本检索、\
文档比较、摘要生成、文档分类和聚类等文本挖掘研究的基础性工作'
# topK=5: return the five highest-weighted keywords; withWeight=True: include each weight.
keywords = jieba.analyse.extract_tags(text, topK=5, withWeight=True, allowPOS=())
print(keywords)