[数据挖掘] 50 文本挖掘 1：项-文档矩阵与 TF-IDF（词频-逆文本频率）

波哥大与金迪奥

已于 2023-08-16 13:23:25 修改

阅读量131

点赞数

分类专栏：数据挖掘与机器学习　文章标签：数据挖掘、矩阵、tf-idf

于 2023-08-08 23:48:07 首次发布

本文链接：https://blog.csdn.net/uforfor1/article/details/132178726

版权

数据挖掘与机器学习专栏收录该内容

8 篇文章 2 订阅

订阅专栏

[数据挖掘] 50 文本挖掘 1 项-文档矩阵与 TF-IDF(词频-逆文本频率)_哔哩哔哩_bilibili

[数据挖掘] 51 文本挖掘 2 逻辑回归分类任务汽车和电子产品论坛文本数据

from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
import matplotlib.pyplot as plt
from dmba import printTermDocumentMatrix,classificationSummary,gainsChart,liftChart
nltk.download('punkt')

# Term-document matrix: vectorize three short sentences with the default
# CountVectorizer settings and print the resulting counts.
text = [
    'this is the first sentence.',
    'this is a second sentence.',
    'the third sentence is here.',
]
count_vect = CountVectorizer()
counts = count_vect.fit_transform(text)
printTermDocumentMatrix(count_vect, counts)

# Non-standard documents: the same kind of input, now with punctuation, an
# emoticon, a capitalized word, trailing whitespace and a plural form, still
# vectorized with the default CountVectorizer settings.
text = [
    'this is the first sentence!!',
    'this is a second Sentence :)',
    'the third sentence, is here ',
    'forth of all sentences',
]
count_vect = CountVectorizer()
counts = count_vect.fit_transform(text)
printTermDocumentMatrix(count_vect, counts)

# Text preprocessing: override the token pattern so that a token is any run
# of ASCII letters, '!', ':' or ')' (this keeps "!!" and ":)" as terms).
text = [
    'this is the first sentence!!',
    'this is a second sentence :)',
    'the third sentence, is here ',
    'forth of all sentences',
]
count_vect = CountVectorizer(token_pattern='[a-zA-Z!:)]+')
counts = count_vect.fit_transform(text)
printTermDocumentMatrix(count_vect, counts)

# Text reduction and tokenization: preview the start of scikit-learn's
# built-in English stop-word list as a fixed-width grid.
sw = sorted(ENGLISH_STOP_WORDS)  # sorted() already returns a list
ncolumns = 10
nrows = 6
print(f"First {ncolumns*nrows} of {len(sw)} stopwords")
# Slice the preview once instead of rebuilding it inside range().
shown = sw[:ncolumns * nrows]
for i in range(0, len(shown), ncolumns):
    # Pad every word to 13 characters so the columns line up.
    print(''.join(word.ljust(13) for word in shown[i:i + ncolumns]))

# Text reduction applied to the tokenized text
text = ['this is the first sentence!!',
        'this is a second sentence :)',
        'the third sentence, is here ',
        'forth of all sentences']


class LemmaTokenizer:
    """Callable tokenizer that drops noise tokens and stems what is left.

    Despite its name, the class reduces words with a stemmer rather than a
    lemmatizer. An instance can be passed as ``tokenizer=`` to
    ``CountVectorizer``.

    Args:
        stemmer: Object exposing ``stem(word) -> str``. Defaults to a new
            ``EnglishStemmer()``.
        stop_words: Iterable of words to discard. Defaults to
            ``ENGLISH_STOP_WORDS``. Entries are stored lower-cased.
        tokenize: Callable turning a document string into an iterable of
            token strings. Defaults to ``word_tokenize``.
    """

    def __init__(self, stemmer=None, stop_words=None, tokenize=None):
        self.stemmer = EnglishStemmer() if stemmer is None else stemmer
        chosen = ENGLISH_STOP_WORDS if stop_words is None else stop_words
        self.sw = {word.lower() for word in chosen}
        self.tokenize = word_tokenize if tokenize is None else tokenize

    def __call__(self, doc):
        """Return the stemmed tokens of *doc*.

        Tokens that are not purely alphabetic, or that are stop words, are
        skipped. The stop-word test is case-insensitive so the tokenizer
        behaves the same whether or not the caller lower-cased the text.
        """
        return [self.stemmer.stem(t) for t in self.tokenize(doc)
                if t.isalpha() and t.lower() not in self.sw]
# Vectorize the demo sentences with the custom tokenizer instead of the
# default token pattern, then print the resulting matrix.
stemming_tokenizer = LemmaTokenizer()
count_vect = CountVectorizer(tokenizer=stemming_tokenizer)
counts = count_vect.fit_transform(text)
printTermDocumentMatrix(count_vect, counts)


# TF-IDF matrix: compute raw term counts first, then re-weight them.
# smooth_idf=False and norm=None switch off the transformer's IDF smoothing
# and its row normalization.
text = [
    'this is the first sentence!!',
    'this is a second sentence :)',
    'the third sentence, is here ',
    'forth of all sentences',
]
count_vect = CountVectorizer()
counts = count_vect.fit_transform(text)
tfidftransformer = TfidfTransformer(smooth_idf=False, norm=None)
tfidf = tfidftransformer.fit_transform(counts)
printTermDocumentMatrix(count_vect, tfidf)


# Case study: classification task with logistic regression.
# Location of the zip archive that holds the forum posts.
zippath = "/Users/zitongqiu/Documents/data mining/data/AutoAndElectronics.zip"
corpus = []
label = []

with ZipFile(zippath) as rawData:
    # Each entry is a ZipInfo, for example:
    # <ZipInfo filename='AutoAndElectronics/rec.autos/102849' compress_type=deflate external_attr=0x20 file_size=2953 compress_size=1539>
    for info in rawData.infolist():
        if not info.is_dir():
            # Label is 1 when the member path contains 'rec.autos', else 0.
            label.append(int('rec.autos' in info.filename))
            # Member content is stored as raw bytes.
            corpus.append(rawData.read(info))

# Tokenize the text, stem each token and filter out stop words.
class LemmaTokenizer:
    """Callable tokenizer that drops noise tokens and stems what is left.

    Despite its name, the class reduces words with a stemmer rather than a
    lemmatizer. An instance can be passed as ``tokenizer=`` to
    ``CountVectorizer``.

    Args:
        stemmer: Object exposing ``stem(word) -> str``. Defaults to a new
            ``EnglishStemmer()``.
        stop_words: Iterable of words to discard. Defaults to
            ``ENGLISH_STOP_WORDS``. Entries are stored lower-cased.
        tokenize: Callable turning a document string into an iterable of
            token strings. Defaults to ``word_tokenize``.
    """

    def __init__(self, stemmer=None, stop_words=None, tokenize=None):
        self.stemmer = EnglishStemmer() if stemmer is None else stemmer
        chosen = ENGLISH_STOP_WORDS if stop_words is None else stop_words
        self.sw = {word.lower() for word in chosen}
        self.tokenize = word_tokenize if tokenize is None else tokenize

    def __call__(self, doc):
        """Return the stemmed tokens of *doc*.

        Tokens that are not purely alphabetic, or that are stop words, are
        skipped. The stop-word test is case-insensitive so the tokenizer
        behaves the same whether or not the caller lower-cased the text.
        """
        return [self.stemmer.stem(t) for t in self.tokenize(doc)
                if t.isalpha() and t.lower() not in self.sw]
# Term counts for the whole corpus; encoding='latin1' is the encoding the
# vectorizer uses to decode byte input.
corpus_tokenizer = LemmaTokenizer()
preprocessor = CountVectorizer(encoding='latin1', tokenizer=corpus_tokenizer)
preprocessedText = preprocessor.fit_transform(corpus)
print(preprocessedText)

# Re-weight the counts with the TF-IDF transformer's default settings.
tfidftransformer = TfidfTransformer()
tfidf = tfidftransformer.fit_transform(preprocessedText)
print(tfidf)

# Latent Semantic Analysis (LSA, also called latent semantic indexing) with
# scikit-learn. A latent semantic concept plays the role of a topic.
# The truncated SVD factors the term-document matrix into a term-topic
# matrix, a topic-importance matrix and a topic-document matrix; 20
# components are kept here.
# NOTE(review): no random_state is passed to TruncatedSVD, so the features
# may differ slightly between runs - confirm whether that is acceptable.
svd = TruncatedSVD(n_components=20)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
lsa_tfidf = lsa.fit_transform(tfidf)
print(lsa_tfidf)
print(lsa_tfidf.shape)

# Hold out 40% of the documents for testing (seeded split), fit a logistic
# regression on the rest and summarize its test-set classification.
# NOTE(review): lsa_tfidf was fitted on all documents before this split, so
# the test rows influenced the features - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    lsa_tfidf, label, test_size=0.4, random_state=42)
logit_reg = LogisticRegression(solver='lbfgs')
logit_reg.fit(X_train, y_train)
test_predictions = logit_reg.predict(X_test)
classificationSummary(y_test, test_predictions)

# Evaluate the classifier: collect predicted classes and class probabilities
# for the test set, next to the actual labels.
logit_reg_pred = logit_reg.predict(X_test)
print("logit_reg_pred: \n",logit_reg_pred)

logit_reg_proba = logit_reg.predict_proba(X_test)
print(pd.DataFrame(logit_reg_proba))

# One row per test document: actual label, both probability columns and the
# predicted class.
result_columns = {
    'actual': y_test,
    'p(0)': logit_reg_proba[:, 0],
    'p(1)': logit_reg_proba[:, 1],
    'predicted': logit_reg_pred,
}
logit_result = pd.DataFrame(result_columns)
print(logit_result)

import matplotlib
matplotlib.use('TkAgg')  # select the Tk-based backend before drawing

# Cumulative gains chart (gainsChart) and decile lift chart (liftChart).
# Both are fed the results ordered from highest to lowest p(1).
df = logit_result.sort_values(by='p(1)', ascending=False)
print(df)
# Number of rows in the result table whose actual label is 1.
actual_count = logit_result['actual'].value_counts()[1]
print(actual_count)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
gains_ax, lift_ax = axes
gainsChart(df['actual'], ax=gains_ax)  # cumulative gains chart
liftChart(df['p(1)'], ax=lift_ax)  # decile lift chart
plt.show()

波哥大与金迪奥

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
[数据挖掘] 50 文本挖掘 1：项-文档矩阵与 TF-IDF（词频-逆文本频率）

[数据挖掘] 50 文本挖掘 1 项-文档矩阵与 TF-IDF (词频-逆文本频率)；[数据挖掘] 51 文本挖掘 2 逻辑回归分类任务汽车和电子产品论坛文本数据
复制链接

扫一扫