[Data Mining] 50 Text Mining: Term-Document Matrix and TF-IDF (Term Frequency-Inverse Document Frequency)

[Data Mining] 50 Text Mining 1: Term-Document Matrix and TF-IDF (Term Frequency-Inverse Document Frequency) - bilibili

 
[Data Mining] 51 Text Mining 2: Logistic Regression Classification of Auto and Electronics Forum Text Data

from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
import matplotlib.pyplot as plt
from dmba import printTermDocumentMatrix, classificationSummary, gainsChart, liftChart
nltk.download('punkt')

# Term-document matrix
text = ['this is the first sentence.',
        'this is a second sentence.',
        'the third sentence is here.']
count_vect = CountVectorizer()
counts = count_vect.fit_transform(text)
printTermDocumentMatrix(count_vect, counts)
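
# To look at the same matrix without dmba, it can be turned into a pandas DataFrame.
# A minimal sketch, assuming scikit-learn >= 1.0 so that get_feature_names_out is available:
tdm = pd.DataFrame(counts.toarray().T,
                   index=count_vect.get_feature_names_out(),   # terms as rows
                   columns=[f'S{i+1}' for i in range(len(text))])  # one column per sentence
print(tdm)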

# Non-standard documents (punctuation, capitalization, extra whitespace)
text = ['this is the first sentence!!',
        'this is a second Sentence :)',
        'the third sentence, is here ',
        'forth of all sentences']
count_vect = CountVectorizer()
counts = count_vect.fit_transform(text)
printTermDocumentMatrix(count_vect, counts)

# Preprocessing text with a custom token pattern
text = ['this is the first sentence!!',
        'this is a second sentence :)',
        'the third sentence, is here ',
        'forth of all sentences']
count_vect = CountVectorizer(token_pattern='[a-zA-Z!:)]+')
counts = count_vect.fit_transform(text)
printTermDocumentMatrix(count_vect, counts)
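
# The custom token_pattern keeps '!', ':' and ')' inside tokens, so emoticons and trailing
# punctuation survive as terms. The resulting vocabulary can be inspected (again assuming
# scikit-learn >= 1.0):
print(count_vect.get_feature_names_out())  # vocabulary produced by the custom token pattern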

# Text reduction and tokenization: English stopwords
sw = list(sorted(ENGLISH_STOP_WORDS))
ncolumns = 10
nrows = 6
print(f"Frist {ncolumns*nrows} of {len(sw)} stopwords")
for i in range(0,len(sw[:(ncolumns*nrows)]),ncolumns):
        print(''.join(word.ljust(13) for word in sw[i:(i+ncolumns)]))

# Text reduction applied to tokenized text (stemming and stopword removal)
text = ['this is the first sentence!!',
        'this is a second sentence :)',
        'the third sentence, is here ',
        'forth of all sentences']
class LemmaTokenizer(object):
        def __init__(self):
                self.stemmer=EnglishStemmer()
                self.sw = set(ENGLISH_STOP_WORDS)
        def __call__(self,doc):
                return [self.stemmer.stem(t) for t in word_tokenize(doc)
                        if t.isalpha() and t not in self.sw]
count_vect = CountVectorizer(tokenizer = LemmaTokenizer())
counts = count_vect.fit_transform(text)
printTermDocumentMatrix(count_vect, counts)
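
# To see what the tokenizer does to a single document before counting, it can be called
# directly (a quick illustration, not in the original post):
tok = LemmaTokenizer()
print(tok('the third sentence, is here '))  # shows the stemmed tokens that survive stopword filtering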


# TF-IDF matrix
text = ['this is the first sentence!!',
        'this is a second sentence :)',
        'the third sentence, is here ',
        'forth of all sentences']
count_vect = CountVectorizer()
tfidftransformer = TfidfTransformer(smooth_idf=False,norm=None)
counts = count_vect.fit_transform(text)
tfidf = tfidftransformer.fit_transform(counts)
printTermDocumentMatrix(count_vect,tfidf)
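
# With smooth_idf=False and norm=None, scikit-learn's TfidfTransformer computes
# idf(t) = ln(n_docs / df(t)) + 1 and multiplies it by the raw term count.
# A small manual check of that formula (the numpy import is an addition):
import numpy as np
tf = counts.toarray()                 # raw term counts
df_t = (tf > 0).sum(axis=0)           # document frequency of each term
idf = np.log(tf.shape[0] / df_t) + 1  # idf with smooth_idf=False
print(np.allclose(tf * idf, tfidf.toarray()))  # expected to print True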


# Case study: logistic regression classification task
zippath = "/Users/zitongqiu/Documents/data mining/data/AutoAndElectronics.zip" # path to the zip file
corpus = []
label = []

with ZipFile(zippath) as rawData:
        for info in rawData.infolist(): # <ZipInfo filename='AutoAndElectronics/rec.autos/102849' compress_type=deflate external_attr=0x20 file_size=2953 compress_size=1539>
                if info.is_dir():
                        continue
                label.append(1 if 'rec.autos' in info.filename else 0)
                corpus.append(rawData.read(info))
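
# A quick sanity check on what was read from the archive (not in the original post):
print(len(corpus), 'documents;', sum(label), 'labeled 1 (rec.autos),',
      len(label) - sum(label), 'labeled 0 (electronics)')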

# Tokenize the text, stem the tokens, and filter out stopwords.
class LemmaTokenizer(object):
        def __init__(self):
                self.stemmer=EnglishStemmer()
                self.sw = set(ENGLISH_STOP_WORDS)
        def __call__(self,doc):
                return [self.stemmer.stem(t) for t in word_tokenize(doc)
                        if t.isalpha() and t not in self.sw]
preprocessor = CountVectorizer(tokenizer=LemmaTokenizer(),encoding='latin1')
preprocessedText = preprocessor.fit_transform(corpus)
print(preprocessedText)
tfidftransformer = TfidfTransformer()
tfidf = tfidftransformer.fit_transform(preprocessedText)
print(tfidf)

# Latent Semantic Analysis (LSA) with sklearn
# latent semantic concepts = topics
# term-document matrix = term-by-document matrix
svd = TruncatedSVD(20) # decompose the term-document matrix into a term-topic matrix, topic importances, and a topic-document matrix
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd,normalizer)
lsa_tfidf = lsa.fit_transform(tfidf)
print(lsa_tfidf)
print(lsa_tfidf.shape)
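
# Since the pipeline's TruncatedSVD step is fitted at this point, its explained_variance_ratio_
# attribute shows how much of the TF-IDF variance the 20 concepts retain:
print(svd.explained_variance_ratio_.sum()) # fraction of variance captured by the 20 concepts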

X_train, X_test, y_train, y_test = train_test_split(lsa_tfidf,label,test_size=0.4,random_state=42)
logit_reg= LogisticRegression(solver='lbfgs')
logit_reg.fit(X_train,y_train)
classificationSummary(y_test,logit_reg.predict(X_test))
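
# classificationSummary from dmba prints a confusion-matrix summary; the same numbers can be
# cross-checked with scikit-learn's own metrics (a sketch, not in the original post):
from sklearn.metrics import accuracy_score, confusion_matrix
print('accuracy:', accuracy_score(y_test, logit_reg.predict(X_test)))
print(confusion_matrix(y_test, logit_reg.predict(X_test)))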

# Evaluate classification performance
logit_reg_pred = logit_reg.predict(X_test)
print("logit_reg_pred: \n",logit_reg_pred)
logit_reg_proba = logit_reg.predict_proba(X_test)
print(pd.DataFrame(logit_reg_proba))
logit_result = pd.DataFrame({'actual':y_test,
                             'p(0)':logit_reg_proba[:,0],
                             'p(1)':logit_reg_proba[:,1] ,
                             'predicted':logit_reg_pred,})
print(logit_result)

import matplotlib
matplotlib.use('TkAgg') # switch to an interactive backend so plt.show() opens a window

# Cumulative gains chart (gainsChart) and decile lift chart (liftChart)
df = logit_result.sort_values(by=['p(1)'],ascending=False)
print(df)
actual_count = logit_result['actual'].value_counts()[1]
print(actual_count)

fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(14,6))
gainsChart(df.actual,ax = axes[0]) # cumulative gains chart
liftChart(df['p(1)'],ax=axes[1]) # decile lift chart
plt.show()
