[数据挖掘] 50 文本挖掘 1 项-文档矩阵 与 TF-IDF(词频-逆文本频率)_哔哩哔哩_bilibili
[数据挖掘] 51 文本挖掘 2 逻辑回归分类任务 汽车和电子产品论坛文本数据
from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
import matplotlib.pyplot as plt
from dmba import printTermDocumentMatrix,classificationSummary,gainsChart,liftChart
# Fetch the tokenizer data used by nltk.word_tokenize.  Older NLTK releases
# read the 'punkt' models, while NLTK 3.9+ looks up 'punkt_tab' instead, so
# both resources are requested to keep the script working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
# Term-document matrix built from three plain sentences.
text = [
    'this is the first sentence.',
    'this is a second sentence.',
    'the third sentence is here.',
]
count_vect = CountVectorizer()
counts = count_vect.fit_transform(text)
printTermDocumentMatrix(count_vect, counts)
# Non-standard documents: the sentences now carry punctuation, an emoticon,
# a capitalized word and trailing whitespace.
text = [
    'this is the first sentence!!',
    'this is a second Sentence :)',
    'the third sentence, is here ',
    'forth of all sentences',
]
count_vect = CountVectorizer()
counts = count_vect.fit_transform(text)
printTermDocumentMatrix(count_vect, counts)
# Preprocessing the text: a custom token pattern treats runs of ASCII letters
# and the characters '!', ':' and ')' as tokens.
text = [
    'this is the first sentence!!',
    'this is a second sentence :)',
    'the third sentence, is here ',
    'forth of all sentences',
]
count_vect = CountVectorizer(token_pattern='[a-zA-Z!:)]+')
counts = count_vect.fit_transform(text)
printTermDocumentMatrix(count_vect, counts)
# Text reduction and tokenization: list the first stop words of
# scikit-learn's built-in English stop-word collection.
sw = sorted(ENGLISH_STOP_WORDS)  # sorted() already returns a list
ncolumns = 10
nrows = 6
print(f"Frist {ncolumns*nrows} of {len(sw)} stopwords")
# Print at most ncolumns * nrows words, ncolumns per line, each word
# left-justified in a 13-character column.
for start in range(0, min(len(sw), ncolumns * nrows), ncolumns):
    print(''.join(word.ljust(13) for word in sw[start:start + ncolumns]))
# Text reduction applied to the tokenized text: sample sentences for the
# stemming / stop-word example.
text = [
    'this is the first sentence!!',
    'this is a second sentence :)',
    'the third sentence, is here ',
    'forth of all sentences',
]
class LemmaTokenizer:
    """Callable tokenizer that stems tokens and drops stop words.

    Despite the name, tokens are reduced with a Snowball ``EnglishStemmer``
    rather than with a lemmatizer.  Instances are meant to be passed as the
    ``tokenizer`` argument of ``CountVectorizer``.
    """

    def __init__(self):
        # Stemmer applied to every token that survives filtering.
        self.stemmer = EnglishStemmer()
        # Stop words kept in a set for fast membership tests.
        self.sw = set(ENGLISH_STOP_WORDS)

    def __call__(self, doc):
        """Return the list of stemmed tokens of *doc*.

        *doc* is split with ``word_tokenize``; a token is kept only when it
        is purely alphabetic and is not in the stop-word set.
        """
        return [
            self.stemmer.stem(t)
            for t in word_tokenize(doc)
            if t.isalpha() and t not in self.sw
        ]
# Vectorize with the custom stemming tokenizer.  token_pattern is ignored
# whenever a tokenizer is supplied, so it is set to None explicitly; leaving
# the default in place makes scikit-learn emit a "token_pattern will not be
# used" UserWarning.
count_vect = CountVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None)
counts = count_vect.fit_transform(text)
printTermDocumentMatrix(count_vect, counts)
# TF-IDF matrix: re-weight the raw term counts, with IDF smoothing and
# normalization both switched off.
text = [
    'this is the first sentence!!',
    'this is a second sentence :)',
    'the third sentence, is here ',
    'forth of all sentences',
]
count_vect = CountVectorizer()
counts = count_vect.fit_transform(text)
tfidftransformer = TfidfTransformer(smooth_idf=False, norm=None)
tfidf = tfidftransformer.fit_transform(counts)
printTermDocumentMatrix(count_vect, tfidf)
# Case study: a logistic-regression classification task.
# Path of the zip archive that holds the forum posts.
zippath = "/Users/zitongqiu/Documents/data mining/data/AutoAndElectronics.zip"
corpus = []  # raw bytes of each archived file
label = []   # 1 for entries under 'rec.autos', 0 otherwise
with ZipFile(zippath) as rawData:
    # Each entry is a ZipInfo whose filename looks like
    # 'AutoAndElectronics/rec.autos/102849'.
    for info in rawData.infolist():
        if info.is_dir():
            # Directory entries carry no document content.
            continue
        label.append(1 if 'rec.autos' in info.filename else 0)
        corpus.append(rawData.read(info))
# Tokenize the text, reduce words to their stems and filter out stop words.
class LemmaTokenizer:
    """Callable tokenizer that stems tokens and drops stop words.

    Despite the name, tokens are reduced with a Snowball ``EnglishStemmer``
    rather than with a lemmatizer.  Instances are meant to be passed as the
    ``tokenizer`` argument of ``CountVectorizer``.
    """

    def __init__(self):
        # Stemmer applied to every token that survives filtering.
        self.stemmer = EnglishStemmer()
        # Stop words kept in a set for fast membership tests.
        self.sw = set(ENGLISH_STOP_WORDS)

    def __call__(self, doc):
        """Return the list of stemmed tokens of *doc*.

        *doc* is split with ``word_tokenize``; a token is kept only when it
        is purely alphabetic and is not in the stop-word set.
        """
        return [
            self.stemmer.stem(t)
            for t in word_tokenize(doc)
            if t.isalpha() and t not in self.sw
        ]
# Build the term counts for the corpus.  The corpus entries are raw bytes, so
# the vectorizer is told to decode them as latin1.  token_pattern is ignored
# whenever a tokenizer is supplied, so it is set to None explicitly to avoid
# scikit-learn's "token_pattern will not be used" UserWarning.
preprocessor = CountVectorizer(
    tokenizer=LemmaTokenizer(),
    token_pattern=None,
    encoding='latin1',
)
preprocessedText = preprocessor.fit_transform(corpus)
print(preprocessedText)
# Re-weight the counts with TF-IDF (default transformer settings).
tfidftransformer = TfidfTransformer()
tfidf = tfidftransformer.fit_transform(preprocessedText)
print(tfidf)
# Latent Semantic Analysis (LSA) with scikit-learn.
# Terminology: a latent semantic concept is a topic; the term-document matrix
# is the matrix of terms against documents.
# Conceptually, the truncated SVD factors the term-document matrix into a
# term-topic matrix, a topic-importance matrix and a topic-document matrix;
# 20 components (topics) are kept here.
# random_state is fixed so the decomposition is reproducible between runs,
# matching the seeded train/test split used later in the script.
svd = TruncatedSVD(20, random_state=42)
# copy=False lets the normalizer work in place on the SVD output.
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
lsa_tfidf = lsa.fit_transform(tfidf)
print(lsa_tfidf)
print(lsa_tfidf.shape)
# Hold out 40% of the LSA features and labels as a test set (seeded split),
# fit a logistic-regression classifier on the remainder, and print the
# classification summary for the test-set predictions.
X_train, X_test, y_train, y_test = train_test_split(
    lsa_tfidf,
    label,
    test_size=0.4,
    random_state=42,
)
logit_reg = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
classificationSummary(y_test, logit_reg.predict(X_test))
# Evaluate the classifier: collect the predicted classes and class
# probabilities for the test set, then tabulate them next to the true labels.
logit_reg_pred = logit_reg.predict(X_test)
logit_reg_proba = logit_reg.predict_proba(X_test)
print("logit_reg_pred: \n", logit_reg_pred)
print(pd.DataFrame(logit_reg_proba))
logit_result = pd.DataFrame(
    {
        'actual': y_test,
        'p(0)': logit_reg_proba[:, 0],
        'p(1)': logit_reg_proba[:, 1],
        'predicted': logit_reg_pred,
    }
)
print(logit_result)
import matplotlib

# Select the Tk-based backend before any figure is created.
matplotlib.use('TkAgg')

# Cumulative gains chart (gainsChart) and decile lift chart (liftChart).
# Rank the test records from highest to lowest predicted p(1).
df = logit_result.sort_values(by=['p(1)'], ascending=False)
print(df)
# Number of test records whose actual label is 1.
class_counts = logit_result['actual'].value_counts()
actual_count = class_counts[1]
print(actual_count)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
gains_ax, lift_ax = axes[0], axes[1]
gainsChart(df.actual, ax=gains_ax)  # cumulative gains chart
# NOTE(review): the lift chart receives the predicted probabilities while the
# gains chart receives the actual labels -- confirm this is intended.
liftChart(df['p(1)'], ax=lift_ax)  # decile lift chart
plt.show()