Python金融数据挖掘 第7章 复习思考题 3

3.有‘手机垃圾短信’数据集,将其放在文件中:sms_spam.csv,该文件共有5537行,2列,分别是类型(type,ham为非垃圾短信,spam为垃圾短信)和内容(text,短信的具体内容)。

(1)对该‘手机垃圾短信’数据集进行文本挖掘。

(2)划分非垃圾短信ham和垃圾短信spam,并分别作词云图。

'''
3.有‘手机垃圾短信’数据集,将其放在文件中:sms_spam.csv,该文件共有5537行,2列,
分别是类型(type,ham为非垃圾短信,spam为垃圾短信)和内容(text,短信的具体内容)。
(1)对该‘手机垃圾短信’数据集进行文本挖掘。
(2)划分非垃圾短信ham和垃圾短信spam,并分别作词云图。
'''

# 库
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter

import jieba
import jieba.analyse
from os import path
import imageio
from ipykernel import kernelapp as app
import matplotlib as mpl
import matplotlib.pyplot as plt
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator

# Part (1): text mining over the whole data set.
# Download the NLTK "punkt" resource used by word_tokenize.
nltk.download('punkt')
# Load the SMS data; this script reads its 'text' and 'type' columns.
sms = pd.read_csv('sms_spam.csv')
# Stop words, one per line. The separator string never occurs in the file, so
# each line is read whole into column 0. A multi-character separator is only
# supported by the python engine; naming it explicitly avoids the
# ParserWarning pandas emits when it has to fall back on its own.
stop=pd.read_csv('C:/Users/86186/Desktop/大二 下/数据挖掘/第7章/Estopwords.txt',sep='bucunzai',encoding='utf_8',header=None,engine='python')
stopwords=[' ']+list(stop[0])
# Set copy used for membership tests; `stopwords` itself stays a list.
stopword_set = set(stopwords)

# Tokenize each message, then drop stop words message by message.
word2 = [word_tokenize(text) for text in sms['text']]
# The original inserted at a stale loop index left over from the previous
# loop; building the list in order is what was intended.
word22 = [[token for token in tokens if token not in stopword_set]
          for tokens in word2]
print('单行去除停用词看word22')

# Stem every remaining token, keeping the per-message grouping.
stemmer=PorterStemmer()
word3 = [[stemmer.stem(token) for token in tokens] for tokens in word22]
print('单行词干提取看word3')

# Word frequency (top 25).
# `word12` was referenced but never defined (NameError). It is the flat list
# of all stop-word-filtered tokens across every message.
word12 = [token for tokens in word22 for token in tokens]
# Ignore single-character tokens (mostly punctuation).
word_words=[x for x in word12 if len(x)>=2]
counter1 = Counter(word_words)
counter2 = counter1.most_common(25)
print(counter2)


# (2)
# Split the messages by label: 'ham' rows and 'spam' rows, in file order.
ham = sms.loc[sms['type'] == 'ham', 'text'].tolist()
spam = sms.loc[sms['type'] == 'spam', 'text'].tolist()

# Build the stop-word lookup once; testing membership against the list for
# every token rescans the whole list each time.
stopword_lookup = set(stopwords)

# Join all ham messages into one text, tokenize it, and drop stop words.
ham1 = " ".join(ham)
ham11 = word_tokenize(ham1)
ham12 = [word for word in ham11 if word not in stopword_lookup]
print('整体去除停用词看ham12')

# Same treatment for the spam messages.
spam1 = " ".join(spam)
spam11 = word_tokenize(spam1)
spam12 = [word for word in spam11 if word not in stopword_lookup]
print('整体去除停用词看spam12')



# Mask image that gives the word clouds their shape. It is read once here and
# reused by every word__cloud call instead of being re-read per call.
maskImg=imageio.imread('C:/Users/86186/Desktop/大二 下/数据挖掘/第7章/renwu.png')


# Word-cloud helper
def word__cloud(text, title='词云图', out_file='词云图.jpg', mask=None):
    """Draw a word cloud for ``text``, show it, and save it to ``out_file``.

    Args:
        text: Raw text passed to ``WordCloud.generate``.
        title: Title drawn above the figure. Defaults to the original title.
        out_file: Path the image is written to. Defaults to the original file
            name; pass distinct names so successive calls do not overwrite
            each other's output.
        mask: Image array used as the cloud's mask. Defaults to the
            module-level ``maskImg``.
    """
    if mask is None:
        mask = maskImg
    # Use a font with CJK glyphs so the Chinese title renders, and keep the
    # minus sign displayable with that font.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize = (10,4))
    # A fresh WordCloud per call; random_state fixes the layout for a given text.
    cloud=WordCloud(font_path='C:/Windows/Fonts/simhei.ttf',background_color='white',
                    max_words=10000,mask=mask,max_font_size=120,min_font_size=10,
                    random_state=42,width=1200,height=900)
    cloud.generate(text)
    # The original also built an ImageColorGenerator and printed its repr
    # without ever applying it; that dead step has been dropped.
    plt.imshow(cloud)
    plt.axis('off')
    plt.title(title,fontsize = 20)
    plt.show()
    cloud.to_file(out_file)


# One cloud per class, each with its own title and output file so the spam
# image no longer overwrites the ham image.
word__cloud(ham1, title='非垃圾短信(ham)词云图', out_file='词云图_ham.jpg')
word__cloud(spam1, title='垃圾短信(spam)词云图', out_file='词云图_spam.jpg')

 

  • 1
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 4
    评论
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值