Python 爬取《复仇者联盟3》豆瓣影评

from urllib import request
from bs4 import BeautifulSoup as bs

# Fetch 11 result pages of the Douban comment listing for subject 24773958
# (start=0, 20, ..., 200 with limit=20) and append the raw markup of up to 20
# <p> elements per page to movie_comments.txt.
for i in range(0, 11):
    url = ('https://movie.douban.com/subject/24773958/comments?start=' + str(20 * i)
           + '&limit=20&sort=new_score&status=P&percent_type=')
    # The context manager closes the HTTP response even if decoding fails.
    with request.urlopen(url) as res:
        html_data = res.read().decode('utf-8')

    Soup = bs(html_data, 'html.parser')
    comments = Soup.find_all('div', id='comments')
    if not comments:
        # No <div id="comments"> on this page; nothing to save.
        continue

    # Slice instead of indexing 0..19 so a page with fewer than 20 <p>
    # elements does not raise IndexError.
    comments_content = comments[0].find_all('p')[:20]

    # Open the output file once per page rather than once per comment.
    with open('movie_comments.txt', 'a', encoding='utf-8') as f:
        f.writelines(str(paragraph) for paragraph in comments_content)

将爬取的内容保存为 txt 文件后,使用结巴(jieba)分词包对评论进行词频统计

import re
import jieba
import pandas as pd
import numpy as np
from scipy.misc import imread
import matplotlib.pyplot as plt
from wordcloud import WordCloud  # 词云包
import matplotlib


# Build a word-frequency table (`words_stat`) from the scraped comments.

# Read the whole file up front; the context manager closes the handle as soon
# as the text is in memory. `f` stays bound afterwards, and calling close()
# on an already-closed file is a no-op.
with open('movie_comments.txt', 'r', encoding='utf-8') as f:
    content = f.read()

# Keep only runs of characters in U+4E00..U+9FA5 (CJK ideographs); everything
# else in the saved markup (tags, punctuation, Latin text) is discarded.
pattern = re.compile(r'[\u4e00-\u9fa5]+')
filterdata = pattern.findall(content)
cleaned_comments = ''.join(filterdata)

# Segment the concatenated text into words with jieba.
segment = jieba.lcut(cleaned_comments)
words_df = pd.DataFrame({'segment': segment})

# Drop stop words. quoting=3 is csv.QUOTE_NONE: quote characters in the stop
# word file are treated as ordinary text.
stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t",
                        names=['stopword'], encoding='gbk')
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

# Count occurrences per word. The dict-renaming form
# `.agg({"计数": np.size})` raises SpecificationError on pandas >= 1.0, so
# use size() and name the count column explicitly.
words_stat = words_df.groupby(by=['segment']).size().reset_index(name="计数")
words_stat = words_stat.sort_values(by=["计数"], ascending=False)
print(words_stat.head())
print(words_stat)

# Render a word cloud from the 1000 most frequent words and display it.
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)

# Set the font file, background colour and maximum font size.
# NOTE(review): simhei.ttf is resolved as given, so it presumably has to be
# in the working directory or otherwise findable — confirm.
wordcloud = WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80)

# Each row of words_stat is (word, count); map word -> count for the top 1000
# rows and pass the dict straight to fit_words (no list round-trip needed).
word_frequence = {word: count for word, count in words_stat.head(1000).values}

wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
plt.show()

# Release the handle opened for movie_comments.txt earlier in the script.
f.close()

最终结果如图所示

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值