爬虫案例
效果图:
代码实现:
#! /usr/local/bin/python3
# -*- coding: utf-8 -*-
'''
Author: elson
Desc: 电视剧《和平饭店》的豆瓣评论
'''
import re
import jieba
import os
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from lxml import etree
import requests
from wordcloud import WordCloud
def get_comment_detail(url):
    """Fetch one Douban review page and return its full review text.

    :param url: absolute URL of a single review page.
    :return: the review body as plain text, or '' when the page has no
             ``review-content`` div (e.g. a removed/blocked review).
    """
    # Send a browser-like User-Agent, consistent with request_page();
    # Douban may refuse or throttle requests with the default UA.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    nodes = html.xpath('.//div[@class="review-content clearfix"]')
    # Guard against pages without the expected div instead of raising
    # IndexError, so one bad review does not abort the whole crawl.
    if not nodes:
        return ''
    # string(.) flattens the div's entire text content, tags stripped.
    return nodes[0].xpath('string(.)')
def request_page(url):
    """Scrape one review-list page: collect every review's full text and
    find the link suffix of the next page.

    :param url: URL of a reviews-list page.
    :return: tuple ``(comment_list, next_start)`` where ``comment_list`` is
             a list of review texts and ``next_start`` is the next page's
             href suffix ('' when on the last page).
    """
    comment_list = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    # BUG FIX: the original called requests.get(url, headers), which binds
    # the dict to the positional `params` argument (query string), so the
    # headers were never sent. They must be passed as headers=headers.
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)

    # Each review on the list page links to its detail page via the
    # <h2><a href=...> inside div.main-bd.
    for site in html.xpath('//div[@class="main-bd"]/h2/a'):
        detail_url = site.attrib['href']
        print(detail_url)
        comment_list.append(get_comment_detail(detail_url))

    print('request_page....')

    # The "next page" control: span.next contains an <a> only while more
    # pages remain; on the last page the <a> is absent.
    page = html.xpath('//span[@class="next"]')
    if page and page[0].xpath('./a'):
        next_start = page[0].xpath('./a')[0].attrib['href']
    else:
        next_start = ''
    return (comment_list, next_start)
def get_comment_lists():
    """Walk every page of《和平饭店》reviews and return all review texts."""
    base_url = "https://movie.douban.com/subject/26828285/reviews"
    all_comments = []

    # First page, then keep following the "next" suffix until it is empty.
    page_comments, next_suffix = request_page(base_url)
    all_comments.extend(page_comments)
    while next_suffix:
        page_comments, next_suffix = request_page(base_url + next_suffix)
        all_comments.extend(page_comments)

    return all_comments
def main():
    """Scrape Douban reviews for《和平饭店》, clean and segment the text,
    count word frequencies, and render a word cloud.

    Side effects: changes the working directory to ``config`` (where
    ``stop_words.txt`` and ``simhei.ttf`` are expected), prints progress,
    and opens a matplotlib window.
    """
    # 1. Data acquisition: all review texts as a list of strings.
    comment_list = get_comment_lists()

    # 2. Data cleaning.
    # Join with str.join instead of repeated += (quadratic concatenation),
    # then keep only runs of Chinese characters — this drops punctuation,
    # digits and Latin text in one pass.
    comments = ''.join(str(item).strip() for item in comment_list)
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    cleaned_comments = ''.join(pattern.findall(comments))

    # 3. Analysis: jieba word segmentation.
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Switch to the directory holding the stop-word list and the font.
    print(os.getcwd())
    os.chdir('config')
    print(os.getcwd())

    # quoting=3 (QUOTE_NONE): quote characters in the stop-word file are
    # treated literally, one stop word per line.
    stopwords = pd.read_csv("stop_words.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # Word-frequency count.
    # BUG FIX: the original used .agg({"计数": numpy.size}) on a
    # SeriesGroupBy — passing a renaming dict there is removed in modern
    # pandas and raises. groupby().size() produces the same counts.
    words_stat = (words_df.groupby('segment')
                  .size()
                  .reset_index(name="计数")
                  .sort_values(by=["计数"], ascending=False))
    print(words_stat.head())

    # 4. Word-cloud rendering of the 300 most frequent words.
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80)
    # fit_words() takes a {word: frequency} mapping directly; the original
    # also built an unused list of (word, freq) tuples, dropped here.
    word_frequence = {row[0]: row[1] for row in words_stat.head(300).values}
    print(word_frequence)
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()


if __name__ == '__main__':
    main()
| segment | 计数 |
|---|---|
| 饭店 | 306 |
| 和平 | 270 |
| 都 | 248 |
| 一个 | 212 |
| 人 | 207 |
资源下载链接:停用词