爬虫案例
效果图:
代码实现:
#! /usr/local/bin/python3
# -*- coding: utf-8 -*-
'''
Author: elson
Desc: 电视剧《和平饭店》的豆瓣评论
'''
import re
import jieba
import os
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from lxml import etree
import requests
from wordcloud import WordCloud
def get_comment_detail(url):
    """Fetch one Douban review page and return its full review text.

    :param url: absolute URL of a single review page.
    :return: the review body as plain text, or '' when the page has no
             ``review-content`` div (e.g. a removed/blocked review).
    """
    # Send a browser-like User-Agent, consistent with request_page();
    # Douban may refuse or throttle requests with the default UA.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    nodes = html.xpath('.//div[@class="review-content clearfix"]')
    # Guard against pages without the expected div instead of raising
    # IndexError, so one bad review does not abort the whole crawl.
    if not nodes:
        return ''
    # string(.) flattens the div's entire text content, tags stripped.
    return nodes[0].xpath('string(.)')
def request_page(url):
    """Scrape one review-list page: collect every review's full text and
    find the link suffix of the next page.

    :param url: URL of a reviews-list page.
    :return: tuple ``(comment_list, next_start)`` where ``comment_list`` is
             a list of review texts and ``next_start`` is the next page's
             href suffix ('' when on the last page).
    """
    comment_list = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    # BUG FIX: the original called requests.get(url, headers), which binds
    # the dict to the positional `params` argument (query string), so the
    # headers were never sent. They must be passed as headers=headers.
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)

    # Each review on the list page links to its detail page via the
    # <h2><a href=...> inside div.main-bd.
    for site in html.xpath('//div[@class="main-bd"]/h2/a'):
        detail_url = site.attrib['href']
        print(detail_url)
        comment_list.append(get_comment_detail(detail_url))

    print('request_page....')

    # The "next page" control: span.next contains an <a> only while more
    # pages remain; on the last page the <a> is absent.
    page = html.xpath('//span[@class="next"]')
    if page and page[0].xpath('./a'):
        next_start = page[0].xpath('./a')[0].attrib['href']
    else:
        next_start = ''
    return (comment_list, next_start)
def get_comment_lists():
    """Walk every page of《和平饭店》reviews and return all review texts."""
    base_url = "https://movie.douban.com/subject/26828285/reviews"
    all_comments = []

    # First page, then keep following the "next" suffix until it is empty.
    page_comments, next_suffix = request_page(base_url)
    all_comments.extend(page_comments)
    while next_suffix:
        page_comments, next_suffix = request_page(base_url + next_suffix)
        all_comments.extend(page_comments)

    return all_comments
def main():
    """Scrape Douban reviews for《和平饭店》, clean and segment the text,
    count word frequencies, and render a word cloud.

    Side effects: changes the working directory to ``config`` (where
    ``stop_words.txt`` and ``simhei.ttf`` are expected), prints progress,
    and opens a matplotlib window.
    """
    # 1. Data acquisition: all review texts as a list of strings.
    comment_list = get_comment_lists()

    # 2. Data cleaning.
    # Join with str.join instead of repeated += (quadratic concatenation),
    # then keep only runs of Chinese characters — this drops punctuation,
    # digits and Latin text in one pass.
    comments = ''.join(str(item).strip() for item in comment_list)
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    cleaned_comments = ''.join(pattern.findall(comments))

    # 3. Analysis: jieba word segmentation.
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})

    # Switch to the directory holding the stop-word list and the font.
    print(os.getcwd())
    os.chdir('config')
    print(os.getcwd())

    # quoting=3 (QUOTE_NONE): quote characters in the stop-word file are
    # treated literally, one stop word per line.
    stopwords = pd.read_csv("stop_words.txt", index_col=False, quoting=3,
                            sep="\t", names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # Word-frequency count.
    # BUG FIX: the original used .agg({"计数": numpy.size}) on a
    # SeriesGroupBy — passing a renaming dict there is removed in modern
    # pandas and raises. groupby().size() produces the same counts.
    words_stat = (words_df.groupby('segment')
                  .size()
                  .reset_index(name="计数")
                  .sort_values(by=["计数"], ascending=False))
    print(words_stat.head())

    # 4. Word-cloud rendering of the 300 most frequent words.
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80)
    # fit_words() takes a {word: frequency} mapping directly; the original
    # also built an unused list of (word, freq) tuples, dropped here.
    word_frequence = {row[0]: row[1] for row in words_stat.head(300).values}
    print(word_frequence)
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()


if __name__ == '__main__':
    main()
| segment | 计数 |
|---|---|
| 饭店 | 306 |
| 和平 | 270 |
| 都 | 248 |
| 一个 | 212 |
| 人 | 207 |
资源下载链接:停用词