Python (Web Scraping) — Scraping the Douban Reviews of 《和平饭店》 (Peace Hotel)

A web-scraping case study.

Result:

[Screenshot: the word cloud generated from the review comments]

Code implementation:
#! /usr/local/bin/python3
# -*- coding: utf-8 -*-

'''
Author: elson
Desc: scrape the Douban reviews of the TV series Peace Hotel (《和平饭店》)
'''
import os
import re

import jieba
import matplotlib.pyplot as plt
import pandas as pd
import requests
from lxml import etree
from wordcloud import WordCloud
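# third-party dependencies (all available on PyPI):
#   pip install requests lxml jieba pandas matplotlib wordcloud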


HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.8'
}


def get_comment_detail(url):
    # Douban may reject the default python-requests User-Agent, so the
    # browser-like headers are sent on the detail request as well
    response = requests.get(url, headers=HEADERS)
    resHtml = response.text

    html = etree.HTML(resHtml)
    # string(.) flattens the review <div> into its plain-text content
    detail = html.xpath('.//div[@class="review-content clearfix"]')[0].xpath('string(.)')
    return detail

def request_page(url):
    comment_list = []

    # headers must be passed by keyword: requests.get(url, headers) would
    # send the dict as query parameters, not as HTTP headers
    response = requests.get(url, headers=HEADERS)
    resHtml = response.text

    html = etree.HTML(resHtml)

    result = html.xpath('//div[@class="main-bd"]/h2/a')
    for site in result:
        detail_url = site.attrib['href']
        print(detail_url)
        detail = get_comment_detail(detail_url)
        comment_list.append(detail)

    print('request_page....')
    page = html.xpath('//span[@class="next"]')
    if page and page[0].xpath('./a'):
        next_start = page[0].xpath('./a')[0].attrib['href']
    else:
        next_start = ''
    return (comment_list, next_start)
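
# Note: the "next" link on the Douban review listing carries a relative query
# string such as "?start=20"; get_comment_lists() below appends it directly
# to the base URL to fetch the following page.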


def get_comment_lists():
    comment_list = []

    url = "https://movie.douban.com/subject/26828285/reviews"
    result = request_page(url)
    comment_list.extend(result[0])
    while result[1]:
        result = request_page(url + result[1])
        comment_list.extend(result[0])

    return comment_list
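
# A courteous crawl would pause between page requests, e.g. time.sleep(1)
# inside the while-loop above (a hypothetical addition, not in the original
# script), to reduce the chance of Douban throttling the client.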


def main():
    #1. Data collection
    comment_list = get_comment_lists()
    # print(comment_list)

    #2. Data cleaning
    # concatenate the list of reviews into a single string
    comments = ''.join(str(comment).strip() for comment in comment_list)

    # keep only runs of Chinese characters; punctuation, digits and Latin
    # text are all dropped
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)
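    # e.g. re.findall(r'[\u4e00-\u9fa5]+', '和平饭店S01E01, 好看!')
    # returns ['和平饭店', '好看']: every non-Chinese character is gone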


    #3. Data analysis
    # segment the Chinese text with jieba
    segment = jieba.lcut(cleaned_comments)
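    # jieba.lcut returns a plain list of tokens, e.g.
    # jieba.lcut('我来到北京清华大学') -> ['我', '来到', '北京', '清华大学']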
    words_df = pd.DataFrame({'segment': segment})

    # switch into the config directory, where stop_words.txt and the
    # simhei.ttf font are expected to live
    print(os.getcwd())
    os.chdir('config')
    print(os.getcwd())

    # drop stop words
    stopwords = pd.read_csv("stop_words.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='utf-8')  # quoting=3 means csv.QUOTE_NONE (treat quotes literally)
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # count how often each segment occurs
    words_stat = words_df.groupby('segment').size().reset_index(name='计数')
    words_stat = words_stat.sort_values(by=['计数'], ascending=False)
    print(words_stat.head())

    # render the result as a word cloud; simhei.ttf supplies the CJK glyphs
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80)
    word_frequence = {x[0]: x[1] for x in words_stat.head(300).values}
    print(word_frequence)

    # fit_words expects a {word: frequency} mapping
    wordcloud = wordcloud.fit_words(word_frequence)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()


if __name__ == '__main__':
    main()
Output of words_stat.head():

segment   计数
饭店      306
和平      270
          248
一个      212
          207
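
To keep the rendered image instead of only displaying it, WordCloud can write the result straight to disk. A minimal sketch (the output filename here is an arbitrary choice):

wordcloud.to_file('peace_hotel_wordcloud.png')  # writes the rendered cloud as a PNG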

Resource download: the stop-words file (停用词)
