爬虫requests-html框架练手项目(二)

11 篇文章 1 订阅
4 篇文章 1 订阅

爬虫requests-html框架练手项目(二)

爬虫框架:Requests-html
功能:爬取豆瓣中斗罗大陆1动漫的短评

# -*- coding: utf-8 -*-
# @Author  : KongDeXing
# @Time    : 2022/1/10 16:06
# @Function: Data_Analysis

import random
from pandas import DataFrame
from requests_html import HTMLSession

def get_html(url):
    """Fetch *url* with a randomly chosen desktop-browser User-Agent.

    Parameters
    ----------
    url : str
        The page address to request.

    Returns
    -------
    requests_html.HTMLResponse
        The raw response object; the caller inspects the status and
        parses ``html.html`` itself.
    """
    session = HTMLSession()
    # Pool of real browser User-Agent strings; rotating them per request
    # makes the scraper look less like a bot.
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0',
    ]
    # BUG FIX: the original did `random.sample(users.keys(), 1)` (raises
    # TypeError on Python 3.11+, where sample() requires a sequence) and then
    # sent the literal string "users[<n>]" as the User-Agent instead of the
    # actual agent string looked up from the dict.
    user_agent = random.choice(user_agents)
    # NOTE(review): the original also defined a `proxies` dict that was never
    # passed to the request, so it has been dropped here; re-add it via
    # session.get(..., proxies=proxies) if a proxy is actually required.
    html = session.get(url, headers={'user-agent': user_agent})
    return html

def save_to_csv(dict,page_name):
    """Write the collected comment columns to a CSV file.

    Parameters
    ----------
    dict : dict[str, list]
        Mapping of column name to an equal-length list of values.
        NOTE(review): the parameter name shadows the builtin ``dict``;
        kept as-is so the visible signature does not change.
    page_name : str
        Destination path of the CSV file.
    """
    frame = DataFrame(dict)
    # utf-8-sig prepends a BOM so Excel opens the Chinese text correctly;
    # index=False keeps the synthetic row index out of the file.
    frame.to_csv(page_name, index=False, encoding="utf-8-sig")
    print("数据保存到"+page_name+"成功,编码格式为‘utf-8-sig’")

if __name__ == '__main__':
    # Parallel column accumulators — one list per CSV column.
    Number_of_responses_list = []  # "useful" vote count
    author_list = []  # commenter nickname
    state_list = []  # watch state (watched / watching / want to watch)
    time_list = []  # publish time
    discuss_list = []  # comment text
    recommend_list = []  # star-rating label
    # Pagination offsets for the three comment categories:
    # status=P (watched), status=N (watching), status=F (want to watch).
    article_1 = 0
    article_2 = 0
    article_3 = 0

    # --- comments from users who have watched the show (status=P) ---
    while True:
        url = 'https://movie.douban.com/subject/27040807/comments?start='+str(article_1)+'&limit=20&status=P&sort=new_score'
        html = get_html(url)
        print("网页状态", html)
        # str(response) looks like "<Response [403]>"; [-5:-2] is the status code.
        if str(html)[-5:-2] == "403":
            print("403")
            break
        else:
            print(str(html)[-5:-2])
            list1 = html.html.xpath(r'/html/body/div[3]/div[1]/div/div[1]/div[4]/div')
            # BUG FIX: past the last page Douban answers 200 with zero comment
            # nodes; without this guard the loop would spin forever.
            if not list1:
                break
            for i in list1[:20]:
                content = i.text
                content_list = content.split("\n")
                discuss = content_list[1]
                # info layout: [votes, "有用", author, state, date, time]
                info = content_list[0].split(" ")
                Number_of_responses = info[0]  # useful-vote count
                # info[1] is the literal "有用" label — not needed.
                author = info[2]  # commenter nickname
                state = info[3]  # watch state
                time = info[4] + " " + info[5]  # publish time (date + clock)
                Number_of_responses_list.append(Number_of_responses)
                author_list.append(author)
                state_list.append(state)
                time_list.append(time)
                discuss_list.append(discuss)
            # The rating label lives in a separate <span>, fetched per comment.
            list2 = html.html.xpath(r"/html/body/div[3]/div[1]/div/div[1]/div[4]/div/div[2]/h3/span/span[2]")
            count_list = ["力荐","推荐","还行","较差","很差"]
            for j in list2:
                content = str(j)[-4:-2]
                if content not in count_list:
                    # Unrated comments on this page skew negative, so an
                    # absent rating is recorded as "较差" (poor).
                    recommend = "较差"
                    recommend_list.append(recommend)
                else:
                    recommend_list.append(content)
            article_1 += 20

    # --- comments from users currently watching (status=N) ---
    while True:
        url = 'https://movie.douban.com/subject/27040807/comments?start=' + str(article_2) + '&limit=20&status=N&sort=new_score'
        html = get_html(url)
        print("网页状态", html)
        if str(html)[-5:-2] == "403":
            print("403")
            break
        else:
            list3 = html.html.xpath(r'/html/body/div[3]/div[1]/div/div[1]/div[3]/div')
            # BUG FIX: stop when the category is exhausted (see first loop).
            if not list3:
                break
            for k in list3[:20]:
                content = k.text
                content_list = content.split("\n")
                discuss = content_list[1]
                info = content_list[0].split(" ")
                Number_of_responses = info[0]  # useful-vote count
                # info[1] is the literal "有用" label — not needed.
                author = info[2]  # commenter nickname
                # This category carries no state text; all are "在看" (watching).
                state = "在看"
                time = info[3] + " " + info[4]  # publish time (date + clock)
                Number_of_responses_list.append(Number_of_responses)
                author_list.append(author)
                state_list.append(state)
                time_list.append(time)
                discuss_list.append(discuss)
            list4 = html.html.xpath(r"/html/body/div[3]/div[1]/div/div[1]/div[3]/div/div[2]/h3/span[2]/span[1]")
            count_list = ["力荐", "推荐", "还行", "较差", "很差"]
            for l in list4:
                content = str(l)[-4:-2]
                if content not in count_list:
                    recommend = "较差"  # unrated comments default to "poor"
                    recommend_list.append(recommend)
                else:
                    recommend_list.append(content)
            article_2 += 20

    # --- comments from users who want to watch (status=F) ---
    while True:
        url = 'https://movie.douban.com/subject/27040807/comments?start=' + str(article_3) + '&limit=20&status=F&sort=new_score'
        html = get_html(url)
        print("网页状态", html)
        if str(html)[-5:-2] == "403":
            print("403")
            break
        else:
            list5 = html.html.xpath(r'/html/body/div[3]/div[1]/div/div[1]/div[3]/div')
            # BUG FIX: stop when the category is exhausted (see first loop).
            if not list5:
                break
            for m in list5[:20]:
                content = m.text
                content_list = content.split("\n")
                discuss = content_list[1]
                info = content_list[0].split(" ")
                Number_of_responses = info[0]  # useful-vote count
                # info[1] is the literal "有用" label — not needed.
                author = info[2]  # commenter nickname
                try:
                    state = info[3]  # watch state, when present
                    time = info[4] + " " + info[5]  # publish time
                except Exception as e:
                    # Some rows omit the state field, shifting the layout left.
                    state = "想看"
                    time = info[3] + " " + info[4]
                recommend = "想看"  # want-to-watch comments carry no rating
                Number_of_responses_list.append(Number_of_responses)
                author_list.append(author)
                state_list.append(state)
                time_list.append(time)
                discuss_list.append(discuss)
                recommend_list.append(recommend)
            article_3 += 20

    print("数据获取成功!!")
    dict = {'评论者': author_list, "是否看过": state_list,'评论内容': discuss_list,
                '有用投票': Number_of_responses_list, "评分": recommend_list, '时间': time_list}
    save_to_csv(dict,r"斗罗大陆1 豆瓣短评.csv")

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

TensorTinker

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值