网络爬虫(二)

爬取电影《海王》2018-01-01 至 2019-01-01 期间的影评

导包

import json
import time
from datetime import datetime, timedelta
import requests

发送请求,获取响应

# Send the request and return the response body
def get_data(url, timeout=10):
    """Fetch *url* with a desktop-browser User-Agent and return the body text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the status code is 200, otherwise ``None``.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` on
            connection errors or when the timeout expires.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.text

解析响应

# Parse the response
def parse_html(html):
    """Extract the comment records from a JSON response body.

    Args:
        html: JSON text whose top-level object has a ``'cmts'`` list.

    Returns:
        A list of dicts with the keys ``id``, ``cityName``, ``content``,
        ``score`` and ``startTime``. A missing ``cityName`` becomes a single
        space, and newlines inside ``content`` are replaced by spaces.
    """
    payload = json.loads(html)  # decode the JSON text into Python objects
    return [
        {
            'id': entry['id'],
            'cityName': entry.get('cityName', ' '),
            'content': entry['content'].replace('\n', ' '),
            'score': entry['score'],
            'startTime': entry['startTime'],
        }
        for entry in payload['cmts']
    ]

主函数

# Main routine
def main(id, start_time, end_time):
    """Crawl the comments of movie *id*, walking backwards in time.

    The current cursor is sent as the ``startTime`` query parameter. After a
    page has been saved, the cursor moves to one second before the last
    comment of that page. Crawling stops when the cursor is no longer later
    than *end_time* or when a page comes back empty.

    Timestamps are compared as strings, which is only valid for the
    fixed-width ``'YYYY-MM-DD HH:MM:SS'`` format used here.

    Args:
        id: Movie identifier used in the comments URL.
        start_time: Newest timestamp to crawl from, ``'YYYY-MM-DD HH:MM:SS'``.
        end_time: Oldest timestamp to keep, in the same format.

    Raises:
        RuntimeError: If several consecutive requests fail or return a body
            that cannot be parsed.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    url_prefix = ('http://m.maoyan.com/mmdb/comments/movie/' + str(id)
                  + '.json?_v=yes&offset=0&startTime=')
    max_failures = 5  # consecutive failed attempts tolerated before giving up
    failures = 0

    while start_time > end_time:
        url = url_prefix + start_time.replace(' ', '%20')

        # Send the request; any failure here is retried after a short pause.
        try:
            html = get_data(url)
        except Exception:
            html = None

        # Parse the response; a non-JSON or unexpected body counts as a failure.
        comments = None
        if html is not None:
            try:
                comments = parse_html(html)
            except (ValueError, KeyError, TypeError):
                comments = None

        if comments is None:
            failures += 1
            if failures >= max_failures:
                raise RuntimeError(
                    'giving up after %d consecutive failed requests: %s'
                    % (failures, url))
            time.sleep(0.5)
            continue

        failures = 0
        time.sleep(0.1)  # short pause between successful requests

        if not comments:
            # Nothing older is available, so the crawl is complete.
            break

        # Save the page, opening the file once per page instead of once per
        # comment, and skip comments older than the requested range.
        lines = [
            '\t'.join((str(item['id']), item['cityName'], item['content'],
                       str(item['score']), item['startTime'])) + '\n'
            for item in comments
            if item['startTime'] >= end_time
        ]
        if lines:
            with open('海王影评2018.txt', 'a', encoding='utf-8') as f:
                f.writelines(lines)

        # Move the cursor to one second before the last comment of the page.
        # Using the last element rather than index 14 keeps short pages from
        # raising IndexError. Assumes the page is ordered newest first -
        # TODO confirm against the API.
        oldest = datetime.strptime(comments[-1]['startTime'], time_format)
        start_time = (oldest - timedelta(seconds=1)).strftime(time_format)

程序入口

if __name__ == '__main__':
    # Example request URL:
    # http://m.maoyan.com/mmdb/comments/movie/249342.json?_v=yes&offset=0&startTime=2019-01-01%2000:00:00
    crawl_args = {
        'id': 249342,
        'start_time': '2019-01-01 00:00:00',
        'end_time': '2018-01-01 00:00:00',
    }
    main(**crawl_args)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值