python-关于爬虫爬取豆瓣电影网址

import json

import requests
# Scrape movie titles and URLs from Douban Movies.
class Douban(object):
    """Page through Douban's movie search endpoint and save title/URL pairs.

    Results are requested 20 at a time. For every movie returned, its
    ``title`` and ``url`` are written to ``douban.json`` as one JSON object
    per line, each line terminated by ``,\\n``. (The file as a whole is
    therefore a sequence of objects, not a single valid JSON document; the
    format is kept as-is for compatibility.)
    """

    # Seconds to wait on the server before a request is abandoned. Without a
    # timeout a stalled connection would block the crawl loop forever.
    timeout = 10

    def __init__(self):
        # ``page_start`` is filled in per request; the ``tag`` parameter is
        # the percent-encoded UTF-8 form of the string "欧美".
        self.url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E6%AC%A7%E7%BE%8E&sort=recommend&page_limit=20&page_start={}'
        self.headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'Referer': 'https://movie.douban.com/explore',
        }
        # Explicit UTF-8: save_data() writes non-ASCII text unescaped, which
        # fails on platforms whose default file encoding cannot represent it.
        self.file = open('douban.json', 'w', encoding='utf-8')
        # Offset of the next page to request.
        self.page_start = 0

    def get_data(self, url):
        """Fetch *url* and return the response body decoded as UTF-8 text.

        The HTTP status code is printed before it is checked.

        Raises:
            requests.HTTPError: the server answered with a 4xx/5xx status.
            requests.Timeout: no response arrived within ``self.timeout``.
        """
        resp = requests.get(url, headers=self.headers, timeout=self.timeout)
        print(resp.status_code)
        # Fail loudly on an error page instead of handing it to json.loads().
        resp.raise_for_status()
        return resp.content.decode('utf-8')

    def parse_data(self, data):
        """Extract ``title`` and ``url`` for each movie in a JSON response.

        Args:
            data: JSON text containing a top-level ``subjects`` list whose
                items each have ``title`` and ``url`` keys.

        Returns:
            A list of ``{'title': ..., 'url': ...}`` dicts, in response order;
            empty when ``subjects`` is empty.

        Raises:
            KeyError: ``subjects``, ``title`` or ``url`` is missing.
            json.JSONDecodeError: *data* is not valid JSON.
        """
        resp_dict = json.loads(data)
        result = resp_dict['subjects']
        print(result)
        data_list = [
            {'title': movie['title'], 'url': movie['url']}
            for movie in result
        ]
        print(data_list)
        return data_list

    def save_data(self, data_list):
        """Write each dict in *data_list* to the output file, one per line."""
        self.file.writelines(
            json.dumps(data, ensure_ascii=False) + ',\n'
            for data in data_list
        )
        # Push the page to disk so finished pages survive a later failure.
        self.file.flush()

    def close(self):
        """Close the output file. Safe to call repeatedly."""
        # ``file`` is absent if open() failed inside __init__.
        file = getattr(self, 'file', None)
        if file is not None:
            file.close()

    def __del__(self):
        # Last-resort cleanup; callers should prefer close().
        self.close()

    def run(self):
        """Fetch, parse and save page after page until one comes back empty."""
        while True:
            url = self.url.format(self.page_start)
            data = self.get_data(url)
            data_list = self.parse_data(data)
            self.save_data(data_list)
            self.page_start += 20
            # An empty page means the listing is exhausted.
            if not data_list:
                break

if __name__ == '__main__':
    # Script entry point: crawl every page, then close the output file
    # explicitly rather than relying on when the object is garbage-collected.
    douban = Douban()
    try:
        douban.run()
    finally:
        douban.file.close()

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值