Scraping Douban Data
Scraping Douban film and TV listings for Korean dramas, British dramas, and other tags
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 27 10:26:36 2020
@author: Administrator
"""
import requests
import json
import os

os.chdir(r"H:\01\spyder")


class Douban:
    def __init__(self, tag_list):
        self.url_temp = "https://movie.douban.com/j/search_subjects?type=movie&tag={}&sort=recommend&page_limit=20&page_start={}"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"}
        self.tag_list = tag_list

    def parse_url(self, url):
        # Send the request and return the response body as a UTF-8 string
        response = requests.get(url, headers=self.headers)
        return response.content.decode(encoding="utf-8")

    def get_content_list(self, json_str):
        # Parse the JSON response and pull out the list of subjects
        dict_ret = json.loads(json_str)
        content_list = dict_ret["subjects"]
        return content_list

    def save_content(self, content_list, tag, num):
        # Save each page to its own file so pages are not mixed together
        filepath = "douban{}_{}.txt".format(tag, num)
        with open(filepath, "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")

    def run(self):
        for tag in self.tag_list:
            num = 0
            while True:
                # 1. Build the URL for the current page
                url_temp = self.url_temp.format(tag, num)
                print(tag)
                # 2. Send the request and get the response
                json_str = self.parse_url(url_temp)
                # 3. Extract the data
                content_list = self.get_content_list(json_str)
                # 4. Save the data, page by page, one file per page
                self.save_content(content_list, tag, num // 20)
                # 5. Print the URL that was just requested
                print(url_temp)
                # 6. Move to the next page, or stop once a partial page comes back
                if len(content_list) < 20:
                    break
                else:
                    num += 20


if __name__ == "__main__":
    douban = Douban(["韩剧", "英剧"])
    douban.run()
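Since save_content writes one JSON object per line (JSON Lines), the saved pages can be read back directly. A minimal sketch, assuming the first page of the 韩剧 tag was saved as douban韩剧_0.txt in the same working directory, and that each subject carries the usual title and rate fields returned by this endpoint:

import json

# Read one saved page back into a list of dicts (one JSON object per line)
with open("douban韩剧_0.txt", encoding="utf-8") as f:
    subjects = [json.loads(line) for line in f if line.strip()]

# Print a couple of fields from each subject; .get() avoids KeyError if a field is missing
for subject in subjects:
    print(subject.get("title"), subject.get("rate"))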
-----Room for improvement: Douban only lets a single IP fetch about 18 pages, i.e. just under 360 records
—— Everyone is welcome to discuss this together
---------------------Video study notes, 小酥仙儿, 2020.2.27-----------------------------------
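One way to soften the per-IP limit is to slow the requests down and, if available, route them through a proxy. Below is a minimal sketch of a drop-in replacement for parse_url; the 2-second delay and the proxy address are placeholder assumptions, not values from the original post:

import time
import requests

def parse_url(self, url):
    # Pause between requests so a single IP does not hit Douban's page limit as quickly
    time.sleep(2)
    # Optionally send the request through a proxy; replace with a real proxy address
    proxies = {"https": "https://127.0.0.1:8888"}  # placeholder only, not a working proxy
    response = requests.get(url, headers=self.headers, proxies=proxies)
    return response.content.decode(encoding="utf-8")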