用Chrome(切换为安卓手机模式)到达豆瓣英美剧页面
搜索item得到json文件位置和requests请求url
爬虫中设置自己的header(要有referer,不然爬不到)并且删除多余的url
self.url_temp = "https://m.douban.com/rexxar/api/v2/subject_collection/tv_american/items?os=android&for_mobile=1&start={}&count=18&loc_id=108288&_"
self.headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Mobile Safari/537.36",
"Referer":"https://m.douban.com/tv/american"}
代码
import requests
import json
class DoubanSpider:
    """Crawl Douban's mobile API for the American-TV collection and append
    each item as one JSON line (UTF-8, unescaped) to ``douban.txt``.

    Pagination is 18 items per request; crawling stops at the first page
    that returns fewer than 18 items.
    """

    def __init__(self):
        # API endpoint; ``start={}`` is filled with the paging offset.
        self.url_temp = "https://m.douban.com/rexxar/api/v2/subject_collection/tv_american/items?os=android&for_mobile=1&start={}&count=18&loc_id=108288&_"
        # Mobile UA plus Referer: per the notes above the code, the endpoint
        # rejects requests that lack a Referer header.
        self.headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Mobile Safari/537.36",
                        "Referer": "https://m.douban.com/tv/american"}

    def pares_url(self, url):
        """Fetch *url* and return the decoded response body as a string.

        Raises:
            requests.Timeout: if the server does not respond within 10s
                (previously a stalled connection could hang forever).
            requests.HTTPError: on a 4xx/5xx status, failing fast instead
                of producing a confusing JSON-decode error later.
        """
        print(url)
        response = requests.get(url, headers=self.headers, timeout=10)
        response.raise_for_status()
        return response.content.decode()

    # Correctly-spelled alias; ``pares_url`` (typo) is kept for backward
    # compatibility with existing callers.
    parse_url = pares_url

    def get_content_list(self, json_str):
        """Parse the API's JSON payload and return the list of items.

        Raises:
            KeyError: if the payload lacks "subject_collection_items".
        """
        dict_ret = json.loads(json_str)
        content_list = dict_ret["subject_collection_items"]
        return content_list

    def save_content_list(self, content_list):
        """Append each item of *content_list* as one JSON line to douban.txt."""
        with open("douban.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")
        print('保存成功')

    def run(self):
        """Page through the whole collection, saving every page of items."""
        num = 0
        while True:
            url = self.url_temp.format(num)
            json_str = self.pares_url(url)
            content_list = self.get_content_list(json_str)
            self.save_content_list(content_list)
            # A short page (fewer than the requested 18) is the last page.
            if len(content_list) < 18:
                break
            num += 18
if __name__ == "__main__":
    # Entry point: build the spider and crawl every page of the collection.
    spider = DoubanSpider()
    spider.run()