import json
import requests
# Scrape movie URLs from Douban.
class Douban(object):
    """Crawl Douban's movie search endpoint and store title/URL pairs.

    Pages are requested from the ``search_subjects`` JSON endpoint (the
    ``tag`` query parameter is the URL-encoded string "欧美") until a page
    comes back with an empty ``subjects`` list.  Each movie is written to
    ``douban.json`` as one JSON object per line followed by a comma.

    Note that the resulting file is a sequence of comma-terminated
    objects, not a single valid JSON document; the format is kept as-is
    so that existing consumers of the file keep working.

    The instance owns an open file handle.  Call :meth:`close` (or use
    the instance as a context manager) to release it deterministically;
    ``__del__`` is only a last-resort fallback.
    """

    # Number of results per page; must match ``page_limit`` in ``self.url``.
    page_size = 20
    # Seconds to wait for the server before giving up, so that a stalled
    # connection cannot hang the crawl forever.
    timeout = 10

    def __init__(self):
        """Set up the request template, headers, output file and offset."""
        self.url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E6%AC%A7%E7%BE%8E&sort=recommend&page_limit=20&page_start={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'Referer': 'https://movie.douban.com/explore',
        }
        self.page_start = 0
        # Records are dumped with ensure_ascii=False, so the file must be
        # opened with an explicit encoding instead of the locale default.
        self.file = open('douban.json', 'w', encoding='utf-8')

    def __enter__(self):
        """Return the crawler itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the output file when the ``with`` block ends."""
        self.close()
        return False

    def __del__(self):
        """Fallback cleanup; safe even if ``__init__`` never opened the file."""
        self.close()

    def close(self):
        """Close the output file.  Calling it more than once is harmless."""
        file = getattr(self, 'file', None)
        if file is not None and not file.closed:
            file.close()

    def get_data(self, url):
        """Fetch ``url`` with the configured headers and return the body text.

        The HTTP status code is printed.  The response bytes are decoded
        with ``bytes.decode()``'s default encoding (UTF-8).
        """
        resp = requests.get(url, headers=self.headers, timeout=self.timeout)
        print(resp.status_code)
        return resp.content.decode()

    def parse_data(self, data):
        """Extract ``title`` and ``url`` from every entry under ``subjects``.

        Args:
            data: JSON text containing a top-level ``subjects`` list.

        Returns:
            A list of ``{'title': ..., 'url': ...}`` dicts, one per movie.

        Raises:
            KeyError: if ``subjects``, ``title`` or ``url`` is missing.
            json.JSONDecodeError: if ``data`` is not valid JSON.
        """
        subjects = json.loads(data)['subjects']
        print(subjects)
        data_list = [
            {'title': movie['title'], 'url': movie['url']}
            for movie in subjects
        ]
        print(data_list)
        return data_list

    def save_data(self, data_list):
        """Append each record to the output file as ``<json>,`` plus newline."""
        self.file.writelines(
            json.dumps(record, ensure_ascii=False) + ',\n'
            for record in data_list
        )

    def run(self):
        """Fetch, parse and save successive pages until one is empty."""
        while True:
            url = self.url.format(self.page_start)
            data_list = self.parse_data(self.get_data(url))
            self.save_data(data_list)
            self.page_start += self.page_size
            if not data_list:
                break
        # Push buffered records to disk without waiting for close().
        self.file.flush()
if __name__ == '__main__':
    # Script entry point: build the crawler, then walk every result page.
    crawler = Douban()
    crawler.run()