"""Collect the movie titles of every genre on Douban's chart page into an Excel file."""
import json
import re
from pathlib import Path

import requests
from lxml import etree
from openpyxl import Workbook

# Seconds to wait on the server. requests applies no timeout by default, so
# without this a stalled connection would block the script forever.
REQUEST_TIMEOUT = 10

# Pulls the numeric genre id out of a genre link's query string.
TYPE_ID_PATTERN = re.compile(r'type=(\d+)')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}
base_url = 'https://movie.douban.com/chart'
movies_count_url = 'https://movie.douban.com/j/chart/top_list_count?type={}&interval_id=100%3A90'
movies_url = 'https://movie.douban.com/j/chart/top_list?type={}&interval_id=100%3A90&action=&start=0&limit={}'
output_path = './data/豆瓣电影大全.xlsx'


def _fetch(url: str) -> requests.Response:
    """Return the response of a GET on *url*.

    Raises:
        requests.Timeout: the server did not answer within REQUEST_TIMEOUT.
        requests.HTTPError: the server answered with a 4xx/5xx status.
    """
    response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Stop on an error status here rather than failing later with an
    # unrelated-looking JSON or KeyError while parsing an error page.
    response.raise_for_status()
    return response


wb = Workbook()
ws = wb.active
ws.append(['电影名称'])  # header row of the single title column

# The chart page lists one <span><a href="...type=N...">genre</a></span> per genre.
html = etree.HTML(_fetch(base_url).text)
span_list = html.xpath('//div[@class="types"]/span')

for span in span_list:
    titles = span.xpath('./a/text()')
    hrefs = span.xpath('./a/@href')
    if not titles or not hrefs:
        # A span without a usable link carries no genre; skip it.
        continue
    big_title = titles[0]

    id_match = TYPE_ID_PATTERN.search(hrefs[0])
    if id_match is None:
        # Link without a numeric type id: nothing to query for this entry.
        continue
    type_id = id_match.group(1)

    # Ask how many movies the genre holds, then request all of them in one
    # page by passing that total as the limit.
    resp = _fetch(movies_count_url.format(type_id))
    movies_count = json.loads(resp.content)['total']

    res = _fetch(movies_url.format(type_id, movies_count))
    movies_list = json.loads(res.content)

    print(big_title)
    for movies in movies_list:
        ws.append([movies['title']])

# Saving fails if the target directory is missing, so create it first.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
wb.save(output_path)
爬取豆瓣电影各个类型的电影名称
最新推荐文章于 2024-10-09 21:53:44 发布