python爬取豆瓣电影榜单
使用 Python 爬取豆瓣电影榜单并保存到本地 Excel 文件中,以后就不愁没片看了。
目标
确定我们想要抓取的电影的相关内容。
- 抓取豆瓣 Top 250 电影的排名、电影名、评价(总结很到位)、评分、点评人数及电影的豆瓣页面链接。
- 抓取各种电影类型的排行榜前100。
编码
省略从需求到编码之间的繁文缛节,直接上手编码。(此处给出的是最终代码)
目标一:使用 BeautifulSoup 解析页面并查找元素。
目标二:调用接口并处理返回的 JSON 数据。
import requests
import openpyxl
import json
from bs4 import BeautifulSoup
from openpyxl.styles import Color, Font, Alignment
class DouBanMovieList1():
def __init__(self):
self.path = r'D:\Download\豆瓣电影榜单\豆瓣电影.xlsx'
def get_moviedata(self):
data = []
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
for i in range(10):
url = 'https://movie.douban.com/top250?start={}&filter='.format(i*25)
response = requests.get(url=url, headers=headers)
bs = BeautifulSoup(response.text, 'lxml')
ranks = bs.select('em')
titles = bs.find_all('div', class_='hd')
evaluations = []
for j in range(1, 26):
if bs.select_one('#content > div > div.article > ol > li:nth-child(%d) > div > div.info > div.bd > p.quote > span'%(j)):
evaluations.append(bs.select_one('#content > div > div.article > ol > li:nth-child(%d) > div > div.info > div.bd > p.quote > span'%(j)).get_text())
else:
evaluations.append('')
ratings = bs.find_all('span', class_='rating_num')
evaluation_numbers = bs.find_all('div', class_='star')
links = bs.select('ol li div a')
for rank, title, evaluation, rating, evaluation_number, link in zip(ranks, titles, evaluations, ratings, evaluation_numbers, links):
data.append([rank.get_text(), title.get_text().split('\n')[2], evaluation, rating.get_text().strip(), evaluation_number.get_text().split('\n')[4].strip('人评价'), link.get('href')]