# Enough talk — straight to the code: crawl Douban Movie Top 250 and save it to CSV.
import requests
from bs4 import BeautifulSoup
import csv
import time
def crawl_douban_movie_top_250():
    """Crawl the Douban Movie Top 250 list and save it to a CSV file.

    Fetches all 10 pages (25 movies each), extracts title, summary, rating,
    rating count and the one-line quote for every movie, then delegates the
    CSV writing to ``save_to_csv``.

    Raises:
        requests.HTTPError: if any page returns a non-2xx status code.
        requests.Timeout: if a request exceeds the 10-second timeout.
    """
    base_url = 'https://movie.douban.com/top250?start='
    # User-Agent header so the request looks like a real browser; Douban
    # blocks the default python-requests UA.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
    }
    movie_list = []
    for page in range(10):
        # Pagination: each page starts at a multiple of 25 (0, 25, ..., 225).
        url = base_url + str(page * 25)
        # Fix: add a timeout so a stalled connection cannot hang forever,
        # and fail loudly on HTTP errors instead of parsing an error page.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for movie_item in soup.select('.item'):
            title = movie_item.select_one('.title').text.strip()
            info = movie_item.select_one('.bd p').text.strip()
            rating = movie_item.select_one('.rating_num').text.strip()
            # Fix: the original used str.strip('人评价'), which strips a
            # *character set* from both ends and left surrounding whitespace.
            # Remove the literal "人评价" suffix explicitly instead.
            raw_count = movie_item.select_one('.star span:last-child').text.strip()
            rating_count = raw_count[:-len('人评价')] if raw_count.endswith('人评价') else raw_count
            # Some movies have no one-line quote (.inq); fall back to ''.
            quote_tag = movie_item.select_one('.inq')
            quote = quote_tag.text.strip() if quote_tag else ''
            movie_list.append([title, info, rating, rating_count, quote])
            print(f'成功爬取电影:{title}')
        # Delay between pages to avoid being rate-limited / blocked.
        time.sleep(2)
    save_to_csv(movie_list)
def save_to_csv(movie_list, filename='douban_movie_top_250.csv'):
    """Write the scraped movie rows to a CSV file with a Chinese header row.

    Args:
        movie_list: list of rows, each ``[title, info, rating, count, quote]``.
        filename: output path; defaults to the original hard-coded name so
            existing callers are unaffected.
    """
    # utf-8-sig adds a BOM so Excel opens the Chinese text correctly;
    # newline='' is required by the csv module to avoid blank lines on Windows.
    with open(filename, 'w', encoding='utf-8-sig', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['电影名称', '简介', '评分', '评价人数', '引言'])
        writer.writerows(movie_list)
# Entry point: only run the crawler when executed as a script, not on import.
if __name__ == '__main__':
    crawl_douban_movie_top_250()