本文章目的是爬取豆瓣电影top250
导入所需的包
import requests
import re
import csv
from requests import RequestException
发出请求,获取响应
def get_url(url):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text when the server replies with HTTP 200;
        ``None`` for any other status code or when the request raises
        a ``RequestException`` (connection error, timeout, ...).
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    try:
        # The dict must be passed as ``headers=``: the second positional
        # argument of ``requests.get`` is ``params``, which would append the
        # User-Agent to the query string instead of sending it as a header.
        # The timeout keeps a stalled connection from blocking forever; a
        # timeout is raised as a RequestException subclass and handled below.
        response = requests.get(url, headers=header, timeout=10)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
解析响应
def parse_html(html):
    """Extract movie records from the HTML of one listing page.

    Args:
        html: Page source as a string. ``None`` or an empty string is
            treated as a page without entries.

    Returns:
        A list of dicts, one per regex match, each with the keys
        ``image``, ``name``, ``actor``, ``date`` and ``score`` (all values
        are the raw captured strings). Empty list when nothing matches.
    """
    # A failed download yields None upstream; re would raise TypeError on it.
    if not html:
        return []
    prog = re.compile(
        r'<li>.*?<img width="100".*?src="(.*?)".*?</a>'  # cover image
        '.*?title">(.*?)</span>'  # movie title
        '.*?</a>.*?; (.*?)<br>'  # starring
        '(.*?) .*?</p>'  # release date
        '.*?average">(.*?)</span>'  # rating
        '.*?</li>',
        re.S,  # let "." match newlines, since an entry spans several lines
    )
    keys = ('image', 'name', 'actor', 'date', 'score')
    # findall returns one tuple of the five captured groups per match.
    return [dict(zip(keys, item)) for item in prog.findall(html)]
保存数据
def save_data(content):
    """Append movie records to ``top_250.csv`` in the working directory.

    Args:
        content: Iterable of dicts with the keys ``image``, ``name``,
            ``actor``, ``date`` and ``score``; each dict becomes one CSV
            row in that column order.
    """
    # Nothing to write: leave the file untouched (and uncreated).
    if not content:
        return
    # Open once for the whole batch instead of once per row. newline='' is
    # what the csv module requires; without it every row is followed by a
    # blank line on Windows.
    with open('top_250.csv', 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(
            [item['image'], item['name'], item['actor'], item['date'], item['score']]
            for item in content
        )
程序入口
if __name__ == '__main__':
    # Start a fresh CSV containing only the column titles; save_data()
    # appends the rows afterwards.
    header = ['封面', '电影名', '主演', '上映时间', '评分']
    # newline='' is required by the csv module (avoids blank lines on Windows).
    with open('top_250.csv', 'w', encoding='utf-8', newline='') as f:
        write = csv.writer(f)
        write.writerow(header)
    # Ten pages, with the start offset advancing by 25 per page.
    for i in range(10):
        url = 'https://movie.douban.com/top250?start=' + str(i * 25) + '&filter='
        # Send the request and get the response.
        html = get_url(url)
        if html is None:
            # Download failed or returned a non-200 status: skip this page
            # rather than handing None to the parser.
            continue
        # Parse the response.
        content = parse_html(html)
        # Save the data.
        save_data(content)