# Reference: "2021年最新Python爬虫教程+实战项目案例(最新录制)" (video tutorial on bilibili).
# Exercise: scrape movie information from the Douban Movie Top 250 list.
import re
import requests
# 1. Fetch the page
url = "https://movie.douban.com/top250"
head = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Mobile Safari/537.36 Edg/95.0.1020.53"
}

# 2. Parse the data
# Separator between the year / country / genre fields.  The raw HTML may carry
# the padding around "/" either as whitespace or as "&nbsp;" entities (which
# ``resp.text`` does not decode), so both spellings are accepted.
_SEP = r'(?:&nbsp;|\s)+/(?:&nbsp;|\s)+'

# One match per list item.  ``re.S`` lets "." cross newlines, so every lazy
# ".*?" can skip over the markup between two captured fields.
obj = re.compile(
    "".join((
        r'<li>.*?<div class="item">.*?<span class="title">(?P<title>.*?)</span>.*?',
        r'<div class="bd">.*?<p class="">(?P<director>.*?)<br>(?P<year>.*?)',
        _SEP, r'(?P<country>.*?)', _SEP, r'(?P<type>.*?)</p>.*?',
        r'<span>(?P<person>.*?)人评价</span>',
        # The quote is optional.  The tempered dot ``(?:(?!<li>).)`` refuses to
        # walk past the next "<li>", so an item without a quote can no longer
        # swallow the following item while hunting for one.
        r'(?:(?:(?!<li>).)*?<span class="inq">(?P<inq>.*?)</span>)?',
    )),
    re.S,
)


def fetch_html(timeout=10):
    """Download ``url`` with the ``head`` request headers and return the body text.

    Args:
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The response body decoded as text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    resp = requests.get(url, headers=head, timeout=timeout)
    # Fail loudly instead of silently parsing an error page into zero results.
    resp.raise_for_status()
    return resp.text


def parse_movies(html):
    """Extract one record per movie list item found in ``html``.

    Args:
        html: Page markup as a string.

    Returns:
        A list of dicts with the keys ``title``, ``director``, ``year``,
        ``country``, ``type``, ``person`` and ``inq``.  ``director``, ``year``
        and ``type`` are stripped of surrounding whitespace; ``inq`` is an
        empty string when the item has no quote.
    """
    return [
        {
            "title": match.group("title"),
            "director": match.group("director").strip(),
            "year": match.group("year").strip(),
            "country": match.group("country"),
            "type": match.group("type").strip(),
            "person": match.group("person"),
            # The optional group yields None when the item has no quote.
            "inq": match.group("inq") or "",
        }
        for match in obj.finditer(html)
    ]


def main():
    """Fetch the page and print every parsed movie, one field per line."""
    for movie in parse_movies(fetch_html()):
        print(movie["title"])
        print(movie["director"])
        print(movie["year"])
        print(movie["country"])
        print(movie["type"])
        print(movie["person"] + "人评价")
        print(movie["inq"])
        print("")


if __name__ == "__main__":
    main()