"""Case study: scrape the Douban Movie Top 250 page.

Steps:
1. Fetch the page with ``requests``.
2. Extract each film's title, year, rating and number of ratings with a
   regular expression.
3. Extension: write the extracted rows to a CSV file.
"""
import csv
import re
from typing import Dict, Iterable, Iterator

URL = "https://movie.douban.com/top250"

# A browser-like User-Agent is sent with the request.
# NOTE(review): presumably the site rejects the default client UA -- confirm.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0(Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.5735.289 Safari/537.36"
    ),
}

OUTPUT_PATH = "data.csv"

# One match per <li> film entry; re.S lets "." span the newlines between tags.
# The year is terminated by "&nbsp" (or an already-decoded U+00A0) rather than
# by plain whitespace, because the text after <br> may start with a newline
# and indentation, which would otherwise end the lazy group immediately.
# NOTE(review): assumes the year field is followed by "&nbsp;" in the page
# markup -- verify against the live page if the output looks wrong.
MOVIE_PATTERN = re.compile(
    r'<li>.*?<span class="title">(?P<name>.*?)</span>'
    r'.*?<br>(?P<year>.*?)(?:&nbsp|\xa0).*?'
    r'<span class="rating_num" property="v:average">'
    r'(?P<score>.*?)</span>.*?'
    r'<span>(?P<num>.*?)人评价</span>',
    re.S,
)


def fetch_page(url: str = URL, timeout: float = 10.0) -> str:
    """Download *url* and return the response body as text.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    # Imported lazily so the parsing/writing helpers stay importable even
    # when the third-party ``requests`` package is not installed.
    import requests

    # The context manager closes the connection even if an error is raised.
    with requests.get(url, headers=HEADERS, timeout=timeout) as response:
        # Fail loudly instead of silently producing an empty CSV.
        response.raise_for_status()
        return response.text


def parse_movies(page_content: str) -> Iterator[Dict[str, str]]:
    """Yield one dict per film found in *page_content*.

    Each dict has the keys ``name``, ``year``, ``score`` and ``num`` (in that
    order); ``year`` has surrounding whitespace stripped.
    """
    for match in MOVIE_PATTERN.finditer(page_content):
        movie = match.groupdict()
        movie["year"] = movie["year"].strip()
        yield movie


def write_rows(movies: Iterable[Dict[str, str]], path: str = OUTPUT_PATH) -> None:
    """Write the values of each dict in *movies* as one CSV row to *path*."""
    # newline="" is required by the csv module to avoid blank lines on
    # Windows; an explicit encoding keeps the output independent of locale.
    with open(path, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        for movie in movies:
            writer.writerow(movie.values())


def main() -> None:
    """Fetch the Top 250 page, parse it and write the result to data.csv."""
    page_content = fetch_page(URL)
    write_rows(parse_movies(page_content), OUTPUT_PATH)
    print("over!")


if __name__ == "__main__":
    main()