import requests #爬虫请求包
import re #正则表达式的包
import csv #导入csv包,写入数据
# Request headers that imitate a desktop browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"
}

# Each listing page holds 25 movies, so 10 pages cover all 250 entries.
PAGE_SIZE = 25
PAGE_COUNT = 10

# CSV column order; these are the named groups of `obj`, in pattern order.
# NOTE: the group called "actor" actually captures the text after "导演:"
# (the director); the name is kept so existing users of `obj` still work.
FIELDS = ("title", "actor", "year", "star", "num")

# Compiled once up front because the same pattern is applied to every page.
# NOTE(review): the director and year captures end at the literal "&nbsp;"
# entity. A plain-space terminator makes both lazy groups match nothing,
# because "导演:" and "<br>" are each followed directly by whitespace.
# This assumes the page separates those fields with "&nbsp;" and that
# requests leaves HTML entities undecoded -- confirm against the live page.
obj = re.compile(
    r'<li>.*?<span class="title">(?P<title>.*?)</span>'
    r'.*?导演:(?P<actor>.*?)&nbsp;'
    r'.*?<br>(?P<year>.*?)&nbsp;'
    r'.*?<span class="rating_num" property="v:average">(?P<star>.*?)</span>'
    r'.*?<span>(?P<num>.*?)人评价</span>',
    re.S,
)


def page_url(page_index: int) -> str:
    """Return the listing URL for the zero-based page ``page_index``.

    The ``start`` query parameter is an item offset, not a page number, so
    it has to advance by ``PAGE_SIZE`` per page (0, 25, 50, ...).
    """
    start = page_index * PAGE_SIZE
    return f"https://movie.douban.com/top250?start={start}&filter="


def parse_movies(html: str) -> list:
    """Extract one row per movie found in ``html``.

    Returns a list of ``[title, actor, year, star, num]`` lists (the order of
    ``FIELDS``), each value stripped of surrounding whitespace. Returns an
    empty list when nothing matches.
    """
    return [
        [value.strip() for value in match.group(*FIELDS)]
        for match in obj.finditer(html)
    ]


def main() -> None:
    """Download every listing page and write the movies to ``top250.csv``.

    Raises:
        requests.HTTPError: if a page responds with an error status, so a
            blocked or failed request is not mistaken for an empty page.
    """
    # newline="" prevents blank lines between rows; utf-8-sig writes a BOM
    # (presumably so spreadsheet tools detect the encoding of the CJK text).
    with open("top250.csv", mode="w", newline="", encoding="utf-8-sig") as f:
        csvwriter = csv.writer(f)
        for page_index in range(PAGE_COUNT):
            # A timeout keeps one stalled connection from hanging the run.
            resp = requests.get(page_url(page_index), headers=headers, timeout=10)
            resp.raise_for_status()
            csvwriter.writerows(parse_movies(resp.text))
    print("over!")


if __name__ == "__main__":
    main()
03-31
1544
![](https://csdnimg.cn/release/blogv2/dist/pc/img/readCountWhite.png)
06-05
633
![](https://csdnimg.cn/release/blogv2/dist/pc/img/readCountWhite.png)