需要用到的库
1.requests
2.re(正则表达式库)
部分参数
请求头:
此处使用从火狐浏览器复制的请求头
myheader = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
"Host": "movie.douban.com"
}
标记电影次序:time,初始化为1
排行榜第 i+1 页(i 从 0 开始计数):link = 'https://movie.douban.com/top250?start=' + str(i * 25)
正则匹配结果:matchObj
import requests
import re
def get_movies(pages: int = 10, output_path: str = "res.txt") -> None:
    """Fetch Douban Top 250 chart pages and append the numbered titles to a file.

    Each chart page is requested at ``top250?start=<page_index * 25>``, its HTTP
    status code is printed, and every title matched on the page is appended to
    *output_path* as ``<rank>:<title>``, one per line. The rank counter starts
    at 1 and keeps increasing across pages.

    Args:
        pages: Number of chart pages to request, starting from the first.
            The default of 10 matches the previously hard-coded loop bound.
        output_path: File the titles are appended to, encoded as UTF-8. It is
            opened in append mode, so existing content is kept.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` (for
            example on a timeout or connection failure).
        OSError: If *output_path* cannot be opened for appending.
    """
    # Request headers (User-Agent string copied from Firefox).
    myheader = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
        "Host": "movie.douban.com",
    }
    # Captures the text of <span class="title"> elements. `[^&]*` rejects any
    # span whose text contains '&' -- presumably to drop the
    # "&nbsp;/&nbsp;<alternate title>" spans; confirm against the page markup.
    title_pattern = re.compile(r'(?<=<span class="title">)[^&]*(?=</span>)')

    rank = 1  # Running position of the next title written to the file.
    with open(output_path, "a", encoding='utf-8') as f:
        for i in range(pages):
            # Each chart page is addressed by an offset that is a multiple of 25.
            link = 'https://movie.douban.com/top250?start=' + str(i * 25)
            r = requests.get(link, headers=myheader, timeout=10)
            # Report the page number (1-based) and its HTTP status code.
            print(str(i+1), "code:", r.status_code)
            if r.status_code != 200:
                # An error response is not a chart page; do not scan its body.
                continue
            for title in title_pattern.findall(r.text):
                f.write(str(rank) + ':' + title + '\n')
                rank += 1
# Call the function to start the scrape.
get_movies()