# 需求:
# 获取猫眼电影TOP100榜中所有电影的信息(排名、电影名、演员、上映时间以及评分)
import requests,re
# Request headers: every request is sent with a desktop-browser User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}

# Pagination rule of the board:
#   page 1: https://maoyan.com/board/4
#   page 2: https://maoyan.com/board/4?offset=10
#   page 3: https://maoyan.com/board/4?offset=20
#   page n: https://maoyan.com/board/4?offset=(n-1)*10
# Query-string parameters; 'offset' is filled in for each page.
params = {}

# Captures the inner HTML of every <dd>...</dd> element.
dd_pattern = re.compile(r'<dd>(.*?)</dd>', re.S)
# Captures the rank.
rank_pattern = re.compile(r'<i class="board-index board-index-\d+">(.*?)</i>', re.S)
# Looser alternative: rank_pattern = re.compile(r'<i .*?">(.*?)</i>', re.S)
# Captures the movie name.
movie_pattern = re.compile(r'<p class="name"><a .*?>(.*?)</a></p>', re.S)
# Captures the leading-actors text.
actor_pattern = re.compile(r'<p class="star">(.*?)</p>', re.S)
# Captures the release-time text.
last_time_pattern = re.compile(r'<p class="releasetime">(.*?)</p>', re.S)
# Captures the integer and fraction halves of the score.
score_pattern = re.compile(r'<i class="integer">(.*?)</i><i class="fraction">(.*?)</i>', re.S)


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or '' if none."""
    found = pattern.search(text)
    return found.group(1) if found else ''


def _strip_label(text, label):
    """Trim surrounding whitespace from *text* and drop a leading *label*.

    ``str.strip(label)`` is deliberately avoided: it treats *label* as a set
    of characters and removes them from both ends, which corrupts values
    that happen to begin or end with one of those characters.
    """
    text = text.strip()
    if text.startswith(label):
        text = text[len(label):]
    return text


def extract_movie(dd):
    """Parse the inner HTML of one ``<dd>`` element into a record dict.

    Args:
        dd: HTML text captured between ``<dd>`` and ``</dd>``.

    Returns:
        A dict with the keys '排名', '影片', '主演', '上映时间' and '评分'
        (inserted in that order). A field whose pattern does not match is
        stored as an empty string instead of raising IndexError.
    """
    score_match = score_pattern.search(dd)
    # The score is rendered as two <i> elements (integer + fraction).
    score_full = ''.join(score_match.groups()) if score_match else ''

    dic = {}
    dic['排名'] = _first_group(rank_pattern, dd)
    dic['影片'] = _first_group(movie_pattern, dd)
    dic['主演'] = _strip_label(_first_group(actor_pattern, dd), '主演:')
    dic['上映时间'] = _strip_label(_first_group(last_time_pattern, dd), '上映时间:')
    dic['评分'] = score_full
    return dic


def save_top100(path='TOP100榜单.txt', pages=10):
    """Fetch the board page by page and append one record per line to *path*.

    Args:
        path: Output text file, opened in append mode with UTF-8 encoding.
        pages: Number of pages to request; page n uses offset (n-1)*10.
    """
    for page in range(1, pages + 1):
        params['offset'] = (page - 1) * 10
        # A timeout keeps a stalled connection from hanging the script forever.
        response = requests.get(url='https://maoyan.com/board/4', headers=headers,
                                params=params, timeout=10)
        records = [extract_movie(dd) for dd in dd_pattern.findall(response.text)]
        if not records:
            continue
        # Open the file once per page rather than once per record.
        with open(path, 'a', encoding='utf-8') as fp:
            fp.writelines(str(record) + '\n' for record in records)


if __name__ == '__main__':
    save_top100()
# 法二:
# 导入requests、re
import requests,re
# Request headers: every request is sent with a desktop-browser User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
# Page 1 URL: https://maoyan.com/board/4
# Page 2 URL: https://maoyan.com/board/4?offset=10
# Page 3 URL: https://maoyan.com/board/4?offset=20
# Base URL shared by all pages.
base_url = 'https://maoyan.com/board/4'
# Query-string parameters; 'offset' is filled in for each page.
params = {
}

# All patterns are compiled once here instead of on every page iteration.
# (The 'parttern' spelling of some names is kept so existing references
# to them still resolve.)
# Captures the inner HTML of the <dl> that holds the <dd> entries.
dl_pattern = re.compile(r'<dl class="board-wrapper">(.*?)</dl>', re.S)
# Captures the inner HTML of each <dd> entry.
dd_pattern = re.compile(r'<dd.*?>(.*?)</dd>', re.S)
# Captures the rank (content of the first <i...> element).
ranking_pattern = re.compile(r'<i.*?>(.*?)</i>', re.S)
# Captures the title.
title_pattern = re.compile(r'<p class="name"><a.*? data-act="boarditem-click" .*?>(.*?)</a></p>')
# Captures the leading-actors text.
stars_parttern = re.compile(r'<p class="star">(.*?)</p>', re.S)
# Captures the release-time text.
time_parttern = re.compile(r'<p class="releasetime">(.*?)</p>', re.S)
# Capture the integer and fraction halves of the score.
score1_parttern = re.compile(r'<i class="integer">(.*?)</i>', re.S)
score2_parttern = re.compile(r'<i class="fraction">(.*?)</i>', re.S)


def _first_or_empty(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or '' if none."""
    found = pattern.search(text)
    return found.group(1) if found else ''


def extract_board_entries(html):
    """Parse one board page into a list of record dicts.

    Args:
        html: Full HTML text of a board page.

    Returns:
        One dict per ``<dd>`` entry with the keys '排名', '片名', '主演',
        '上映时间' and '评分' (inserted in that order). An empty list is
        returned when the page has no ``<dl class="board-wrapper">`` element,
        instead of raising IndexError. A field whose pattern does not match
        is stored as an empty string.
    """
    board = dl_pattern.search(html)
    if board is None:
        return []

    entries = []
    for dd in dd_pattern.findall(board.group(1)):
        dic = {}
        dic['排名'] = _first_or_empty(ranking_pattern, dd).strip('\n')
        dic['片名'] = _first_or_empty(title_pattern, dd).strip('\n')
        # Only newlines and spaces are trimmed; the label text is kept as-is.
        dic['主演'] = _first_or_empty(stars_parttern, dd).strip('\n' + ' ')
        dic['上映时间'] = _first_or_empty(time_parttern, dd)
        # The score is rendered as two <i> elements (integer + fraction).
        dic['评分'] = _first_or_empty(score1_parttern, dd) + _first_or_empty(score2_parttern, dd)
        entries.append(dic)
    return entries


def save_board(path='TOP100.txt', pages=10):
    """Fetch the board page by page and append one record per line to *path*.

    Args:
        path: Output text file, opened in append mode with UTF-8 encoding.
        pages: Number of pages to request; page n uses offset (n-1)*10.
    """
    for page in range(1, pages + 1):
        params['offset'] = (page - 1) * 10
        # A timeout keeps a stalled connection from hanging the script forever.
        response = requests.get(url=base_url, params=params, headers=headers, timeout=10)
        entries = extract_board_entries(response.text)
        if not entries:
            print('No board entries found for offset', params['offset'])
            continue
        # Open the file once per page rather than once per record.
        with open(path, 'a', encoding='utf-8') as fp:
            fp.writelines(str(entry) + '\n' for entry in entries)


if __name__ == '__main__':
    save_board()