聚焦爬虫案例2:猫眼电影TOP100

需求:
获取猫眼电影TOP100榜中所有电影的信息(排名、电影名、演员、上映时间以及评分)

import requests,re

# Request headers: present a desktop-browser User-Agent to the site.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}

# Pagination scheme of the TOP100 board:
#   1. page 1: https://maoyan.com/board/4
#   2. page 2: https://maoyan.com/board/4?offset=10
#   3. page 3: https://maoyan.com/board/4?offset=20
#   4. page n: https://maoyan.com/board/4?offset=(n-1)*10

# Query-string parameters; 'offset' is filled in per page by scrape_top100().
params = {}

# Rule for the content between <dd> and </dd> (one movie entry each).
dd_pattern = re.compile(r'<dd>(.*?)</dd>', re.S)

# Rule for the rank.
rank_pattern = re.compile(r'<i class="board-index board-index-\d+">(.*?)</i>',re.S)
# Alternative rule: rank_pattern = re.compile(r'<i .*?">(.*?)</i>',re.S)

# Rule for the movie title.
movie_pattern = re.compile(r'<p class="name"><a .*?>(.*?)</a></p>',re.S)

# Rule for the leading actors.
actor_pattern = re.compile(r'<p class="star">(.*?)</p>',re.S)

# Rule for the release time.
last_time_pattern  = re.compile(r'<p class="releasetime">(.*?)</p>',re.S)

# Rule for the score; the integer and fraction parts are separate <i> tags.
score_pattern = re.compile(r'<i class="integer">(.*?)</i><i class="fraction">(.*?)</i>',re.S)


def strip_label(text, label):
    """Return *text* without surrounding whitespace and without a leading label.

    The label is removed only when it is an exact prefix, followed by an
    optional colon. ``str.strip(label)`` is deliberately not used: it treats
    its argument as a set of characters and would also eat characters of the
    actual value that happen to be in the label.

    Args:
        text: Raw text captured from the page.
        label: Prefix to drop, without the colon (e.g. '主演').

    Returns:
        The cleaned value.
    """
    text = text.strip()
    if text.startswith(label):
        text = text[len(label):]
    # Accept both the ASCII colon and the full-width colon (U+FF1A).
    return text.lstrip(':\uff1a').strip()


def parse_movies(html):
    """Extract every movie entry found in one board page.

    Args:
        html: HTML text of a board page.

    Returns:
        A list with one dict per <dd> entry, keyed by '排名', '影片', '主演',
        '上映时间' and '评分'. Empty when the page has no <dd> entries.

    Raises:
        IndexError: If an entry lacks one of the expected fields.
    """
    movies = []
    for dd in dd_pattern.findall(html):
        integer_part, fraction_part = score_pattern.findall(dd)[0]
        movies.append({
            '排名': rank_pattern.findall(dd)[0],
            '影片': movie_pattern.findall(dd)[0],
            '主演': strip_label(actor_pattern.findall(dd)[0], '主演'),
            '上映时间': strip_label(last_time_pattern.findall(dd)[0], '上映时间'),
            '评分': integer_part + fraction_part,
        })
    return movies


def scrape_top100():
    """Fetch all ten board pages and append every movie to 'TOP100榜单.txt'.

    One line (the str() of the movie dict) is written per movie.
    """
    # Open the output once instead of once per record.
    with open('TOP100榜单.txt','a',encoding='utf-8') as fp:
        for i in range(1,11):
            params['offset'] = (i-1)*10
            # Send the request; the timeout keeps a stalled connection from
            # hanging the script forever.
            response = requests.get(url='https://maoyan.com/board/4',headers=headers,params=params,timeout=10)
            # Entries are processed per page, inside the page loop, so that
            # all ten pages are saved and not only the last one.
            for dic in parse_movies(response.text):
                fp.write(str(dic)+'\n')


if __name__ == '__main__':
    scrape_top100()

法二:

# 导入requests、re
import requests,re

# Request headers: present a desktop-browser User-Agent to the site.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}

# Page 1 URL: https://maoyan.com/board/4
# Page 2 URL: https://maoyan.com/board/4?offset=10
# Page 3 URL: https://maoyan.com/board/4?offset=20
# Base URL shared by every page:
base_url = 'https://maoyan.com/board/4'

# Query-string parameters; 'offset' is filled in per page by scrape_top100_v2().
params = {

}

# All rules are compiled once here rather than on every page iteration.
# Rule for the <dl> that holds the <dd> entries.
dl_pattern = re.compile(r'<dl class="board-wrapper">(.*?)</dl>',re.S)
# Rule for each <dd> entry.
dd_pattern = re.compile(r'<dd.*?>(.*?)</dd>',re.S)
# Rule for the rank (the first <i> tag of an entry is used).
ranking_pattern = re.compile(r'<i.*?>(.*?)</i>',re.S)
# Rule for the title.
title_pattern = re.compile(r'<p class="name"><a.*? data-act="boarditem-click" .*?>(.*?)</a></p>')
# Rule for the leading actors.
stars_pattern = re.compile(r'<p class="star">(.*?)</p>',re.S)
# Rule for the release time.
time_pattern = re.compile(r'<p class="releasetime">(.*?)</p>',re.S)
# Rules for the score; integer and fraction parts are matched separately.
score1_pattern = re.compile(r'<i class="integer">(.*?)</i>',re.S)
score2_pattern = re.compile(r'<i class="fraction">(.*?)</i>',re.S)


def parse_board(html):
    """Extract every movie entry from the board list of one page.

    Args:
        html: HTML text of a board page.

    Returns:
        A list with one dict per <dd> entry, keyed by '排名', '片名', '主演',
        '上映时间' and '评分'. Empty when the page contains no
        <dl class="board-wrapper"> element.

    Raises:
        IndexError: If an entry lacks one of the expected fields.
    """
    boards = dl_pattern.findall(html)
    if not boards:
        # No board on this page: yield nothing instead of failing on [0].
        return []

    movies = []
    for dd in dd_pattern.findall(boards[0]):
        score = score1_pattern.findall(dd)[0] + score2_pattern.findall(dd)[0]
        movies.append({
            '排名': ranking_pattern.findall(dd)[0].strip('\n'),
            '片名': title_pattern.findall(dd)[0].strip('\n'),
            # Only newlines and spaces are trimmed; any label text is kept.
            '主演': stars_pattern.findall(dd)[0].strip('\n'+' '),
            '上映时间': time_pattern.findall(dd)[0],
            '评分': score,
        })
    return movies


def scrape_top100_v2():
    """Fetch all ten board pages and append every movie to 'TOP100.txt'.

    One line (the str() of the movie dict) is written per movie.
    """
    # Open the output once instead of once per record.
    with open('TOP100.txt','a',encoding='utf-8') as fp:
        for i in range(1,11):
            params['offset'] = (i-1)*10
            # Send the request; the timeout keeps a stalled connection from
            # hanging the script forever.
            response = requests.get(url=base_url,params=params,headers=headers,timeout=10)
            # Save the data of this page.
            for dic in parse_board(response.text):
                fp.write(str(dic)+'\n')


if __name__ == '__main__':
    scrape_top100_v2()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值