猫眼电影

爬取目的

进行re模块的测试,熟悉re模块

爬取代码

import requests
import re

'''
猫眼的反爬机制比较严格:请求次数过多就会被要求重新验证。
遇到这种情况可以更换请求头,或者带上 cookie 重新请求页面数据,不过也要注意 cookie 的过期时间。
更重要的是熟练掌握 re 库的基本使用方法并多写多练,这样才能用更少的请求次数写对正则,避免多次请求页面数据,提高我们的编程效率!

'''

# 火狐请求头 Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0
# 谷歌请求头 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36


def main():
    """Fetch the Maoyan board page and append one record per movie to ``movies``.

    The page is downloaded with a browser-like User-Agent and a captured
    session cookie, split into ``<dd>...</dd>`` fragments, and each fragment is
    searched for the movie title, the cast line and the release date.

    Each record is a dict with the keys '电影名', '演员' and '上映日期'; every
    value is the list of matches found in that fragment (an empty list when
    nothing matched). Records are appended to the module-level ``movies`` list,
    which must exist before this function is called.
    """
    request_url = 'https://maoyan.com/board/1'
    # The cookie was captured from a browser session; it expires and has to be
    # refreshed when the site starts asking for verification again.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
        'Cookie': '__mta=213629483.1586196819456.1586220633990.1586221098007.15; _lxsdk_cuid=17150b2880dc8-0721b87c08fdb6-2393f61-144000-17150b2880ec8; uuid_n_v=v1; uuid=562DEE10783211EA9B1571464B4C3D1FB91D18E6FC164A64A66868EACD0A713E; _lxsdk=562DEE10783211EA9B1571464B4C3D1FB91D18E6FC164A64A66868EACD0A713E; mojo-uuid=8546698e3e630d3dd44a1969d6edd2ab; __mta=213629483.1586196819456.1586196893444.1586196899032.4; _csrf=e965fc7322974b2ca300d935e179e53868282495e1b200a2950c0b271f3b2217; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1586196818,1586218772; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; mojo-session-id={"id":"fc00610630f0c80755fa3ac2d2eac323","time":1586218772414}; mojo-trace-id=8; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1586221098; _lxsdk_s=171521e0bc0-cff-1e-022%7C%7C5'}
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(request_url, headers=headers, timeout=10).text

    # Compile every pattern once, outside the per-movie loop.
    dd_pattern = re.compile(r'<dd>.*?</dd>', re.S)
    # The closing quote of the attribute is matched outside the group so the
    # captured title does not end with a stray '"'.
    title_pattern = re.compile(r'title="([^"]*)"\s+class="image-link"')
    actor_pattern = re.compile(r'<p class="star">(.*?)</p>', re.S)
    date_pattern = re.compile(r'<p class="releasetime">(.*?)</p>', re.S)

    for dd in dd_pattern.findall(response):
        movies.append({
            '电影名': title_pattern.findall(dd),
            # Strip any whitespace padding the markup puts around the text.
            '演员': [text.strip() for text in actor_pattern.findall(dd)],
            '上映日期': [text.strip() for text in date_pattern.findall(dd)],
        })


def display():
    """Print every collected movie record, one raw dict per line."""
    # `movies` is the module-level list created in the script entry point.
    for record in movies:
        print(record)


if __name__ == '__main__':
    # main() appends to and display() reads from this module-level list,
    # so it has to be created before either of them runs.
    movies = list()
    main()
    display()

代码测试截图

'''
未进行整理的结果如下图
'''

在这里插入图片描述

总结

'''多练多用,在编写代码的过程中成长,积累经验,在过程中学习,光看是学不好的。'''
更新后的代码(主要是更新了 display() 的输出格式)
import requests
import re


# 火狐请求头 Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0
# 谷歌请求头 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36


def main():
    """Fetch the Maoyan board page and append one record per movie to ``movies``.

    The page is downloaded with a browser-like User-Agent and a captured
    session cookie, split into ``<dd>...</dd>`` fragments, and each fragment is
    searched for the movie title, the cast line and the release date.

    Each record is a dict with the keys '电影名', '演员' and '上映日期'; every
    value is the list of matches found in that fragment (an empty list when
    nothing matched). Records are appended to the module-level ``movies`` list,
    which must exist before this function is called.
    """
    request_url = 'https://maoyan.com/board/1'
    # The cookie was captured from a browser session; it expires and has to be
    # refreshed when the site starts asking for verification again.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
        'Cookie': '__mta=213629483.1586196819456.1586220633990.1586221098007.15; _lxsdk_cuid=17150b2880dc8-0721b87c08fdb6-2393f61-144000-17150b2880ec8; uuid_n_v=v1; uuid=562DEE10783211EA9B1571464B4C3D1FB91D18E6FC164A64A66868EACD0A713E; _lxsdk=562DEE10783211EA9B1571464B4C3D1FB91D18E6FC164A64A66868EACD0A713E; mojo-uuid=8546698e3e630d3dd44a1969d6edd2ab; __mta=213629483.1586196819456.1586196893444.1586196899032.4; _csrf=e965fc7322974b2ca300d935e179e53868282495e1b200a2950c0b271f3b2217; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1586196818,1586218772; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; mojo-session-id={"id":"fc00610630f0c80755fa3ac2d2eac323","time":1586218772414}; mojo-trace-id=8; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1586221098; _lxsdk_s=171521e0bc0-cff-1e-022%7C%7C5'}
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(request_url, headers=headers, timeout=10).text

    # Compile every pattern once, outside the per-movie loop.
    dd_pattern = re.compile(r'<dd>.*?</dd>', re.S)
    # The closing quote of the attribute is matched outside the group so the
    # captured title does not end with a stray '"'.
    title_pattern = re.compile(r'title="([^"]*)"\s+class="image-link"')
    actor_pattern = re.compile(r'<p class="star">(.*?)</p>', re.S)
    date_pattern = re.compile(r'<p class="releasetime">(.*?)</p>', re.S)

    for dd in dd_pattern.findall(response):
        movies.append({
            '电影名': title_pattern.findall(dd),
            # Strip any whitespace padding the markup puts around the text.
            '演员': [text.strip() for text in actor_pattern.findall(dd)],
            '上映日期': [text.strip() for text in date_pattern.findall(dd)],
        })


def display():
    """Print each collected movie as three labelled lines plus a separator.

    Reads the module-level ``movies`` list; every record is expected to carry
    the keys '电影名', '演员' and '上映日期'.
    """
    # (printed label, record key) pairs, in output order.
    labelled_fields = (
        ('电影名:', '电影名'),
        ('演员表:', '演员'),
        ('上映日期:', '上映日期'),
    )
    separator = '*' * 40
    for record in movies:
        for label, key in labelled_fields:
            print(label + str(record[key]))
        print(separator)


if __name__ == '__main__':
    # main() appends to and display() reads from this module-level list,
    # so it has to be created before either of them runs.
    movies = list()
    main()
    display()

在这里插入图片描述

#其中的一部电影在猫眼上的演员表显示为空
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值