爬取目的
进行re模块的测试,熟悉re模块
爬取代码
import requests
import re
'''
猫眼的反爬我算是真的见识到了:请求次数过多就会被要求重新验证。
遇到这种情况,可以更换请求头或者带上cookie来重新请求页面数据,不过也要注意cookie的过期时间。
这也告诉我们一定要熟悉re库的基本使用方法,最重要的还是多写多练:正则写得熟练,调试时就不必反复请求页面数据,编程效率也会更高!
'''
# 火狐请求头 Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0
# 谷歌请求头 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
def main():
    """Fetch the Maoyan board page and append one record per movie to ``movies``.

    Downloads ``https://maoyan.com/board/1``, splits the HTML into ``<dd>``
    entries with regular expressions and, for every entry, appends a dict to
    the module-level ``movies`` list. Each dict maps '电影名', '演员' and
    '上映日期' to the list of strings ``findall`` returned for that field
    (an empty list when the field is not present in the entry).

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out.
    """
    request_url = 'https://maoyan.com/board/1'
    # Browser-like headers; the cookie was copied from a browser session and
    # may expire, in which case it has to be refreshed.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
        'Cookie': '__mta=213629483.1586196819456.1586220633990.1586221098007.15; _lxsdk_cuid=17150b2880dc8-0721b87c08fdb6-2393f61-144000-17150b2880ec8; uuid_n_v=v1; uuid=562DEE10783211EA9B1571464B4C3D1FB91D18E6FC164A64A66868EACD0A713E; _lxsdk=562DEE10783211EA9B1571464B4C3D1FB91D18E6FC164A64A66868EACD0A713E; mojo-uuid=8546698e3e630d3dd44a1969d6edd2ab; __mta=213629483.1586196819456.1586196893444.1586196899032.4; _csrf=e965fc7322974b2ca300d935e179e53868282495e1b200a2950c0b271f3b2217; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1586196818,1586218772; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; mojo-session-id={"id":"fc00610630f0c80755fa3ac2d2eac323","time":1586218772414}; mojo-trace-id=8; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1586221098; _lxsdk_s=171521e0bc0-cff-1e-022%7C%7C5',
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(request_url, headers=headers, timeout=10).text

    # Compile every pattern once, outside the per-movie loop.
    dd_pattern = re.compile(r'<dd>.*?</dd>', re.S)
    # The closing quote after the group keeps the stray '"' out of the title.
    title_pattern = re.compile(r'title="(.*?)" class="image-link"', re.S)
    actor_pattern = re.compile(r'<p class="star">(.*?)</p>', re.S)
    date_pattern = re.compile(r'<p class="releasetime">(.*?)</p>', re.S)

    for dd in dd_pattern.findall(response):
        movies.append({
            '电影名': title_pattern.findall(dd),
            '演员': actor_pattern.findall(dd),
            '上映日期': date_pattern.findall(dd),
        })
def display():
    """Print every collected movie record, one dict per line."""
    for record in movies:
        print(record)
if __name__ == '__main__':
    # Module-level result list: main() appends to it, display() reads it.
    movies = list()
    main()
    display()
代码测试截图
'''
未进行整理的结果如下图
'''
总结
'''多练多用,在编写代码的过程中成长,积累经验,在过程中学习,光看是学不好的。'''
补
更新后的代码(主要更新了 display() 函数)
import requests
import re
# 火狐请求头 Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0
# 谷歌请求头 Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
def main():
    """Fetch the Maoyan board page and append one record per movie to ``movies``.

    Downloads ``https://maoyan.com/board/1``, splits the HTML into ``<dd>``
    entries with regular expressions and, for every entry, appends a dict to
    the module-level ``movies`` list. Each dict maps '电影名', '演员' and
    '上映日期' to the list of strings ``findall`` returned for that field
    (an empty list when the field is not present in the entry).

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out.
    """
    request_url = 'https://maoyan.com/board/1'
    # Browser-like headers; the cookie was copied from a browser session and
    # may expire, in which case it has to be refreshed.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
        'Cookie': '__mta=213629483.1586196819456.1586220633990.1586221098007.15; _lxsdk_cuid=17150b2880dc8-0721b87c08fdb6-2393f61-144000-17150b2880ec8; uuid_n_v=v1; uuid=562DEE10783211EA9B1571464B4C3D1FB91D18E6FC164A64A66868EACD0A713E; _lxsdk=562DEE10783211EA9B1571464B4C3D1FB91D18E6FC164A64A66868EACD0A713E; mojo-uuid=8546698e3e630d3dd44a1969d6edd2ab; __mta=213629483.1586196819456.1586196893444.1586196899032.4; _csrf=e965fc7322974b2ca300d935e179e53868282495e1b200a2950c0b271f3b2217; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1586196818,1586218772; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; mojo-session-id={"id":"fc00610630f0c80755fa3ac2d2eac323","time":1586218772414}; mojo-trace-id=8; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1586221098; _lxsdk_s=171521e0bc0-cff-1e-022%7C%7C5',
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(request_url, headers=headers, timeout=10).text

    # Compile every pattern once, outside the per-movie loop.
    dd_pattern = re.compile(r'<dd>.*?</dd>', re.S)
    # The closing quote after the group keeps the stray '"' out of the title.
    title_pattern = re.compile(r'title="(.*?)" class="image-link"', re.S)
    actor_pattern = re.compile(r'<p class="star">(.*?)</p>', re.S)
    date_pattern = re.compile(r'<p class="releasetime">(.*?)</p>', re.S)

    for dd in dd_pattern.findall(response):
        movies.append({
            '电影名': title_pattern.findall(dd),
            '演员': actor_pattern.findall(dd),
            '上映日期': date_pattern.findall(dd),
        })
def display():
    """Print each collected movie as labelled lines followed by a separator.

    For every record in the module-level ``movies`` list this prints the
    title, cast and release-date values (as their ``str()`` form), then a
    row of 40 asterisks.
    """
    # (printed label, key in the record dict), in output order.
    labelled_keys = (
        ('电影名:', '电影名'),
        ('演员表:', '演员'),
        ('上映日期:', '上映日期'),
    )
    separator = '*' * 40
    for record in movies:
        for label, key in labelled_keys:
            print(label + str(record[key]))
        print(separator)
if __name__ == '__main__':
    # Module-level result list: main() appends to it, display() reads it.
    movies = list()
    main()
    display()
# 注:其中有一部电影在猫眼页面上的演员表本身就显示为空,因此输出中该电影的演员一项为空属于正常现象