Python 爬取猫眼电影 TOP100(正则)

导入库

import pandas as pd
import re
import time
import requests

请求网页源代码

# Headers sent with every request to maoyan.com.
# NOTE(review): the Cookie below is a hard-coded browser session value; it
# presumably expires and has to be refreshed by hand -- confirm before reuse.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
    'Cookie': '__mta=142325645.1630981333372.1634619282542.1634619295877.33; _lxsdk_cuid=17bbe10210b8f-01fdc7dd16f0e1-5734174f-144000-17bbe10210cc8; uuid_n_v=v1; uuid=78E04CC0300811ECBC330D5DE33F2BD53D9D77A3A30C459E98998D72D0D88200; _lxsdk=78E04CC0300811ECBC330D5DE33F2BD53D9D77A3A30C459E98998D72D0D88200; _csrf=2189585ecfe099d1d1f33d3dfeff9e6bc61a527269fd18b31d8a18afa2b4291f; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1634557351,1634557363,1634557481,1634619252; _lx_utm=utm_source%3Dbing%26utm_medium%3Dorganic; __mta=142325645.1630981333372.1634564791483.1634619253870.29; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1634619296; _lxsdk_s=17c9703d561-6ee-b38-802%7C%7C1',
    'Content-Type': 'text/plain; charset=UTF-8',
    'Origin': 'https://maoyan.com',
    'Referer': 'https://maoyan.com/board/4',
}

# First page of the board (offset=0).
url = "https://maoyan.com/board/4?offset=0"

# requests never times out by default; without an explicit timeout a stalled
# connection would block this script forever.
response = requests.get(url, headers=headers, timeout=10)

# Bare expression: shows the raw HTML when run as a notebook cell.
response.text

单页爬取 

# Single-page extraction: each pattern starts at a <dd> tag and captures one
# field; re.S lets "." match newlines so a pattern can span several lines of
# the HTML.
x1 = re.compile('<dd>.*?<img data-src="(.*?)".*?</a>', re.S)
photo = x1.findall(response.text)
print(photo)    # poster image links

x2 = re.compile('<dd>.*?class="name".*?title="(.*?)".*?</a>', re.S)
name = x2.findall(response.text)
print(name)     # film titles

x3 = re.compile('<dd>.*?<p class="star">.*?主演:(.*?)</p>.*?</div>', re.S)
actor = x3.findall(response.text)
print(actor)       # leading actors

x4 = re.compile('<dd>.*?<p class="releasetime">.*?上映时间:(.*?)</p>.*?</div>', re.S)
# Named release_time rather than time so the imported `time` module is not
# shadowed for the rest of the script.
release_time = x4.findall(response.text)
print(release_time)        # release dates

# The score is rendered as two <i> elements ("integer" and "fraction"
# classes); capture each separately and join them below.
x5 = re.compile('<dd>.*?<p class="score"><i class="integer">(.*?)</i>.*?</div>', re.S)
pingfen1 = x5.findall(response.text)
print(pingfen1)

x6 = re.compile('<dd>.*?<p class="score">.*?<i class="fraction">(.*?)</i>.*?</div>', re.S)
pingfen2 = x6.findall(response.text)
print(pingfen2)

# Concatenate the two captured strings pairwise to form the full score text.
pingfen = [whole + fraction for whole, fraction in zip(pingfen1, pingfen2)]
pingfen    # scores (bare expression: displayed when run as a notebook cell)

 多页爬取

# Multi-page crawl: download the ten board pages (offset 0, 10, ..., 90),
# then extract the fields of every page into one DataFrame named `top`.

# One Session is shared by all requests instead of being rebuilt per page.
session = requests.Session()
server = []
for page in range(10):
    url = "https://maoyan.com/board/4?offset=" + str(page * 10)
    # Explicit timeout so a stalled connection cannot hang the crawl.
    response = session.get(url, headers=headers, timeout=10)
    server.append(response.text)
server

# The patterns are the same for every page, so compile them once up front.
# re.S lets "." match newlines so each pattern can span several HTML lines.
x1 = re.compile('<dd>.*?<img data-src="(.*?)".*?</a>', re.S)
x2 = re.compile('<dd>.*?class="name".*?title="(.*?)".*?</a>', re.S)
x3 = re.compile('<dd>.*?<p class="star">.*?主演:(.*?)</p>.*?</div>', re.S)
x4 = re.compile('<dd>.*?<p class="releasetime">.*?上映时间:(.*?)</p>.*?</div>', re.S)
x5 = re.compile('<dd>.*?<p class="score"><i class="integer">(.*?)</i>.*?</div>', re.S)
x6 = re.compile('<dd>.*?<p class="score">.*?<i class="fraction">(.*?)</i>.*?</div>', re.S)

page_frames = []
for element in server:
    tupian = x1.findall(element)          # poster links
    name = x2.findall(element)            # film titles
    actor = x3.findall(element)           # leading actors
    # Not called `time`, so the imported `time` module stays usable.
    release_time = x4.findall(element)    # release dates
    pingfen1 = x5.findall(element)        # score, "integer" element
    pingfen2 = x6.findall(element)        # score, "fraction" element

    # Join the two captured score strings pairwise.
    pingfen = [whole + fraction for whole, fraction in zip(pingfen1, pingfen2)]

    # Use this page's own poster list (`tupian`). The previous code passed
    # `photo`, the page-1 result left over from the single-page section, so
    # every page was given the wrong posters.
    maoyan = pd.DataFrame({"海报链接": tupian,
                           "电影名称": name,
                           "电影主演": actor,
                           "上映时间": release_time,
                           "电影评分": pingfen})
    page_frames.append(maoyan)

# DataFrame.append was removed in pandas 2.0; concatenate all pages once.
top = pd.concat(page_frames, ignore_index=True)

导出数据

# Export the collected table to the working directory in two formats.
top.to_excel(excel_writer="maoyan.xlsx")  # Excel workbook
top.to_csv(path_or_buf="maoyan.csv")      # CSV file

以下是代码截图 

 

 

 

 

 

 

  • 2
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值