Scraping movie information with Python

I. Analyzing the page

1. URL analysis

Open the target site and click through a few pages at random: the URL in the address bar never changes, so the movie list must be loaded via Ajax.

2. Capturing the requests

Grab the request URLs for pages 1, 2 and 10 and compare them; the difference between them (shown in a screenshot that is not reproduced here) comes down to the page parameter in the query string, as the sketch below illustrates.
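
A minimal sketch of how each page's URL can be built, reusing the query string of the request captured below (page=0 is the first page; the timestamp and version values are simply the ones from this capture session):

# The captured Ajax URLs differ only in the page parameter.
base = ("http://movie.mtime.com/boxoffice/?year=2018&area=china&type=MovieRankingYear"
        "&category=all&page=%s&display=list&timestamp=1587296897132"
        "&version=07bb781100018dd58eafc3b35d42686804c6df8d&dataType=json")
print(base % 0)   # page 1
print(base % 1)   # page 2
print(base % 9)   # page 10
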
Taking the first page's URL and requesting it the way I had before, the old approach no longer works: the site recognizes the request as a crawler. So this time the request is disguised differently, by sending the browser's Cookie along with the User-Agent.

import requests, csv, time, re
from lxml import etree

headers = {
    "Cookie":"_userCode_=20204191919289838; _userIdentity_=20204191919289688; userId=0; defaultCity=%25E5%258C%2597%25E4%25BA%25AC%257C290; _ydclearance=c3b9f4b5ce5dceb4c9cf7b4f-f207-47e8-963f-001d10447dd5-1587302351; _tt_=3766C4565B2918A857DEB17E9B43D23D; Hm_lvt_6dd1e3b818c756974fb222f0eae5512e=1587295170; __utma=221034756.1308725070.1587295172.1587295172.1587295172.1; __utmc=221034756; __utmz=221034756.1587295172.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); Hm_lpvt_6dd1e3b818c756974fb222f0eae5512e=1587295831; __utmb=221034756.4.10.1587295172",
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
}
url = "http://movie.mtime.com/boxoffice/?year=2018&area=china&type=MovieRankingYear&category=all&page=0&display=list&timestamp=1587296897132&version=07bb781100018dd58eafc3b35d42686804c6df8d&dataType=json"
requests.get(url=url, headers=headers)

With those headers set, the request goes through. Next, assign the return value of requests.get to a variable and print it out.
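
Here is a minimal sketch of that step; the variable name response is the one the parsing code below relies on:

response = requests.get(url=url, headers=headers)  # keep the Response object for later parsing
print(response.text)                               # raw JSON string returned by the Ajax endpoint
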
Copying the printed result into a JSON viewer shows that only the content under the "html" key is actually needed. Once the JSON comes back in that form, we can parse the HTML fragment and extract the information we want.
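
The same inspection can also be done directly in Python rather than in an external JSON viewer (a small sketch; the "html" key is the part the rest of the code relies on):

data = response.json()      # parse the JSON body into a dict
print(list(data.keys()))    # show the top-level keys
print(data["html"][:300])   # peek at the rendered HTML fragment we are about to parse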

# Parse the page
from lxml import etree  # import the parsing library
html_etree = etree.HTML(response.json()["html"])  # build an element tree we can sift through with XPath

II. Extracting the information

# Extract the information
dd = html_etree.xpath('//div[@class="boxofficelist"]/div/dd')
for item in dd:
    rank = item.xpath('./div/div[1]/i/text()')
    director = item.xpath('./div/div[2]//p[3]/a/text()')  # '//' matches descendants at any depth
    actor = item.xpath('./div/div[2]//p[4]/a/text()')
    act = "、".join(actor)   # join the actor names into a single string
    act = [act]              # wrap it back in a list so it can be concatenated below
    score = item.xpath('./div/div[3]/p[2]/text()')
    score = score[0].replace("人评分", "")  # strip the "人评分" suffix
    score = [score]
    day = item.xpath('./div/div[2]/p[1]/strong[1]/text()')  # extracted but not used below

    name = item.xpath('./div/div[2]/h3/a/text()')

    print(rank + name + director + act + score)  # each xpath() call returns a list, so this concatenates them

Output of a run (screenshot not reproduced here): one list per film containing rank, name, director, actors and score.
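
One caveat with the code above: score[0] raises an IndexError whenever an XPath query matches nothing, which is why the full script in section IV wraps the loop body in try/except. A hypothetical helper (not part of the original code) that returns an empty string instead:

def first_or_blank(nodes):
    """Return the first matched text node, or '' if the XPath matched nothing."""
    return nodes[0] if nodes else ""

# e.g. inside the loop, instead of indexing directly:
# score = [first_or_blank(item.xpath('./div/div[3]/p[2]/text()')).replace("人评分", "")]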

III. Writing and saving to CSV

Page 1 of the 2018 mainland China box office:

import requests, csv, time, re
from lxml import etree

headers = {
    "Cookie":"_userCode_=20204191919289838; _userIdentity_=20204191919289688; userId=0; defaultCity=%25E5%258C%2597%25E4%25BA%25AC%257C290; _tt_=3766C4565B2918A857DEB17E9B43D23D; Hm_lvt_6dd1e3b818c756974fb222f0eae5512e=1587295170; __utmc=221034756; __utmz=221034756.1587295172.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); Hm_lpvt_6dd1e3b818c756974fb222f0eae5512e=1587300489; __utma=221034756.1308725070.1587295172.1587295172.1587300490.2; _ydclearance=c6b8d144c079ea65b4d266b0-b63f-4f06-b009-95db29fc8005-1587310834",
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
}
url = "http://movie.mtime.com/boxoffice/?year=2018&area=china&type=MovieRankingYear&category=all&page=0&display=list&timestamp=1587296897132&version=07bb781100018dd58eafc3b35d42686804c6df8d&dataType=json"
# Open the CSV file (append mode) and write the header row
fp = open("./电影信息2018.csv", 'a', newline='', encoding='utf-8-sig')
writer = csv.writer(fp)
writer.writerow(('排名', '名称', '导演', '演员', '评分'))  # rank, name, director, actors, score

response = requests.get(url=url, headers=headers)  # request first, then parse
html_etree = etree.HTML(response.json()["html"])   # element tree built from the "html" fragment
dd = html_etree.xpath('//div[@class="boxofficelist"]/div/dd')
for item in dd:
    rank = item.xpath('./div/div[1]/i/text()')
    director = item.xpath('./div/div[2]//p[3]/a/text()')  # '//' matches descendants at any depth
    actor = item.xpath('./div/div[2]//p[4]/a/text()')
    act = "、".join(actor)   # join the actor names into a single string
    act = [act]
    score = item.xpath('./div/div[3]/p[2]/text()')
    score = score[0].replace("人评分", "")  # strip the "人评分" suffix
    score = [score]
    day = item.xpath('./div/div[2]/p[1]/strong[1]/text()')  # extracted but not written out

    name = item.xpath('./div/div[2]/h3/a/text()')

    result = (rank + name + director + act + score)
    writer.writerow(result)
fp.close()

Output of the run (screenshot not reproduced here).
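
As a design note, the file handling could also use a with block so the CSV file is closed even if the scrape fails midway; a minimal sketch with the same columns as above:

import csv

with open("./电影信息2018.csv", 'a', newline='', encoding='utf-8-sig') as fp:
    writer = csv.writer(fp)
    writer.writerow(('排名', '名称', '导演', '演员', '评分'))
    # ... then writer.writerow(result) for each film, exactly as in the loop above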

IV. Full code

2020 mainland China box office:

import requests, csv, time, re
from lxml import etree

startTime = time.time()  # record the start time

headers = {
    "Cookie":"_userCode_=20204191919289838; _userIdentity_=20204191919289688; userId=0; defaultCity=%25E5%258C%2597%25E4%25BA%25AC%257C290; _tt_=3766C4565B2918A857DEB17E9B43D23D; Hm_lvt_6dd1e3b818c756974fb222f0eae5512e=1587295170; __utmc=221034756; __utmz=221034756.1587295172.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); Hm_lpvt_6dd1e3b818c756974fb222f0eae5512e=1587300489; __utma=221034756.1308725070.1587295172.1587295172.1587300490.2; _ydclearance=c6b8d144c079ea65b4d266b0-b63f-4f06-b009-95db29fc8005-1587310834",
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
}

# Open the CSV file (append mode) and write the header row
fp = open("./电影信息2020.csv", 'a', newline='', encoding='utf-8-sig')
writer = csv.writer(fp)
writer.writerow(('排名', '名称', '导演', '演员', '评分'))  # rank, name, director, actors, score

for page in range(10):
    url = "http://movie.mtime.com/boxoffice/?year=2020&area=china&type=MovieRankingYear&category=all&page=%s&display=list&timestamp=1586629225983&version=07bb781100018dd58eafc3b35d42686804c6df8d&dataType=json" % page
    print("正在爬取第%s页..." % (page + 1))           # "crawling page N..."
    response = requests.get(url=url, headers=headers)  # request first, then parse
    html_etree = etree.HTML(response.json()["html"])   # element tree built from the "html" fragment

    dd = html_etree.xpath('//div[@class="boxofficelist"]/div/dd')
    for item in dd:
        try:
            rank = item.xpath('./div/div[1]/i/text()')
            director = item.xpath('./div/div[2]//p[3]/a/text()')  # '//' matches descendants at any depth
            actor = item.xpath('./div/div[2]//p[4]/a/text()')
            act = "、".join(actor)   # join the actor names into a single string
            act = [act]
            score = item.xpath('./div/div[3]/p[2]/text()')
            score = score[0].replace("人评分", "")  # strip the "人评分" suffix
            score = [score]
            day = item.xpath('./div/div[2]/p[1]/strong[1]/text()')  # extracted but not written out

            name = item.xpath('./div/div[2]/h3/a/text()')

            result = (rank + name + director + act + score)
            print(result)
            writer.writerow(result)
        except:
            print("这里报错了")  # "hit an error here", usually an empty XPath result
            break
fp.close()
endTime = time.time()  # record the end time
useTime = endTime - startTime
print("该次所获的信息一共使用%s秒钟" % useTime)  # "this run took %s seconds in total"

The Jupyter output and a screenshot of the resulting CSV file are not reproduced here.
