用selenium爬页面

#coding:utf8

import selenium
from selenium import webdriver
import time
import re
import requests

theurl0 = 'http://tingshen.court.gov.cn/live/12288947'

# Fetch the page through a real browser and keep its source after the fixed
# wait below (a plain HTTP fetch reportedly could not grab this page).
# 1. Create the browser object.
driver0 = webdriver.Firefox()
try:
    # 2. Request the page.
    driver0.get(theurl0)
    # 3. Fixed delay to give the page time to finish loading.
    time.sleep(5)
    thepage0 = driver0.page_source
finally:
    # quit() closes every window and ends the driver session; running it in
    # `finally` keeps a failed get()/page_source from leaving a browser
    # process behind. A separate close() beforehand is not needed.
    driver0.quit()

# Strip the fixed URL prefix; what remains (the id after /live/) is used as
# the base name for the files saved locally.
theurl0_page = theurl0.replace("http://tingshen.court.gov.cn/live/", "")
theurl0_dir = "C://ai2020//page//" + theurl0_page + ".html"

# Write with an explicit UTF-8 encoding: the platform default (e.g. GBK on
# Windows) may be unable to represent characters present in the page source
# and would raise UnicodeEncodeError.
# NOTE: the target directory must already exist.
with open(theurl0_dir, "w", encoding="utf-8") as f:
    f.write(thepage0)

#<iframe id="player" src="http://player.videoincloud.com/vod/7960930?src=gkw&cc=1"  allowfullscreen="true" width="100%" height="100%" frameborder="0" scrolling="no"></iframe>
# Capture everything between the player URL prefix and the closing quote of
# the iframe's src attribute (video id plus query string).
rule0 = r'http://player\.videoincloud\.com/vod/([^"]*)"'
slotList0 = re.findall(rule0, thepage0)
if not slotList0:
    # Fail with a meaningful message instead of a bare IndexError below.
    raise RuntimeError("player iframe URL not found in page " + theurl0)

# page_source is serialized HTML, so '&' inside attribute values is emitted
# as '&amp;'; undo that before using the text as a URL.
theurl1 = "http://player.videoincloud.com/vod/" + slotList0[0].replace("&amp;", "&")
print(theurl1)

# A timeout keeps the script from hanging forever on a stalled server, and
# raise_for_status() stops early on an HTTP error page.
r1 = requests.get(theurl1, timeout=30)
r1.raise_for_status()
thepage1 = r1.text

#flashvars.file = encodeURIComponent("http://222.81.52.222:1126/trials/2020_year/06_month/27_day/C8A32FDA_93B0_F3DA_990A_3A43FD0100A0/69823BC3_A556_DBC0_35FD_80AC37BE1634/4513945E_75B7_1190_CEE2_F5DBBCD53BA3vod.m3u8");
# The player page passes the playlist URL to encodeURIComponent("..."); grab
# the quoted argument.
rule1 = r'encodeURIComponent\(\"([\s\S]*?)\"'
slotList1 = re.findall(rule1, thepage1)
if not slotList1:
    # Fail with a meaningful message instead of a bare IndexError below.
    raise RuntimeError("m3u8 playlist URL not found in the player page")
theurl2 = slotList1[0]
#http://222.81.52.222:1126/trials/2020_year/06_month/27_day/C8A32FDA_93B0_F3DA_990A_3A43FD0100A0/69823BC3_A556_DBC0_35FD_80AC37BE1634/4513945E_75B7_1190_CEE2_F5DBBCD53BA3vod.m3u8

# Base URL = everything up to and including the last '/'. Slicing off the
# trailing file name (rather than str.replace) cannot accidentally remove an
# identical substring earlier in the URL.
theurl_filename = theurl2.split('/')[-1]
theurl_front = theurl2[:len(theurl2) - len(theurl_filename)]

r2 = requests.get(theurl2, timeout=30)
r2.raise_for_status()
thepage2 = r2.text
#print(thepage2)

# Keep a local copy of the playlist; explicit UTF-8 avoids depending on the
# platform default encoding.
theurl2_dir = "C://ai2020//page//" + theurl0_page + ".m3u8"
with open(theurl2_dir, "w", encoding="utf-8") as f:
    f.write(thepage2)

# Walk the playlist text already in memory instead of re-opening the file
# just written (the original re-read it through a handle it never closed).
for line3 in thepage2.splitlines():
    line3 = line3.strip()
    # Only segment entries are of interest; '#' lines are tags/comments even
    # if they happen to mention ".ts".
    if ".ts" not in line3 or line3.startswith("#"):
        continue

    # Playlist entries may be absolute URLs or relative to the playlist.
    if line3.startswith(("http://", "https://")):
        ts_url = line3
    else:
        ts_url = theurl_front + line3

    # Local file name: last path component without any query string, so the
    # result is a valid file name directly inside the output directory.
    theurl3_dir = "C://ai2020//page//" + line3.split("?", 1)[0].split("/")[-1]

    print(ts_url)
    r3 = requests.get(ts_url, timeout=60)
    # Do not save an HTTP error page as if it were video data.
    r3.raise_for_status()
    with open(theurl3_dir, "wb") as code:
        code.write(r3.content)



今天要抓一个页面,居然不能抓?

请教了大神,大神说用selenium,然后就搞定啦!

https://www.selenium.dev/

pip install selenium

Version: 3.141.0

在windows上面,用firefox的时候报错,

发现需要安装 geckodriver:

https://github.com/mozilla/geckodriver/releases

下载 win64 版并解压,把 geckodriver.exe 所在目录加入 PATH 环境变量,就OK啦!

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值