Web Scraping Summary

A web scraper simply fetches web pages and then extracts the information you need from them.

Ways to fetch pages: urllib, requests, etc.

Ways to extract information from pages: BeautifulSoup, XPath, etc.
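
As a minimal sketch of the two steps, fetching with requests and extracting with an XPath (the URL and the XPath here are hypothetical placeholders):

import requests
from lxml import etree

headers = {"User-Agent": "Mozilla/5.0"}  # pretend to be a browser
response = requests.get("https://example.com/page", headers=headers)  # hypothetical URL

html = etree.HTML(response.text)
titles = html.xpath("//h1/text()")  # hypothetical XPath; adapt it to the target page
print(titles)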

Problems you may run into include, but are not limited to:

  • Requests get rejected (403). Send the headers a normal browser would send to disguise yourself as one; sometimes a Referer is also needed (see the sketch after this list).
  • The page requires login. Send the account credentials, or carry the cookies or session obtained after logging in.
  • Information generated dynamically by JS is not in the page source. Use Selenium with a browser engine to visit the page, at the cost of some efficiency.
  • The connection keeps getting dropped. Reduce the number of concurrent connections and add a delay between consecutive requests.
  • The Elements panel differs from the fetched page source. Capture traffic in the browser and inspect the raw document response.
  • Too much code and you cannot locate the data (non-static pages). Look through the script blocks and the members of the window object.
  • The page contains no video URL. Capture network traffic and sort by size; the large requests whose size keeps growing are usually the video URLs.
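
A minimal sketch of the header and cookie tricks above (the URL, login endpoint, and credentials are hypothetical placeholders):

import time
import requests

headers = {
    # Copy the User-Agent of a real browser; some sites also check the Referer.
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)",
    "Referer": "https://example.com",
}

session = requests.Session()
# Log in once; the Session object keeps the returned cookies for later requests.
session.post("https://example.com/login",  # hypothetical login endpoint
             data={"user": "name", "password": "pass"}, headers=headers)

for url in ["https://example.com/a", "https://example.com/b"]:
    response = session.get(url, headers=headers)
    print(response.status_code)
    time.sleep(0.5)  # small delay between requests to avoid being disconnected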

Other things to note:

  • Older videos may be unsegmented and in flv or similar formats, while newer videos may use a custom format and be split into segments, so test each video type separately to rule out special cases.

Suggestions:

  • After downloading multimedia content, post-process it with ffmpeg or moviepy: convert it to mp4 and merge the segments for easier use (see the ffmpeg sketch after this list).
  • XPath is recommended for extraction, because once you locate an element in Chrome DevTools there is a Copy XPath option.
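
As a minimal sketch of the ffmpeg post-processing step (filenames are hypothetical), remuxing an flv into an mp4 container without re-encoding:

import subprocess

# -c copy copies the streams as-is, so the conversion is fast and lossless.
subprocess.run(['ffmpeg', '-i', 'input.flv', '-c', 'copy', 'output.mp4'], check=True)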


Below is the crawler code for Bilibili (B站).


Crawling bangumi (anime series)

# coding=utf-8
import requests
from lxml import etree
import json
import threading
from queue import Queue
import os
import sys
from urllib import parse
import time

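# Disguise the request as a normal browser; bare requests tend to get a 403.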
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Referer": "https://www.bilibili.com",
}

keyword = '魔笛MAGI'

response = requests.get(
    "https://search.bilibili.com/all?keyword=" + parse.quote(keyword),
    headers=headers,
    verify=False)

html = etree.HTML(response.text)

# The search results are embedded as JSON in the first <script> (the window.__INITIAL_STATE__ object).
playinfo = html.xpath("/html/body/script[1]/text()")[0]
js = playinfo[playinfo.find('{'):playinfo.find(';(function()')]

jso = json.loads(js)

season_list = jso["flow"]["getMixinFlowList-jump-keyword-" + keyword]["result"][3]["data"]

# Flatten every season's episodes into one list, tagging each with its season titles.
bangumi_list = []

for season in season_list:
    for e in season['eps']:
        e['org_title'] = season['org_title']
        e['title'] = season['title']
        bangumi_list.append(e)



urls = []
url_queue = Queue()

for i in bangumi_list:
    ep_url = i['url']

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
        "Origin": "https://www.bilibili.com",
        "Referer": ep_url,
    }


    response = requests.get(ep_url, headers=headers, verify=False)


    print(response.status_code)

    time.sleep(0.1)

    html = etree.HTML(response.text)
    # The play-URL JSON sits in the fourth <script> of the episode page.
    playinfo = html.xpath("/html/body/script[4]/text()")[0]

    js = playinfo[playinfo.find('{'):]

    try:
        jso = json.loads(js)
    except Exception as e:
        print(i)
        print(js)
        print(e)
        sys.exit()

    # 'durl' lists the episode's flv segment URLs.
    for j in jso['data']['durl']:

        ep_dict = dict()
        ep_dict['index_title'] = i['index_title']
        ep_dict['long_title'] = i['long_title']
        ep_dict['org_title'] = i['org_title']
        # Strip the <em> highlight markup the search page injects into titles.
        ep_dict['title'] = i['title'].replace('<em class="keyword">', '').replace('</em>', '')

        ep_dict['url'] = j['url']

        urls.append(ep_dict)
        url_queue.put(ep_dict)


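# Bounded queue: caps the number of simultaneously running download threads at 30.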
temp_queue = Queue(maxsize=30)


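# Worker: save one segment as <title>/<id>-<index>-<long_title>.flv,
# then free a slot in temp_queue.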
def download(e):

    global temp_queue

    response = requests.get(e['url'], headers=headers, verify=False)
    print(response.status_code)

    path = os.path.join(os.getcwd(), e['title'])

    if not os.path.exists(path):
        os.makedirs(path)

    # Use the CDN filename (between the last '/' and the '?') as a unique id.
    url = e['url']
    t_id = url[url.rfind('/') + 1: url.find('?')]

    with open(e['title'] + '/' + t_id + '-' + e['index_title'] + '-' + e['long_title'] + ".flv", "wb") as f:
        f.write(response.content)

    temp_queue.get()
    temp_queue.task_done()


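# Spawn one thread per segment URL; temp_queue.put blocks once 30 are in flight.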
for i in range(len(urls)):
    urll=url_queue.get()
    temp_queue.put(urll)
    t_url = threading.Thread(target=download, args=(urll,))
    t_url.start()
    url_queue.task_done()

temp_queue.join()
url_queue.join()
print('finish download')


Crawling instructional videos

# coding=utf-8
import requests
from lxml import etree
import json
import threading
from queue import Queue
import os
import sys
import time

# Visit the video page and collect the URL of every part (episode).

page_url = 'https://www.bilibili.com/video/av29600072'

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Referer": "https://www.bilibili.com"
}

response = requests.get(
    page_url,
    headers=headers,
    verify=False)

html = etree.HTML(response.text)

playinfo = html.xpath("/html/head/script[4]/text()")[0]

js = playinfo[playinfo.find('{'):playinfo.find(';(function()')]

jso = json.loads(js)

bangumi_list = jso['videoData']['pages']

for i in bangumi_list:
    i['title'] = jso['videoData']['title']

# Visit each part's URL and collect its segment/stream video URLs.

urls = []
url_queue = Queue()

for i in bangumi_list:

    ep_url = page_url + '?p=' + str(i['page'])

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
        "Origin": "https://www.bilibili.com",
        "Referer": "https://www.bilibili.com",
    }

    response = requests.get(ep_url, headers=headers, verify=False)

    print(response.status_code)

    time.sleep(0.1)

    # print(response.text)
    html = etree.HTML(response.text)
    playinfo = html.xpath("/html/head/script[3]/text()")[0]
    js = playinfo[playinfo.find('{'):]

    try:
        jso = json.loads(js)
    except Exception as e:
        print(i)
        print(js)
        print(e)
        sys.exit()

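    # Newer videos use DASH ('dash' key): separate video and audio m4s streams.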
    if 'dash' in jso['data']:

        try:
            j = jso['data']['dash']
        except Exception as e:
            print(i['page'])
            print(jso['data'])
            sys.exit()

        ep_dict = dict()

        ep_dict['flv_m4s'] = 'm4s'
        ep_dict['index_title'] = str(i['page'])
        ep_dict['long_title'] = i['part']
        ep_dict['title'] = i['title']

        ep_dict['video_url'] = j['video'][0]['baseUrl']
        ep_dict['audio_url'] = j['audio'][0]['baseUrl']

        urls.append(ep_dict)
        url_queue.put(ep_dict)

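    # Older videos expose plain flv segments under the 'durl' key.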
    if 'durl' in jso['data']:

        for j in jso['data']['durl']:
            ep_dict = dict()

            ep_dict['flv_m4s'] = 'flv'
            ep_dict['index_title'] = str(i['page'])
            ep_dict['long_title'] = i['part']
            ep_dict['title'] = i['title']

            ep_dict['url'] = j['url']

            urls.append(ep_dict)
            url_queue.put(ep_dict)

temp_queue = Queue(maxsize=30)

def download(e):

    global temp_queue

    try:
        if e['flv_m4s'] == 'm4s':
            response = requests.get(e['video_url'], headers=headers, verify=False)
            # print(response.status_code)
            path = os.path.join(os.getcwd(), e['title'])

            if not os.path.exists(path):
                os.makedirs(path)

            url = e['video_url']
            t_id = url[url.rfind('/') + 1: url.find('?')]
            with open(e['title'] + '/' + t_id + '-' + '(video)-' + e['index_title'] + '-' + e['long_title'] + ".m4s",
                      "wb") as f:
                f.write(response.content)

            response = requests.get(e['audio_url'], headers=headers, verify=False)

            url = e['audio_url']
            t_id = url[url.rfind('/') + 1: url.find('?')]
            with open(e['title'] + '/' + t_id + '-' + '(audio)-' + e['index_title'] + '-' + e['long_title'] + ".m4s",
                      "wb") as f:
                f.write(response.content)

            temp_queue.get()
            temp_queue.task_done()

        if e['flv_m4s'] == 'flv':
            response = requests.get(e['url'], headers=headers, verify=False)

            path = os.path.join(os.getcwd(), e['title'])

            if not os.path.exists(path):
                os.makedirs(path)

            url = e['url']
            t_id = url[url.rfind('/') + 1: url.find('?')]

            with open(e['title'] + '/' + t_id + '-' + e['index_title'] + '-' + e['long_title'] + ".flv", "wb") as f:
                f.write(response.content)

            temp_queue.get()
            temp_queue.task_done()

    except Exception as eee:
        print(eee)
        print(e)


for i in range(len(urls)):
    urll = url_queue.get()
    temp_queue.put(urll)
    t_url = threading.Thread(target=download, args=(urll,))
    t_url.start()
    url_queue.task_done()

temp_queue.join()
url_queue.join()
print('finish download')

Video transcoding, segment merging, and audio/video muxing

# coding=utf-8

# Combine the downloaded video clips.
from moviepy.editor import VideoFileClip, concatenate_videoclips
import os

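# Merge the flv segments of each episode in `path` into one mp4 under path/output.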
def combine_flv(path):

    cwd=os.getcwd()
    os.chdir(path)
    flvlist = [i for i in os.listdir(path) if i[-3:] == 'flv']

    num = []
    for i in flvlist:
        num.append(i.split('-')[0])

    num = set(num)

    for i in num:
        # Files sharing the same id prefix are segments of one episode.
        temp_name = [j for j in flvlist if j.split('-')[0] == i]
        temp = [VideoFileClip(j) for j in temp_name]
        final_clip = concatenate_videoclips(temp)

        if not os.path.exists('output'):
            os.makedirs('output')

        final_clip.write_videofile('output/'+'-'.join(temp_name[0].split('-')[3:]).replace('flv','mp4'))

    os.chdir(cwd)

# combine_flv(r'E:\presentation\flexible_control\spider_practice\魔笛MAGI')
# combine_flv(r'E:\presentation\flexible_control\spider_practice\魔笛MAGI 第二季')
# combine_flv(r'E:\presentation\flexible_control\spider_practice\魔笛MAGI 辛巴达的冒险')

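# Like combine_flv, but additionally muxes DASH m4s audio/video pairs via ffmpeg.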
def combine_m4s_flv(path):
    cwd = os.getcwd()
    os.chdir(path)

    flvlist = [i for i in os.listdir(path) if i[-3:] == 'flv']
    num = []
    for i in flvlist:
        num.append(i.split('-')[0])
    num = set(num)
    for i in num:
        temp_name = [j for j in flvlist if j.split('-')[0] == i]
        temp = [VideoFileClip(j) for j in temp_name]
        final_clip = concatenate_videoclips(temp)
        if not os.path.exists('output'):
            os.makedirs('output')
        final_clip.write_videofile('output/' + '-'.join(temp_name[0].split('-')[3:]).replace('flv', 'mp4'))

    os.chdir(cwd)

    m4s_list = [i for i in os.listdir(path) if i[-3:] == 'm4s']
    num = []
    for i in m4s_list:
        num.append(i.split('-')[0])
    num = set(num)
    for i in num:
        pair = [j for j in m4s_list if j.split('-')[0] == i]
        # Pick the audio and video halves by the markers written into the filenames,
        # rather than relying on directory-listing order.
        audio = os.path.join(path, next(j for j in pair if '(audio)' in j))
        video = os.path.join(path, next(j for j in pair if '(video)' in j))

        out_dir = os.path.join(path, 'output')
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        outputvideo = os.path.join(out_dir, '-'.join(pair[0].split('-')[4:]).replace('m4s', 'mp4'))

        # Mux the two streams into an mp4; -c:v copy avoids re-encoding the video.
        cmd = 'ffmpeg -i "{}" -i "{}" -c:v copy -strict experimental "{}"'.format(video, audio, outputvideo)
        os.system(cmd)

combine_m4s_flv(r'E:\presentation\flexible_control\spider_practice\OpenCV基础课程')
