python爬取网站视频下载链接

loveqjcd

已于 2023-07-15 15:33:19 修改

阅读量266

点赞数

文章标签： python 开发语言 爬虫

于 2023-07-15 15:05:51 首次发布

本文链接：https://blog.csdn.net/loveqjcd/article/details/131739335

版权

该代码示例展示了如何使用Python进行网络爬虫，从特定网站抓取电影的图片、标题和下载链接，并存储到Excel文件中。通过多线程处理不同页面，提高了爬取效率。同时，为了防止写入Excel时出错导致数据丢失，程序在写入前会备份原始Excel文件。

摘要由CSDN通过智能技术生成

还是那句话，python就是用来搞HS的。

前几天爬取了小说之后，总是意犹未尽，毕竟那网站上那么多资源，也许指不定哪天就被封了，多可惜，所以决定多爬些电影下来。

先说下网站的结构：进去之后首页有分类，电影、小说和其他，电影下面又分成八大菜系。我也想把电影都下载下来，可惜没有那么大的硬盘空间。所以决定，把电影的图片、标题、下载链接保存到excel中去，也许哪天就能用得到呢。

直接上代码：

import requests
import os
from lxml import etree
import sys
sys.path.append("d:\\Python Project\\")
from spider_novel.runjs import aes_decrypt
from spider_novel.util_re import mg_blank, rm_biaodian
import openpyxl
from openpyxl.drawing.image import Image
from openpyxl.drawing.spreadsheet_drawing import AnchorMarker, TwoCellAnchor
from openpyxl.styles import Alignment
import shutil
import threading

# HTTP headers sent with the listing-page and detail-page requests.
# NOTE(review): the Cookie is a hard-coded value captured from a browser
# session — presumably it expires and must be refreshed; confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cookie': '_ga_YF2C1MXZ3R=GS1.1.1688975856.1.0.1688975856.0.0.0; _ga=GA1.1.1280715718.1688975857; Hm_lvt_fc300eff92dc455e5696ee2011a5337c=1688975862; Hm_lpvt_fc300eff92dc455e5696ee2011a5337c=1688976992'
}
# Site root that every relative URL is appended to; '****' is a placeholder
# and must be replaced with the real address before running.
base_url = '****'

# Workbook that receives the scraped rows.
file_path = './spider_torrent/torrent/torrent.xlsx'
# Copy of the workbook taken before each write.
bak_path = './spider_torrent/torrent/torrent_bak.xlsx'
# Folder where cover images are stored temporarily until they are embedded.
demo_img_path = './spider_torrent/torrent/'

def download(type, start, end):
    """Process every listing page in the half-open range [start, end).

    For each page the video records are scraped, their cover images are
    downloaded, and the records are appended to the sheet named `type`.
    An empty or inverted range (end <= start) does nothing.

    Args:
        type: Category identifier; also the name of the target sheet.
        start: First page number to process (inclusive).
        end: Page number to stop at (exclusive).
    """
    # range() is already empty when end <= start, so no explicit guard is needed.
    for page_no in range(start, end):
        print("分类：", type, " 开始下载第 ", page_no, " 页视频链接")
        records = downloadOnePage(type, page_no)
        print("分类: ", type, " 第 ", page_no, " 页视频链接下载完成, 开始下载图片")
        downloadImgs(records)
        print("分类: ", type, " 第 ", page_no, " 页图片下载完成, 开始写入Excel")
        wirteToSheet(type, records, page_no)
        print("分类: ", type, " 第 ", page_no, " 页视频链接写入完成")

# Create the Excel workbook.
def createExcel(file_path, type):
    """Create a new workbook at `file_path` holding one sheet named `type`.

    The sheet gets the four-column header row (image, title, two download
    links) and fixed column widths.

    Args:
        file_path: Destination path of the .xlsx file.
        type: Category identifier used as the sheet name.
    """
    workbook = openpyxl.Workbook()
    # Drop the sheet openpyxl creates by default.
    workbook.remove(workbook["Sheet"])
    # A workbook needs at least one sheet before it can be saved.
    sheet = workbook.create_sheet(type, index=1)

    # (column letter, header caption, column width)
    layout = (
        ('A', '图片', 15),
        ('B', '标题', 35),
        ('C', '下载地址1', 50),
        ('D', '下载地址2', 50),
    )
    for col_no, (letter, caption, width) in enumerate(layout, start=1):
        sheet.cell(row=1, column=col_no, value=caption)
        sheet.column_dimensions[letter].width = width

    workbook.save(file_path)

# Create the category sheet in the existing workbook if it is missing.
def checkExcelSheet(type):
    """Ensure the workbook at `file_path` contains a sheet named `type`.

    When the sheet already exists nothing is written. Otherwise a new sheet
    is added after the existing ones with the header row and column widths,
    and the workbook is saved.

    Args:
        type: Category identifier used as the sheet name.
    """
    workbook = openpyxl.load_workbook(file_path)
    existing = workbook.sheetnames
    if type in existing:
        return

    sheet = workbook.create_sheet(type, index=len(existing) + 1)

    # (column letter, header caption, column width)
    layout = (
        ('A', '图片', 15),
        ('B', '标题', 35),
        ('C', '下载地址1', 50),
        ('D', '下载地址2', 50),
    )
    for col_no, (letter, caption, width) in enumerate(layout, start=1):
        sheet.cell(row=1, column=col_no, value=caption)
        sheet.column_dimensions[letter].width = width

    workbook.save(file_path)

def downloadOnePage(type, pno, timeout=30):
    """Scrape one listing page and return a record for every video on it.

    Each list item's detail page is fetched as well, to read the two
    download links from its input fields.

    Args:
        type: Category identifier inserted into the listing URL.
        pno: Page number of the listing page.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts with the keys 'title', 'img', 'torrent1',
        'torrent2' and 'imgId' ("<page>_<position>", position starting at 1).

    Raises:
        IndexError: If an expected element is missing from a page.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    url = base_url + '/shipin/list-' + type + '-' + str(pno) + '.html'

    # Without a timeout a stalled connection would block this worker thread
    # (and therefore the final join in the main thread) indefinitely.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    data = etree.HTML(response.text)

    shipins = []
    items = data.xpath('//div[@id="tpl-img-content"]/li')
    for i, shipin in enumerate(items, start=1):
        title = shipin.xpath('a/h3/@title')[0]
        href = shipin.xpath('a/@href')[0]
        img = shipin.xpath('a/img/@data-original')[0]
        # NOTE(review): presumably the page serves the title encrypted and
        # aes_decrypt restores the plain text — confirm against spider_novel.runjs.
        title = aes_decrypt(title)

        hRes = requests.get(url=base_url + href, headers=headers, timeout=timeout)
        hData = etree.HTML(hRes.text)
        print("正在获取第 ", pno, " 页第 ", i, " 条视频链接: ", title)
        torrent1 = hData.xpath('//input[@class="form-control input-sm copy_btn"]/@value')[0]
        torrent2 = hData.xpath('//input[@class="form-control input-sm copy_btn app_disable"]/@value')[0]

        shipins.append({
            # NOTE(review): rm_biaodian / mg_blank presumably strip punctuation
            # and collapse whitespace — confirm against spider_novel.util_re.
            'title': mg_blank(rm_biaodian(title)),
            'img': img,
            'torrent1': torrent1,
            'torrent2': torrent2,
            # Unique within one run; used to name the temporary image file.
            'imgId': str(pno) + '_' + str(i),
        })
    return shipins

def downloadImgs(shipins, timeout=30):
    """Download the cover image of every record into the temporary image folder.

    Each image is saved as 'image<imgId>.jpg' under `demo_img_path`.

    Args:
        shipins: Records carrying the keys 'img' (image URL) and 'imgId'.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.exceptions.RequestException: On network failure, timeout,
            or a non-success HTTP status.
    """
    for shipin in shipins:
        img = shipin['img']
        imgId = shipin['imgId']
        print("图片下载: ", img, " => ", imgId)
        # A timeout keeps a stalled image host from hanging the worker thread.
        response = requests.get(img, timeout=timeout)
        # Fail here rather than saving an error page under a .jpg name, which
        # would only surface later when the file is loaded as an image.
        response.raise_for_status()
        img_path = demo_img_path + '/image' + imgId + '.jpg'
        with open(img_path, 'wb') as f:
            f.write(response.content)

# Module-level lock held while a worker thread backs up, loads, modifies and
# saves the shared workbook, so only one thread writes to it at a time.
lock = threading.Lock()

def wirteToSheet(type, shipins, pno):
    """Append one page of records, with their images, to the sheet `type`.

    The workbook is backed up, loaded, extended below its last used row and
    saved, then the temporary image files are deleted. The whole sequence
    runs under the module-level lock so concurrent workers cannot interleave
    their writes.

    Args:
        type: Name of the target sheet.
        shipins: Records with the keys 'title', 'torrent1', 'torrent2', 'imgId'.
        pno: Page number, used for log output only.
    """
    # `with` releases the lock even when loading, image embedding or saving
    # raises; a bare acquire()/release() pair would leave the lock held and
    # block every other worker forever.
    with lock:
        print("第 ", pno, " 页加锁 thread_name = ", threading.current_thread().name)
        print("复制备份torrent文件...")
        bakExcel()
        wb = openpyxl.load_workbook(file_path)
        sheet = wb[type]
        # Cell alignment for the text columns.
        align = Alignment(horizontal='left', vertical='center')
        # Last used row of the sheet; new rows are appended after it.
        i = sheet.max_row
        for shipin in shipins:
            title = shipin['title']
            torrent1 = shipin['torrent1']
            torrent2 = shipin['torrent2']
            imgId = shipin['imgId']

            # Make the row tall enough for the image.
            sheet.row_dimensions[i + 1].height = 100
            img_path = demo_img_path + '/image' + imgId + '.jpg'
            img = Image(img_path)
            # Anchor markers are zero-based (col, colOff, row, rowOff), so
            # row index i is spreadsheet row i + 1; the offsets inset the
            # picture slightly from the cell borders.
            _from = AnchorMarker(0, 50000, i, 50000)
            to = AnchorMarker(1, -50000, i + 1, -50000)
            img.anchor = TwoCellAnchor('twoCell', _from, to)
            sheet.add_image(img)
            sheet.cell(row=i + 1, column=2, value=title).alignment = align
            sheet.cell(row=i + 1, column=3, value=torrent1).alignment = align
            sheet.cell(row=i + 1, column=4, value=torrent2).alignment = align
            i += 1
        wb.save(file_path)
        wb.close()
        print("删除第 ", pno, " 页临时图片文件")
        # The images are embedded in the saved workbook, so the files can go.
        for shipin in shipins:
            imgId = shipin['imgId']
            img_path = demo_img_path + '/image' + imgId + '.jpg'
            os.remove(img_path)
        print("第 ", pno, " 页释放 thread_name = ", threading.current_thread().name)

def bakExcel():
    """Copy the workbook at `file_path` to `bak_path`, overwriting any older backup."""
    source, backup = file_path, bak_path
    shutil.copy(source, backup)

def splitPages(page_start, page_size, thread_total):
    """Split a run of pages into contiguous half-open ranges, one per thread.

    Every range is clamped to the requested end page, so no thread is handed
    a page outside [page_start, page_start + page_size). Threads that would
    receive no pages get no range at all.

    Args:
        page_start: First page number to download.
        page_size: Total number of pages to download.
        thread_total: Maximum number of ranges to produce.

    Returns:
        A list of (start, end) tuples with end exclusive; empty when
        page_size is not positive.

    Raises:
        ValueError: If thread_total is less than 1.
    """
    if thread_total < 1:
        raise ValueError("thread_total must be >= 1")
    page_end = page_start + page_size
    # Pages per thread, rounded up so that all pages are covered.
    chunk = (page_size + thread_total - 1) // thread_total
    ranges = []
    for i in range(thread_total):
        start = page_start + i * chunk
        # Clamp: with rounding up, a middle range could otherwise run past page_end.
        end = min(start + chunk, page_end)
        if start >= end:
            break
        ranges.append((start, end))
    return ranges


if __name__ == '__main__':
    category = input("请输入下载的视频分类: ")
    page_start = int(input("请输入下载的视频起始页: "))
    page_size = int(input("请输入下载的视频总页数: "))
    thread_total = int(input("请输入启动的线程数目: "))

    # Validate and split first, so bad input fails before any file is touched.
    page_ranges = splitPages(page_start, page_size, thread_total)

    # Create the output folder on first run.
    dir_name = os.path.dirname(file_path)
    if not os.path.isdir(dir_name):
        os.makedirs(dir_name)
    # Create the workbook on first run, then make sure the category sheet exists.
    if not os.path.isfile(file_path):
        createExcel(file_path, category)
    checkExcelSheet(category)

    threads = []
    for i, (start, end) in enumerate(page_ranges):
        # One worker thread per page range.
        t = threading.Thread(target=download, args=(category, start, end))
        threads.append(t)
        t.start()
        print("i = ", i, " start = ", start, " end = ", end)

    # Wait for all workers to finish.
    for t in threads:
        t.join()
    print("已完成下载...")

还是一样的，base_url 属于敏感词汇，*号代替。

虽说python的多线程是假的，但是我同时爬取不同页面的时候多线程是实打实的有效的呀。

多线程爬取跟上次不同的是，这次是需要并发操作写入同一个excel，所以才用了加锁处理（不知道为啥把lock当作参数传递会报错，所以直接声明在了外面）。

写入torrent.xlsx之前先做了个备份处理，毕竟万一写入过程中发生了个啥错误，那岂不是前功尽弃了。

效果图：