还是那句话,python就是用来搞HS的。
前几天爬取了小说之后,总是意犹未尽,毕竟那网站上那么多资源,也许指不定哪天就被封了,多可惜,所以决定多爬些电影下来。
先说下网站的结构:进去之后首页有分类(电影、小说和其他),电影下面又分成八大菜系。我也想把电影都下载下来,可惜没有那么大的硬盘空间。所以决定把电影的图片、标题和下载链接保存到excel中去,也许哪天就能用得到呢。
直接上代码:
import requests
import os
from lxml import etree
import sys
sys.path.append("d:\\Python Project\\")
from spider_novel.runjs import aes_decrypt
from spider_novel.util_re import mg_blank, rm_biaodian
import openpyxl
from openpyxl.drawing.image import Image
from openpyxl.drawing.spreadsheet_drawing import AnchorMarker, TwoCellAnchor
from openpyxl.styles import Alignment
import shutil
import threading
# HTTP headers sent with the list-page and detail-page requests in downloadOnePage.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
# NOTE(review): hard-coded cookie captured from a browser session; presumably it has to be refreshed once it expires - confirm.
'Cookie': '_ga_YF2C1MXZ3R=GS1.1.1688975856.1.0.1688975856.0.0.0; _ga=GA1.1.1280715718.1688975857; Hm_lvt_fc300eff92dc455e5696ee2011a5337c=1688975862; Hm_lpvt_fc300eff92dc455e5696ee2011a5337c=1688976992'
}
# Root URL of the site; masked with asterisks in this listing, so it must be filled in before running.
base_url = '****'
# Workbook that scraped rows are appended to.
file_path = './spider_torrent/torrent/torrent.xlsx'
# Backup copy of the workbook, written by bakExcel().
bak_path = './spider_torrent/torrent/torrent_bak.xlsx'
# Directory holding the downloaded images until they are embedded in the workbook and deleted.
demo_img_path = './spider_torrent/torrent/'
def download(type, start, end):
    """Process every list page in the half-open range ``[start, end)``.

    For each page the video entries are scraped, their images are fetched,
    and the rows are appended to the workbook sheet named after the category.
    Nothing happens when ``end`` is not greater than ``start``.

    Args:
        type: Category identifier; used in the list-page URL and as sheet name.
        start: First page number to process (inclusive).
        end: Page number to stop at (exclusive).
    """
    page = start
    while page < end:
        print("分类:", type, " 开始下载第 ", page, " 页视频链接")
        videos = downloadOnePage(type, page)

        print("分类: ", type, " 第 ", page, " 页视频链接下载完成, 开始下载图片")
        downloadImgs(videos)

        print("分类: ", type, " 第 ", page, " 页图片下载完成, 开始写入Excel")
        wirteToSheet(type, videos, page)

        print("分类: ", type, " 第 ", page, " 页视频链接写入完成")
        page += 1
def createExcel(file_path, type):
    """Create a new workbook at ``file_path`` holding a single sheet ``type``.

    The sheet receives the four header cells in row 1 and fixed widths for
    columns A-D, then the workbook is saved.

    Args:
        file_path: Destination path of the ``.xlsx`` file.
        type: Title of the sheet to create.
    """
    workbook = openpyxl.Workbook()
    # Drop the sheet that a fresh Workbook contains by default.
    workbook.remove(workbook["Sheet"])
    # A workbook needs at least one sheet before it can be saved.
    sheet = workbook.create_sheet(type, index=1)

    # Header row.
    captions = ('图片', '标题', '下载地址1', '下载地址2')
    for column, caption in enumerate(captions, start=1):
        sheet.cell(row=1, column=column, value=caption)

    # Column widths.
    for letter, width in (('A', 15), ('B', 35), ('C', 50), ('D', 50)):
        sheet.column_dimensions[letter].width = width

    workbook.save(file_path)
def checkExcelSheet(type):
    """Ensure the workbook at ``file_path`` contains a sheet named ``type``.

    When the sheet is missing it is appended after the existing sheets,
    given the header row and column widths, and the workbook is saved.
    When the sheet already exists the file is left untouched.

    Args:
        type: Title of the sheet that must exist.
    """
    workbook = openpyxl.load_workbook(file_path)
    existing = workbook.sheetnames
    if type in existing:
        return

    # Append the new sheet behind all existing ones.
    sheet = workbook.create_sheet(type, index=len(existing) + 1)

    # Header row.
    captions = ('图片', '标题', '下载地址1', '下载地址2')
    for column, caption in enumerate(captions, start=1):
        sheet.cell(row=1, column=column, value=caption)

    # Column widths.
    for letter, width in (('A', 15), ('B', 35), ('C', 50), ('D', 50)):
        sheet.column_dimensions[letter].width = width

    workbook.save(file_path)
def downloadOnePage(type, pno):
    """Scrape one list page of a category and resolve each entry's links.

    The list page is fetched, then the detail page of every entry is fetched
    to read its two download-link input fields.

    Args:
        type: Category identifier inserted into the list-page URL.
        pno: Number of the list page to fetch.

    Returns:
        A list of dicts, one per usable entry, with the keys ``title``,
        ``img``, ``torrent1``, ``torrent2`` and ``imgId``. ``imgId`` is
        ``"<pno>_<position>"`` where position is the 1-based index of the
        entry on the page. Entries lacking a title, detail link or image URL
        are skipped; a missing download link is stored as an empty string.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    # Without a timeout a stalled connection would block this worker forever.
    timeout = 30

    def first(nodes, default=None):
        """Return the first XPath match, or ``default`` when nothing matched."""
        return nodes[0] if nodes else default

    url = base_url + '/shipin/list-' + type + '-' + str(pno) + '.html'
    response = requests.get(url=url, headers=headers, timeout=timeout)
    data = etree.HTML(response.text)
    shipinList = data.xpath('//div[@id="tpl-img-content"]/li')

    shipins = []
    for i, shipin in enumerate(shipinList, start=1):
        title = first(shipin.xpath('a/h3/@title'))
        href = first(shipin.xpath('a/@href'))
        img = first(shipin.xpath('a/img/@data-original'))
        if title is None or href is None or img is None:
            # One malformed <li> must not abort the whole page (and thread).
            print("第 ", pno, " 页第 ", i, " 条缺少标题/链接/图片, 已跳过")
            continue

        # The title attribute is passed through aes_decrypt before use
        # (presumably the site serves it encrypted - confirm in runjs).
        title = aes_decrypt(title)

        hRes = requests.get(url=base_url + href, headers=headers, timeout=timeout)
        hData = etree.HTML(hRes.text)
        print("正在获取第 ", pno, " 页第 ", i, " 条视频链接: ", title)

        shipins.append({
            'title': mg_blank(rm_biaodian(title)),
            'img': img,
            'torrent1': first(
                hData.xpath('//input[@class="form-control input-sm copy_btn"]/@value'), ''),
            'torrent2': first(
                hData.xpath('//input[@class="form-control input-sm copy_btn app_disable"]/@value'), ''),
            # Also used to build the temporary image file name.
            'imgId': str(pno) + '_' + str(i),
        })
    return shipins
def downloadImgs(shipins):
    """Download the image of every entry into ``demo_img_path``.

    Each image is stored as ``image<imgId>.jpg``; wirteToSheet rebuilds the
    same path to embed the file and delete it afterwards.

    Args:
        shipins: Entry dicts providing ``img`` (image URL) and ``imgId``.

    Raises:
        requests.RequestException: If a download fails, times out, or the
            server answers with an error status.
    """
    for shipin in shipins:
        img = shipin['img']
        imgId = shipin['imgId']
        print("图片下载: ", img, " => ", imgId)

        # Bound the wait so one stalled image cannot hang the worker thread.
        response = requests.get(img, timeout=30)
        # Fail here rather than saving an error page as a .jpg, which would
        # only blow up later when the workbook tries to load it as an image.
        response.raise_for_status()

        img_path = demo_img_path + '/image' + imgId + '.jpg'
        with open(img_path, 'wb') as f:
            f.write(response.content)
# Serialises workbook updates: all download threads write to the same file.
lock = threading.Lock()


def wirteToSheet(type, shipins, pno):
    """Append one page of entries, with embedded images, to the workbook.

    The whole update runs under the module-level ``lock``. The workbook is
    backed up first, the rows are appended after the sheet's current last
    row, the workbook is saved, and the temporary image files are removed.

    The lock is held through a ``with`` block so that it is released even
    when an exception is raised; otherwise a single failure would leave the
    lock held and block every other thread forever.

    Args:
        type: Name of the sheet to append to.
        shipins: Entry dicts providing ``title``, ``torrent1``, ``torrent2``
            and ``imgId``.
        pno: Page number, used only in log output.
    """
    with lock:
        print("第 ", pno, " 页加锁 thread_name = ", threading.current_thread().name)
        try:
            print("复制备份torrent文件...")
            bakExcel()

            wb = openpyxl.load_workbook(file_path)
            sheet = wb[type]
            # Left-aligned horizontally, centred vertically.
            align = Alignment(horizontal='left', vertical='center')
            # Number of the sheet's last used row; new rows go below it.
            i = sheet.max_row
            for shipin in shipins:
                title = shipin['title']
                torrent1 = shipin['torrent1']
                torrent2 = shipin['torrent2']
                imgId = shipin['imgId']

                # Tall row so the picture is visible.
                sheet.row_dimensions[i + 1].height = 100
                img_path = demo_img_path + '/image' + imgId + '.jpg'
                img = Image(img_path)
                # Anchor the picture to column A of the new row. The marker
                # arguments are (col, colOff, row, rowOff) with zero-based
                # col/row, so row ``i`` is worksheet row ``i + 1``; the
                # offsets inset the picture slightly from the cell borders.
                _from = AnchorMarker(0, 50000, i, 50000)
                to = AnchorMarker(1, -50000, i + 1, -50000)
                img.anchor = TwoCellAnchor('twoCell', _from, to)
                sheet.add_image(img)

                sheet.cell(row=i + 1, column=2, value=title).alignment = align
                sheet.cell(row=i + 1, column=3, value=torrent1).alignment = align
                sheet.cell(row=i + 1, column=4, value=torrent2).alignment = align
                i += 1

            wb.save(file_path)
            wb.close()

            # The images are only deleted after the workbook has been saved.
            print("删除第 ", pno, " 页临时图片文件")
            for shipin in shipins:
                imgId = shipin['imgId']
                img_path = demo_img_path + '/image' + imgId + '.jpg'
                os.remove(img_path)
        finally:
            print("第 ", pno, " 页释放 thread_name = ", threading.current_thread().name)
def bakExcel():
    """Copy the workbook at ``file_path`` to ``bak_path`` as a backup."""
    source, destination = file_path, bak_path
    shutil.copy(source, destination)
def split_page_ranges(page_start, page_size, thread_total):
    """Split consecutive pages into contiguous chunks, one per worker thread.

    The pages ``page_start .. page_start + page_size - 1`` are divided into
    chunks of ``ceil(page_size / thread_total)`` pages. Every chunk is clamped
    to the end of the requested range, so no page outside the range is ever
    assigned, and no empty chunk is produced.

    Args:
        page_start: First page number.
        page_size: Total number of pages to cover.
        thread_total: Maximum number of chunks (worker threads).

    Returns:
        A list of ``(start, end)`` tuples describing half-open page ranges.
        It has at most ``thread_total`` items and is empty when
        ``page_size`` is not positive.

    Raises:
        ValueError: If ``thread_total`` is smaller than 1.
    """
    if thread_total < 1:
        raise ValueError("thread_total must be at least 1")
    if page_size <= 0:
        return []
    page_end = page_start + page_size
    chunk = -(-page_size // thread_total)  # ceiling division
    return [
        (start, min(start + chunk, page_end))
        for start in range(page_start, page_end, chunk)
    ]


if __name__ == '__main__':
    category = input("请输入下载的视频分类: ")
    page_start = int(input("请输入下载的视频起始页: "))
    page_size = int(input("请输入下载的视频总页数: "))
    thread_total = int(input("请输入启动的线程数目: "))

    # Validate the input and plan the work before touching the file system.
    page_ranges = split_page_ranges(page_start, page_size, thread_total)

    # Create the output directory when it does not exist yet.
    dir_name = os.path.dirname(file_path)
    os.makedirs(dir_name, exist_ok=True)

    # Create the workbook when it does not exist yet.
    if not os.path.isfile(file_path):
        createExcel(file_path, category)
    checkExcelSheet(category)

    threads = []
    for i, (start, end) in enumerate(page_ranges):
        t = threading.Thread(target=download, args=(category, start, end))
        threads.append(t)
        t.start()
        print("i = ", i, " start = ", start, " end = ", end)

    # Wait for all workers to finish.
    for t in threads:
        t.join()
    print("已完成下载...")
还是一样的,base_url 属于敏感词汇,*号代替。
虽说python的多线程是假的,但是我同时爬取不同页面的时候多线程是实打实的有效的呀。
多线程爬取跟上次不同的是,这次需要并发写入同一个excel,所以采用了加锁处理(不知道为啥把lock当作参数传递会报错,所以直接声明在了外面)。
写入torrent.xlsx之前先做了个备份处理,毕竟万一写入过程中发生了个啥错误,那岂不是前功尽弃了。
效果图: