The market has been rough lately and I've taken some painful losses, so I figured I'd distract myself by studying a bit of Python and scraping some novels.
Honestly, Python was born for scraping HS stuff. spider_hnovel.py:
#!/usr/bin/env python
import os

import requests
from lxml import etree

from runjs import aes_decrypt
from util_re import rm_biaodian, mg_blank

base_url = "****"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cookie': '_ga_YF2C1MXZ3R=GS1.1.1688975856.1.0.1688975856.0.0.0; _ga=GA1.1.1280715718.1688975857; Hm_lvt_fc300eff92dc455e5696ee2011a5337c=1688975862; Hm_lpvt_fc300eff92dc455e5696ee2011a5337c=1688976992'
}

def download(type, start, end):
    # Download list pages [start, end) of the given category; end is exclusive.
    if end < start:
        return
    fpath = './spider_novel/' + type
    if not os.path.exists(fpath):
        os.makedirs(fpath)
    for pno in range(start, end):
        downloadOnePage(type, pno, fpath)

def downloadOnePage(type, pno, fpath):
    # Fetch one list page and collect (title, link) for every novel on it.
    url = base_url + '/xiaoshuo/list-' + type + '-' + str(pno) + '.html'
    response = requests.get(url=url, headers=headers)
    data = etree.HTML(response.text)
    hnovelList = data.xpath('//div[@class="text-list-html"]/div/ul/li')
    hnovels = []
    for hnovel in hnovelList:
        hnovelDict = {}
        # Titles are AES-encrypted inside the @title attribute; decrypt them,
        # then strip punctuation and whitespace so they make safe file names.
        title = hnovel.xpath('a/@title')[0]
        href = hnovel.xpath('a/@href')[0]
        title = aes_decrypt(title)
        hnovelDict['title'] = mg_blank(rm_biaodian(title))
        hnovelDict['href'] = base_url + href
        hnovels.append(hnovelDict)
    print("Collected links for category", type, "page", pno)
    for hnovel in hnovels:
        title = hnovel['title']
        href = hnovel['href']
        hRes = requests.get(url=href, headers=headers)
        hData = etree.HTML(hRes.text)
        print("Downloading:", title)
        # The novel text itself is also AES-encrypted, in the content div's @title.
        hcontent = hData.xpath('//div[@class="dec-raw content"]/@title')[0]
        hHtml = etree.HTML(aes_decrypt(hcontent))
        content = hHtml.xpath('//p/text()')
        with open(fpath + '/' + title + '.txt', 'w', encoding='utf-8') as f:
            for line in content:
                f.write(str(line) + '\n')
        print("Finished:", title)

if __name__ == '__main__':
    type = input("Category to download: ")
    start = int(input("First page: "))
    end = int(input("Last page: "))
    download(type, start, end + 1)  # +1 because download() treats end as exclusive
    print("All downloads finished.")
Since this is HS material, I'm not posting the base_url, to avoid getting this taken down.
Honestly, these sites take security more seriously than 90% of the web: even the page data is encrypted in transit, using AES in CBC mode. I couldn't make sense of the key format at first (it doesn't look like a normal AES key at all), fiddled with it in Python for half a day and got nowhere, so as a last resort I pulled their JS out and had Python call it to do the decryption. runjs.py:
#!/usr/bin/env python
import subprocess
from functools import partial
# Force UTF-8 on the Node subprocess that execjs spawns; without this,
# Chinese text comes back garbled on systems whose default encoding isn't UTF-8.
subprocess.Popen = partial(subprocess.Popen, encoding='utf-8')
import execjs

def aes_decrypt(data):
    jsstr = get_js()
    ctx = execjs.compile(jsstr)  # load and compile the JS file
    # Call the JS function: the first argument is the JS function name,
    # the rest are passed through as its arguments.
    return ctx.call('Decrypt', data)

def get_js():
    with open("./spider_novel/encrypt.js", 'r', encoding='utf-8') as f:
        return f.read()
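One small optimization worth mentioning: as written, every aes_decrypt() call re-reads and re-compiles encrypt.js. If you are decrypting a lot of titles, you can compile once at import time instead. A sketch with the same behavior otherwise:
import execjs

with open("./spider_novel/encrypt.js", 'r', encoding='utf-8') as f:
    _ctx = execjs.compile(f.read())  # read and compile just once

def aes_decrypt(data):
    # Each call still launches Node, but skips the repeated file read/compile.
    return _ctx.call('Decrypt', data)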
encrypt.js:
var CryptoJS = require("crypto-js");

const key = "SWRUSnEwSGtscHVJNm11OGlCJU9PQCF2ZF40SyZ1WFc=";
const iv = "JDB2QGtySDdWMg==";
const suffix = "883346";

// decryption
function Decrypt(data) {
    // key and iv are stored base64-encoded; decode them first,
    // then pad the short IV to 16 bytes with the suffix.
    let new_key = base64decoder(key);
    let new_iv = base64decoder(iv);
    new_iv = CryptoJS.enc.Utf8.parse(new_iv + suffix);
    new_key = CryptoJS.enc.Utf8.parse(new_key);
    // Avoid console.log here: extra stdout lines can confuse execjs
    // when it parses the result back from the Node process.
    let decrypted = CryptoJS.AES.decrypt(data, new_key, {
        iv: new_iv,
        mode: CryptoJS.mode.CBC,
        padding: CryptoJS.pad.Pkcs7,
        formatter: CryptoJS.format.OpenSSL
    });
    return decrypted.toString(CryptoJS.enc.Utf8);
}

// encryption
function Encrypt(data) {
    try {
        let new_key = base64decoder(key);
        let new_iv = base64decoder(iv);
        new_iv = CryptoJS.enc.Utf8.parse(new_iv + suffix);
        new_key = CryptoJS.enc.Utf8.parse(new_key);
        let encrypted = CryptoJS.AES.encrypt(data, new_key, {
            iv: new_iv,
            mode: CryptoJS.mode.CBC,
            padding: CryptoJS.pad.Pkcs7,
            formatter: CryptoJS.format.OpenSSL
        });
        return encrypted.toString();
    } catch (e) {
        //console.log(e.message);
    }
}

function base64encoder(data) {
    let tmp = CryptoJS.enc.Utf8.parse(data);
    return CryptoJS.enc.Base64.stringify(tmp);
}

function base64decoder(context) {
    let tmp = CryptoJS.enc.Base64.parse(context);
    return CryptoJS.enc.Utf8.stringify(tmp);
}

// execjs calls Decrypt by name; export it as well for ordinary requires.
module.exports = { Decrypt, Encrypt };
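In hindsight, the thing that tripped me up is that both constants are base64-wrapped: the key string decodes to 32 bytes (so this is AES-256), and the IV decodes to 10 bytes and gets padded to 16 with the "883346" suffix. Knowing that, the same decryption can in fact be reproduced in pure Python. Here's a sketch using the pycryptodome package, untested against the live site; it just mirrors the JS logic above:
#!/usr/bin/env python
import base64

from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad

KEY_B64 = "SWRUSnEwSGtscHVJNm11OGlCJU9PQCF2ZF40SyZ1WFc="
IV_B64 = "JDB2QGtySDdWMg=="
SUFFIX = "883346"

def aes_decrypt_py(data):
    key = base64.b64decode(KEY_B64)                  # 32 bytes -> AES-256
    iv = base64.b64decode(IV_B64) + SUFFIX.encode()  # 10 + 6 = 16 bytes
    cipher = AES.new(key, AES.MODE_CBC, iv=iv)
    # CryptoJS's OpenSSL formatter without a salt header is plain base64 ciphertext.
    plaintext = unpad(cipher.decrypt(base64.b64decode(data)), AES.block_size)
    return plaintext.decode('utf-8')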
With that in place, spider_hnovel can be run to scrape the novels.
It turned out to be a bit slow this way, so I added multithreading. spider_thread.py:
import threading

from spider_hnovel import download

if __name__ == '__main__':
    type = input("Category to download: ")
    page_start = int(input("First page: "))
    page_size = int(input("Total number of pages: "))
    thread_total = int(input("Number of threads: "))
    page_end = page_start + page_size
    # Pages per thread, rounded up so no page is dropped.
    if page_size % thread_total == 0:
        thread_size = page_size // thread_total
    else:
        thread_size = page_size // thread_total + 1
    threads = []
    for i in range(thread_total):
        start = page_start + i * thread_size
        # Clamp so no chunk ever runs past page_end (the ceiling
        # division can otherwise make a middle thread overshoot).
        end = min(start + thread_size, page_end)
        # Create the thread and add it to the list
        t = threading.Thread(target=download, args=(type, start, end))
        threads.append(t)
        t.start()
        print("i =", i, " start =", start, " end =", end)
    # Wait for every thread to finish
    for t in threads:
        t.join()
    print("All downloads finished.")
Scraping is much faster this way. Now it's me who's a bit worn out, so I'm off to take a break.