The market has been rough lately and I've taken some painful losses, so I figured I'd distract myself by studying a bit of Python and scraping some novels.
Honestly, Python was born for scraping HS stuff. spider_hnovel.py:
#!/usr/bin/env python
import os

import requests
from lxml import etree

from runjs import aes_decrypt
from util_re import rm_biaodian, mg_blank

base_url = "****"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Cookie': '_ga_YF2C1MXZ3R=GS1.1.1688975856.1.0.1688975856.0.0.0; _ga=GA1.1.1280715718.1688975857; Hm_lvt_fc300eff92dc455e5696ee2011a5337c=1688975862; Hm_lpvt_fc300eff92dc455e5696ee2011a5337c=1688976992'
}

def download(type, start, end):
    # Download list pages [start, end) of the given category; end is exclusive.
    if end < start:
        return
    fpath = './spider_novel/' + type
    if not os.path.exists(fpath):
        os.makedirs(fpath)
    for pno in range(start, end):
        downloadOnePage(type, pno, fpath)

def downloadOnePage(type, pno, fpath):
    # Fetch one list page and collect (title, link) for every novel on it.
    url = base_url + '/xiaoshuo/list-' + type + '-' + str(pno) + '.html'
    response = requests.get(url=url, headers=headers)
    data = etree.HTML(response.text)
    hnovelList = data.xpath('//div[@class="text-list-html"]/div/ul/li')
    hnovels = []
    for hnovel in hnovelList:
        hnovelDict = {}
        # Titles are AES-encrypted inside the @title attribute; decrypt them,
        # then strip punctuation and whitespace so they make safe file names.
        title = hnovel.xpath('a/@title')[0]
        href = hnovel.xpath('a/@href')[0]
        title = aes_decrypt(title)
        hnovelDict['title'] = mg_blank(rm_biaodian(title))
        hnovelDict['href'] = base_url + href
        hnovels.append(hnovelDict)
    print("Collected links for category", type, "page", pno)
    for hnovel in hnovels:
        title = hnovel['title']
        href = hnovel['href']
        hRes = requests.get(url=href, headers=headers)
        hData = etree.HTML(hRes.text)
        print("Downloading:", title)
        # The novel text itself is also AES-encrypted, in the content div's @title.
        hcontent = hData.xpath('//div[@class="dec-raw content"]/@title')[0]
        hHtml = etree.HTML(aes_decrypt(hcontent))
        content = hHtml.xpath('//p/text()')
        with open(fpath + '/' + title + '.txt', 'w', encoding='utf-8') as f:
            for line in content:
                f.write(str(line) + '\n')
        print("Finished:", title)

if __name__ == '__main__':
    type = input("Category to download: ")
    start = int(input("First page: "))
    end = int(input("Last page: "))
    download(type, start, end + 1)  # +1 because download() treats end as exclusive
    print("All downloads finished.")
Since this is HS material, I'm not posting the base_url, to avoid getting this taken down.
Honestly, these sites take security more seriously than 90% of the web: even the page data is encrypted in transit, using AES in CBC mode. I couldn't make sense of the key format at first (it doesn't look like a normal AES key at all), fiddled with it in Python for half a day and got nowhere, so as a last resort I pulled their JS out and had Python call it to do the decryption. runjs.py:
#!/usr/bin/env python
import subprocess
from functools import partial
# Force UTF-8 on the Node subprocess that execjs spawns; without this,
# Chinese text comes back garbled on systems whose default encoding isn't UTF-8.
subprocess.Popen = partial(subprocess.Popen, encoding='utf-8')
import execjs

def aes_decrypt(data):
    jsstr = get_js()
    ctx = execjs.compile(jsstr)  # load and compile the JS file
    # Call the JS function: the first argument is the JS function name,
    # the rest are passed through as its arguments.
    return ctx.call('Decrypt', data)

def get_js():
    with open("./spider_novel/encrypt.js", 'r', encoding='utf-8') as f:
        return f.read()
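One small optimization worth mentioning: as written, every aes_decrypt() call re-reads and re-compiles encrypt.js. If you are decrypting a lot of titles, you can compile once at import time instead. A sketch with the same behavior otherwise:
import execjs

with open("./spider_novel/encrypt.js", 'r', encoding='utf-8') as f:
    _ctx = execjs.compile(f.read())  # read and compile just once

def aes_decrypt(data):
    # Each call still launches Node, but skips the repeated file read/compile.
    return _ctx.call('Decrypt', data)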
encrypt.js:
var CryptoJS = require("crypto-js");

const key = "SWRUSnEwSGtscHVJNm11OGlCJU9PQCF2ZF40SyZ1WFc=";
const iv = "JDB2QGtySDdWMg==";
const suffix = "883346";

// decryption
function Decrypt(data) {
    // key and iv are stored base64-encoded; decode them first,
    // then pad the short IV to 16 bytes with the suffix.
    let new_key = base64decoder(key);
    let new_iv = base64decoder(iv);
    new_iv = CryptoJS.enc.Utf8.parse(new_iv + suffix);
    new_key = CryptoJS.enc.Utf8.parse(new_key);
    // Avoid console.log here: extra stdout lines can confuse execjs
    // when it parses the result back from the Node process.
    let decrypted = CryptoJS.AES.decrypt(data, new_key, {
        iv: new_iv,
        mode: CryptoJS.mode.CBC,
        padding: CryptoJS.pad.Pkcs7,
        formatter: CryptoJS.format.OpenSSL
    });
    return decrypted.toString(CryptoJS.enc.Utf8);
}

// encryption
function Encrypt(data) {
    try {
        let new_key = base64decoder(key);
        let new_iv = base64decoder(iv);
        new_iv = CryptoJS.enc.Utf8.parse(new_iv + suffix);
        new_key = CryptoJS.enc.Utf8.parse(new_key);
        let encrypted = CryptoJS.AES.encrypt(data, new_key, {
            iv: new_iv,
            mode: CryptoJS.mode.CBC,
            padding: CryptoJS.pad.Pkcs7,
            formatter: CryptoJS.format.OpenSSL
        });
        return encrypted.toString();
    } catch (e) {
        //console.log(e.message);
    }
}

function base64encoder(data) {
    let tmp = CryptoJS.enc.Utf8.parse(data);
    return CryptoJS.enc.Base64.stringify(tmp);
}

function base64decoder(context) {
    let tmp = CryptoJS.enc.Base64.parse(context);
    return CryptoJS.enc.Utf8.stringify(tmp);
}

// execjs calls Decrypt by name; export it as well for ordinary requires.
module.exports = { Decrypt, Encrypt };
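In hindsight, the thing that tripped me up is that both constants are base64-wrapped: the key string decodes to 32 bytes (so this is AES-256), and the IV decodes to 10 bytes and gets padded to 16 with the "883346" suffix. Knowing that, the same decryption can in fact be reproduced in pure Python. Here's a sketch using the pycryptodome package, untested against the live site; it just mirrors the JS logic above:
#!/usr/bin/env python
import base64

from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad

KEY_B64 = "SWRUSnEwSGtscHVJNm11OGlCJU9PQCF2ZF40SyZ1WFc="
IV_B64 = "JDB2QGtySDdWMg=="
SUFFIX = "883346"

def aes_decrypt_py(data):
    key = base64.b64decode(KEY_B64)                  # 32 bytes -> AES-256
    iv = base64.b64decode(IV_B64) + SUFFIX.encode()  # 10 + 6 = 16 bytes
    cipher = AES.new(key, AES.MODE_CBC, iv=iv)
    # CryptoJS's OpenSSL formatter without a salt header is plain base64 ciphertext.
    plaintext = unpad(cipher.decrypt(base64.b64decode(data)), AES.block_size)
    return plaintext.decode('utf-8')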
With that in place, spider_hnovel can be run to scrape the novels.
It turned out to be a bit slow this way, so I added multithreading. spider_thread.py:
import threading

from spider_hnovel import download

if __name__ == '__main__':
    type = input("Category to download: ")
    page_start = int(input("First page: "))
    page_size = int(input("Total number of pages: "))
    thread_total = int(input("Number of threads: "))
    page_end = page_start + page_size
    # Pages per thread, rounded up so no page is dropped.
    if page_size % thread_total == 0:
        thread_size = page_size // thread_total
    else:
        thread_size = page_size // thread_total + 1
    threads = []
    for i in range(thread_total):
        start = page_start + i * thread_size
        # Clamp so no chunk ever runs past page_end (the ceiling
        # division can otherwise make a middle thread overshoot).
        end = min(start + thread_size, page_end)
        # Create the thread and add it to the list
        t = threading.Thread(target=download, args=(type, start, end))
        threads.append(t)
        t.start()
        print("i =", i, " start =", start, " end =", end)
    # Wait for every thread to finish
    for t in threads:
        t.join()
    print("All downloads finished.")
Scraping is much faster this way. Now it's me who's a bit worn out, so I'm off to take a break.