异步采集某阁小说平台，并设置id进行章节排序！

黑色乏味

已于 2023-12-18 10:38:21 修改

阅读量440

点赞数 11

文章标签： python 网络爬虫

于 2023-12-18 10:32:38 首次发布

本文链接：https://blog.csdn.net/weixin_45727633/article/details/135056293

版权

采集某阁的盗版平台

采集流程:

1.输入小说名称查询到小说的链接

2.解析小说链接获取到每个章节的链接

3.解析章节链接获取内容

4.对内容进行排序保存

首先导入所需库:

1.输入小说名称查询到小说的链接

直接找到站点的搜索接口并调用，传入的参数即为小说名称

直接上本模块功能代码:

def get_key(keys, max_retries=10):
    """Look up a novel by exact title and return the URL of its index page.

    Calls the site's search endpoint and scans the decoded JSON for entries
    that carry both an ``articlename`` and a ``url_list`` field.

    :param keys: novel title to search for (compared for exact equality)
    :param max_retries: maximum number of requests sent while the endpoint
        answers with the literal body ``1``, the request fails, or the body
        is not valid JSON
    :return: absolute URL of the matching novel, or None when no entry
        matches or every attempt was rejected
    """
    import time

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.43",
    }
    key_code = parse.quote(keys.encode('utf-8'))
    url = f'https://www.biqg.cc/user/search.html?q={key_code}'

    def _iter_novels(node):
        """Yield (articlename, url_list) for every dict in *node* holding both keys."""
        if isinstance(node, dict):
            if 'articlename' in node and 'url_list' in node:
                yield node['articlename'], node['url_list']
            for value in node.values():
                yield from _iter_novels(value)
        elif isinstance(node, list):
            for item in node:
                yield from _iter_novels(item)

    for attempt in range(max_retries):
        if attempt:
            # Pause between attempts instead of hammering the endpoint.
            time.sleep(1)
        try:
            response = requests.get(url=url, headers=headers, timeout=15).text
        except requests.RequestException as e:
            print(f'请求无效:{e}')
            continue
        if response == '1':
            # A bare "1" body is handled as a rejected request and retried.
            print(f'请求无效:{response}')
            continue
        try:
            response_json = json.loads(response)
        except ValueError as e:
            # Body was not JSON; count it as a failed attempt.
            print(f'请求无效:{e}')
            continue
        # Read the fields from the decoded JSON itself rather than from its
        # repr, so titles containing quote characters are still matched.
        for novel_name, novel_href in _iter_novels(response_json):
            if keys == str(novel_name):
                novel_url = f'https://www.biqg.cc{novel_href}'
                print(f'查询成功--->名称:{novel_name}---链接:{novel_url}')
                return novel_url
        print(f'未查询到{keys}的信息')
        return None
    print(f'搜索请求连续{max_retries}次无效,放弃查询:{keys}')
    return None

运行示例:

2.解析小说链接获取到每个章节的链接

既然获取到小说链接了,就进入小说链接页面进行所有链接的采集

代码:

def get_urls(url, max_retries=5):
    """Collect the absolute URL of every chapter linked from a novel's index page.

    :param url: URL of the novel's index page
    :param max_retries: number of download/parse attempts before giving up
    :return: chapter URLs in page order; an empty list when every attempt failed
    """
    from urllib.parse import urljoin

    # NOTE(review): the cookie (including cf_clearance) is a hard-coded value
    # captured from a browser session; presumably it expires -- confirm.
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "cookie": "Hm_lvt_aee414c80f8428eae3eaab28dda30d0a=1702626463; cf_clearance=NOp519MdkLnQoGFfIPOofBJ_pD3k_pQn0.smF4oZcF0-1702626426-0-1-be1d63e.da1a60d5.5ce089de-0.2.1702626426; Hm_lpvt_aee414c80f8428eae3eaab28dda30d0a=1702626469",
        "referer": "https://www.beqege.com/",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.43"
    }
    for num in range(max_retries):
        try:
            response = requests.get(url=url, headers=headers, timeout=15)
            html = etree.HTML(response.text)
            hrefs = html.xpath('''//div[@class='listmain']//a/@href''')
        except Exception as e:
            # Network errors and unparsable pages are both retried.
            print(f'采集第{num+1}次重试--->失败:{e}--->{url}')
            continue
        # Skip the "show more" javascript pseudo-link and resolve each href
        # individually: relative ones are joined onto the page URL, absolute
        # ones pass through unchanged.
        return [urljoin(url, href) for href in hrefs if ':dd_show()' not in href]
    # Every attempt failed: hand back an empty list so callers can iterate safely.
    return []

运行示例:

3.解析章节链接获取内容

代码:

async def parse_urls(url, max_retries=5):
    """Download one chapter page and extract its title and body text.

    :param url: URL of the chapter page
    :param max_retries: number of attempts before giving up
    :return: ``(title, content)`` where ``title`` is the first text node of
        the ``h1.wap_none`` heading and ``content`` is the list of text nodes
        under ``div#chaptercontent``; None when every attempt failed
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.43",
    }
    timeout = aiohttp.ClientTimeout(total=15)
    # One session serves every attempt for this URL instead of building a
    # fresh session (and connection pool) per retry.
    async with aiohttp.ClientSession(timeout=timeout) as session:
        for num in range(max_retries):
            try:
                async with session.get(url=url, headers=headers) as resp:
                    # Surface HTTP errors directly rather than as an
                    # IndexError from the title lookup below.
                    resp.raise_for_status()
                    response = await resp.text()
                html = etree.HTML(response)
                title = html.xpath('''//h1[@class='wap_none']//text()''')[0]
                content = html.xpath('''//div[@id='chaptercontent']//text()''')
                return title, content
            except Exception as e:
                # Timeouts, HTTP errors and pages missing the expected
                # markup are all retried.
                print(f'采集第{num+1}次重试--->失败:{e}--->{url}')
    return None

异步请求所有链接,报错自动重试5次

4.对内容进行排序保存

本次排序逻辑为：采集到的链接本身是有序的，只是异步解析时各章节完成的先后顺序不固定，可能导致章节错乱。应对方法很简单：在采集到所有链接后为每个链接分配一个 id，保存时根据 id 判断顺序。（注：asyncio.gather 返回结果的顺序与传入任务的顺序一致，因此这里的 id 与链接的原始顺序一一对应。）

def optimize_save_info(url_list, name=None):
    """Append the downloaded chapters to ``./xs/<name>.txt`` in list order.

    :param url_list: despite its name, the per-chapter download results; each
        item is a ``(title, content)`` pair whose ``content`` is an iterable
        of text lines, or None for a chapter that could not be downloaded
    :param name: base name of the output file; when omitted, the module-level
        ``key`` is used
    :return: None
    """
    import os

    if name is None:
        name = key  # module-level novel title
    # Create the output directory up front so open() cannot fail on it.
    os.makedirs('./xs', exist_ok=True)
    # The list is already in chapter order, so one pass with a running
    # 1-based id is enough; the file is opened once for the whole run.
    with open(f'./xs/{name}.txt', 'a', encoding='utf-8') as f:
        for num, info in enumerate(url_list, start=1):
            if info is None:
                # A failed download must not abort saving the other chapters.
                print(f'第{num}个章节采集失败,已跳过')
                continue
            title = info[0]
            content = info[1]
            f.write(title + '\n' * 2)
            for con in content:
                f.write(con + '\n')
            f.write('\n' + '=' * 50 + '\n' * 2)
            print(f'{title}--->保存成功')

def parse_main(url_list):
    """Download every chapter concurrently and pass the results to the saver.

    :param url_list: chapter URLs in reading order; None or an empty list
        means there is nothing to download
    :return: None
    """
    print(url_list)
    if not url_list:
        print('未获取到任何章节链接,跳过采集')
        return

    async def _download_all():
        # gather() returns results in the order of its arguments, so the
        # chapters stay in page order whichever request finishes first.
        return await asyncio.gather(*(parse_urls(u) for u in url_list))

    # asyncio.run() creates and closes its own event loop; calling
    # get_event_loop() with no running loop is deprecated.
    info_list = asyncio.run(_download_all())
    optimize_save_info(info_list)

至此所有代码完毕

整体代码：

import aiohttp
import asyncio
import requests
import json
import re
import time
from lxml import etree
from urllib import parse


def get_key(keys, max_retries=10):
    """Look up a novel by exact title and return the URL of its index page.

    Calls the site's search endpoint and scans the decoded JSON for entries
    that carry both an ``articlename`` and a ``url_list`` field.

    :param keys: novel title to search for (compared for exact equality)
    :param max_retries: maximum number of requests sent while the endpoint
        answers with the literal body ``1``, the request fails, or the body
        is not valid JSON
    :return: absolute URL of the matching novel, or None when no entry
        matches or every attempt was rejected
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.43",
    }
    key_code = parse.quote(keys.encode('utf-8'))
    url = f'https://www.biqg.cc/user/search.html?q={key_code}'

    def _iter_novels(node):
        """Yield (articlename, url_list) for every dict in *node* holding both keys."""
        if isinstance(node, dict):
            if 'articlename' in node and 'url_list' in node:
                yield node['articlename'], node['url_list']
            for value in node.values():
                yield from _iter_novels(value)
        elif isinstance(node, list):
            for item in node:
                yield from _iter_novels(item)

    for attempt in range(max_retries):
        if attempt:
            # Pause between attempts instead of hammering the endpoint.
            time.sleep(1)
        try:
            response = requests.get(url=url, headers=headers, timeout=15).text
        except requests.RequestException as e:
            print(f'请求无效:{e}')
            continue
        if response == '1':
            # A bare "1" body is handled as a rejected request and retried.
            print(f'请求无效:{response}')
            continue
        try:
            response_json = json.loads(response)
        except ValueError as e:
            # Body was not JSON; count it as a failed attempt.
            print(f'请求无效:{e}')
            continue
        # Read the fields from the decoded JSON itself rather than from its
        # repr, so titles containing quote characters are still matched.
        for novel_name, novel_href in _iter_novels(response_json):
            if keys == str(novel_name):
                novel_url = f'https://www.biqg.cc{novel_href}'
                print(f'查询成功--->名称:{novel_name}---链接:{novel_url}')
                return novel_url
        print(f'未查询到{keys}的信息')
        return None
    print(f'搜索请求连续{max_retries}次无效,放弃查询:{keys}')
    return None


def get_urls(url, max_retries=5):
    """Collect the absolute URL of every chapter linked from a novel's index page.

    :param url: URL of the novel's index page
    :param max_retries: number of download/parse attempts before giving up
    :return: chapter URLs in page order; an empty list when every attempt failed
    """
    # NOTE(review): the cookie (including cf_clearance) is a hard-coded value
    # captured from a browser session; presumably it expires -- confirm.
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "cookie": "Hm_lvt_aee414c80f8428eae3eaab28dda30d0a=1702626463; cf_clearance=NOp519MdkLnQoGFfIPOofBJ_pD3k_pQn0.smF4oZcF0-1702626426-0-1-be1d63e.da1a60d5.5ce089de-0.2.1702626426; Hm_lpvt_aee414c80f8428eae3eaab28dda30d0a=1702626469",
        "referer": "https://www.beqege.com/",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.43"
    }
    for num in range(max_retries):
        try:
            response = requests.get(url=url, headers=headers, timeout=15)
            html = etree.HTML(response.text)
            hrefs = html.xpath('''//div[@class='listmain']//a/@href''')
        except Exception as e:
            # Network errors and unparsable pages are both retried.
            print(f'采集第{num+1}次重试--->失败:{e}--->{url}')
            continue
        # Skip the "show more" javascript pseudo-link and resolve each href
        # individually: relative ones are joined onto the page URL, absolute
        # ones pass through unchanged.
        return [parse.urljoin(url, href) for href in hrefs if ':dd_show()' not in href]
    # Every attempt failed: hand back an empty list so callers can iterate safely.
    return []


async def parse_urls(url, max_retries=5):
    """Download one chapter page and extract its title and body text.

    :param url: URL of the chapter page
    :param max_retries: number of attempts before giving up
    :return: ``(title, content)`` where ``title`` is the first text node of
        the ``h1.wap_none`` heading and ``content`` is the list of text nodes
        under ``div#chaptercontent``; None when every attempt failed
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.43",
    }
    timeout = aiohttp.ClientTimeout(total=15)
    # One session serves every attempt for this URL instead of building a
    # fresh session (and connection pool) per retry.
    async with aiohttp.ClientSession(timeout=timeout) as session:
        for num in range(max_retries):
            try:
                async with session.get(url=url, headers=headers) as resp:
                    # Surface HTTP errors directly rather than as an
                    # IndexError from the title lookup below.
                    resp.raise_for_status()
                    response = await resp.text()
                html = etree.HTML(response)
                title = html.xpath('''//h1[@class='wap_none']//text()''')[0]
                content = html.xpath('''//div[@id='chaptercontent']//text()''')
                return title, content
            except Exception as e:
                # Timeouts, HTTP errors and pages missing the expected
                # markup are all retried.
                print(f'采集第{num+1}次重试--->失败:{e}--->{url}')
    return None


def optimize_save_info(url_list, name=None):
    """Append the downloaded chapters to ``./xs/<name>.txt`` in list order.

    :param url_list: despite its name, the per-chapter download results; each
        item is a ``(title, content)`` pair whose ``content`` is an iterable
        of text lines, or None for a chapter that could not be downloaded
    :param name: base name of the output file; when omitted, the module-level
        ``key`` is used
    :return: None
    """
    import os

    if name is None:
        name = key  # module-level novel title
    # Create the output directory up front so open() cannot fail on it.
    os.makedirs('./xs', exist_ok=True)
    # The list is already in chapter order, so one pass with a running
    # 1-based id is enough; the file is opened once for the whole run.
    with open(f'./xs/{name}.txt', 'a', encoding='utf-8') as f:
        for num, info in enumerate(url_list, start=1):
            if info is None:
                # A failed download must not abort saving the other chapters.
                print(f'第{num}个章节采集失败,已跳过')
                continue
            title = info[0]
            content = info[1]
            f.write(title + '\n' * 2)
            for con in content:
                f.write(con + '\n')
            f.write('\n' + '=' * 50 + '\n' * 2)
            print(f'{title}--->保存成功')


def parse_main(url_list):
    """Download every chapter concurrently and pass the results to the saver.

    :param url_list: chapter URLs in reading order; None or an empty list
        means there is nothing to download
    :return: None
    """
    print(url_list)
    if not url_list:
        print('未获取到任何章节链接,跳过采集')
        return

    async def _download_all():
        # gather() returns results in the order of its arguments, so the
        # chapters stay in page order whichever request finishes first.
        return await asyncio.gather(*(parse_urls(u) for u in url_list))

    # asyncio.run() creates and closes its own event loop; calling
    # get_event_loop() with no running loop is deprecated.
    info_list = asyncio.run(_download_all())
    optimize_save_info(info_list)


if __name__ == '__main__':
    # Example search URL:
    # https://www.biqg.cc/s?q=%E6%96%97%E7%A0%B4%E8%8B%8D%E7%A9%B9
    import os

    start_time = time.time()
    # Module-level on purpose: optimize_save_info() reads ``key`` to name
    # the output file.
    key = '斗破苍穹'
    # Chapters are written under ./xs; make sure the directory exists.
    os.makedirs('./xs', exist_ok=True)
    key_url = get_key(key)
    if key_url:
        urls_list = get_urls(key_url)
        # get_urls() can come back with nothing when every attempt failed;
        # only start the downloads when there are chapter links.
        if urls_list:
            parse_main(urls_list)
    print(f'采集完毕,用时{time.time()-start_time}秒')

我们来看一下采集过程: