Scraping the novel 《长生从红楼开始》

The script below pulls the chapter list from the novel's index page on bqka.cc, then downloads every chapter concurrently with a thread pool, writing each chapter to its own text file.

import os
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor


def down_text(url):
    # headers is defined in the __main__ block below; as a module-level name
    # it is visible here when the script is run directly
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find('h1', class_='wap_none')
    content = soup.find('div', id='chaptercontent')
    if title and content:
        title = title.get_text()
        content = content.get_text()

        # write each chapter to its own file; utf-8 avoids encoding errors on Windows
        with open(f'txt/{title}.txt', 'w', encoding='utf-8') as f:
            f.write(content)
        print(f'{title} downloaded')


if __name__ == '__main__':
    start_time = datetime.now().timestamp()
    url = 'https://www.bqka.cc/book/159995/'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
    }
    os.makedirs('txt', exist_ok=True)  # make sure the output directory exists
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')
    # the chapter list lives in the div with class 'listmain'
    items = soup.find('div', class_='listmain').find_all('a')

    urls = []
    for item in items:
        url = item['href']
        if url != 'javascript:dd_show()':  # skip the link that only runs JavaScript instead of pointing at a chapter page
            url = 'https://www.bqka.cc' + url
            urls.append(url)

    # for url in urls:  # sequential version: took about 284 seconds
    #     down_text(url)

    with ThreadPoolExecutor(max_workers=50) as exe:  # threaded version: about 7 seconds
        for url in urls:
            exe.submit(down_text, url)
    end_time = datetime.now().timestamp()
    print(f'Total time: {end_time - start_time:.1f} seconds')
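One caveat with the fire-and-forget submit() calls above: any exception raised inside down_text (a timeout, a missing tag, a bad filename) is swallowed silently. Below is a minimal sketch, not part of the original script, of how the same loop could keep the returned futures and surface failures with as_completed; the error-reporting style is an assumption.

from concurrent.futures import ThreadPoolExecutor, as_completed

# assumed variant of the main loop: keep each future so errors are reported
with ThreadPoolExecutor(max_workers=50) as exe:
    futures = {exe.submit(down_text, url): url for url in urls}
    for future in as_completed(futures):
        try:
            future.result()  # re-raises any exception raised inside down_text
        except Exception as err:
            print(f'failed to download {futures[future]}: {err}')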
