Python 小说爬虫:requests + pyquery + 多线程

Python 小说爬虫:requests + pyquery + 多线程

from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin

import requests
from pyquery import PyQuery as pq


def download(url, encoding="utf-8", timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address requested with ``requests.get``.
        encoding: Encoding assigned to the response before its text is read.
        timeout: Seconds to wait for the server; passed to ``requests.get``
            so that a stalled connection cannot block the caller forever.

    Returns:
        The decoded body, or ``""`` when the request fails or the server
        answers with an error status.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Best-effort download: report the failure and hand back no content.
        print(e)
        return ""
    if not response.ok:
        return ""
    response.encoding = encoding
    return response.text


def getContent(url, chapterName):
    """Download one chapter page and return its title followed by its text.

    The text comes from the element(s) matched by ``#content``. Each pair of
    consecutive newlines is replaced by a single newline, and every newline
    that is immediately followed by an ideographic full stop is removed
    together with that full stop. A completion message is printed after the
    page has been parsed.
    """
    page = pq(download(url))
    body = page("#content")
    print(chapterName + "下载完毕")
    text = str(body.text())
    text = text.replace("\n\n", "\n")
    text = text.replace("\n。", "")
    return chapterName + "\n" + text


def getNovel(chapters_list_url):
    """Download every chapter listed on an index page and return the novel.

    The result starts with the text of the ``h1`` inside ``#info`` and the
    text of the first ``p`` inside ``#info``, each on its own line, followed
    by every chapter (as returned by ``getContent``) in page order.

    Args:
        chapters_list_url: URL of the chapter index page. Chapter links are
            resolved relative to this URL.

    Returns:
        The assembled novel text.
    """
    soup = pq(download(chapters_list_url))
    info = soup("#info")
    header = info.find("h1").text() + "\n" + info.find("p").eq(0).text() + "\n"

    # NOTE(review): the first 12 links are skipped - presumably a "latest
    # chapters" section that repeats entries from the full list; confirm
    # against the page layout.
    links = list(soup("#list").items("a"))[12:]

    # The context manager shuts the pool down even if a chapter job raises.
    with ThreadPoolExecutor(max_workers=5) as executor:
        jobs = []
        for a in links:
            href = a.attr("href")
            if not href:
                # An anchor without a target cannot be downloaded.
                continue
            # Resolve the link exactly once; appending the href a second
            # time would produce a malformed chapter URL.
            chapter_url = urljoin(chapters_list_url, href)
            jobs.append(executor.submit(getContent, chapter_url, a.text()))
        # Collect in submission order so chapters stay in reading order.
        chapters = [job.result() + "\n" for job in jobs]

    return header + "".join(chapters)


def saveNovel(content, path):
    """Write *content* to the file at *path* as UTF-8 text.

    An existing file at *path* is overwritten.
    """
    # The ``with`` block flushes and closes the file on exit, so no explicit
    # flush()/close() calls are needed.
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)


if __name__ == '__main__':
    novel = getNovel("https://www.biquge.tw/509_509388/")
    # The first line of the assembled text is the novel title and is used as
    # the file name. partition() keeps the whole string when there is no
    # newline, and the fallback avoids a bare ".txt" name for an empty title.
    title = novel.partition("\n")[0] or "novel"
    saveNovel(novel, title + ".txt")

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值