Scraping an Entire Novel with Python, at a Snail's Pace

Ever hit a paywall partway through a novel and been left wanting more?

So I found a pirated novel site, knocked out about a hundred lines of code, and started crawling at a snail's pace.

Why a snail's pace? Because it runs in a single thread with no async, I don't know regex or XPath, and I wrote it with the rather slow BeautifulSoup.

On the upside, the code is simple, easy to follow, and easy to tweak. Dear readers, make do with it as you like.
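(If you ever want to un-snail it: chapter pages are independent, so the per-chapter fetches parallelize naturally. Below is a minimal sketch using the standard library's ThreadPoolExecutor; fetch_chapter and download_parallel are hypothetical names, not part of the script that follows, and the selectors simply mirror the ones used below.)

# Minimal sketch (not part of the original script): parallel chapter fetching.
# fetch_chapter() is a hypothetical stand-in for the downhtml() logic below.
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup


def fetch_chapter(url):
    # Fetch one chapter page and return its text, mirroring downhtml() below.
    r = requests.get(url)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    return "\n\n".join(tag.get_text() for tag in soup.find_all(id='content'))


def download_parallel(chapter_urls, savepath, workers=8):
    # map() yields results in input order, so the chapters stay sorted.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        with open(savepath, 'w', encoding='utf-8') as f:
            for text in pool.map(fetch_chapter, chapter_urls):
                f.write(text + '\n\n')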

 

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File  : mybs4.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date  : 2019/7/16


from bs4 import BeautifulSoup as bt
import requests
from urllib.request import quote
import time
import os


def findnovel(novelname="修仙", homeurl="https://www.biqule.com/"):
    keyword = quote(novelname, encoding='gb2312')  # encode the search keyword
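    # Note (added): the site's search endpoint expects GB2312, so quote() must
    # percent-encode the GB2312 bytes of the keyword rather than the UTF-8 default.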
    url = f'{homeurl}modules/article/search.php?searchkey={keyword}'
    print(url)
    r = requests.get(url)  # hit the Biqule search URL
    r.encoding = r.apparent_encoding
    # print(r.status_code)  # response status; 200 generally means success
    html = r.text
    soup = bt(html, 'html.parser')
    # print(soup)  # the parsed page source
    chapters1 = soup.find_all("li")
    soup1 = bt(str(chapters1), 'html.parser')
    chapters2 = soup.find_all("span", class_='s2')
    soup2 = bt(str(chapters2), 'html.parser')
    arr1 = []  # final search results: [name, url, author] per book
    lis = []   # list of arr tuples
    info = {}  # final info dict
    for child in soup1.children:
        soup3 = bt(str(child), 'html.parser')
        chapters3 = soup3.find_all("span")
        arr = []  # text of every span tag inside this li
        for i in chapters3:
            if i.get_text()!="":
                arr.append(i.get_text())
        arr = tuple(arr)
        if arr != ():
            lis.append(arr)

    for i in lis:
        info[i[1]] = (i[3], i[0], i[1], i[2], i[4], i[5])  # key: novel name; value: novel info

    for child in soup2.children:
        if hasattr(child, 'href') and child.a is not None:  # crude Tag filter: bare strings have no .href attribute
            bookname = child.get_text()
            bookurl = child.a['href']  # pull the href value out of the <a> tag
            book = [bookname, bookurl, info[bookname][0]]
            arr1.append(book)

    return arr1

def download(
        homeurl="https://www.biqule.com/book_72715/",
        savename="重生之都市修仙.txt"):
    """
    笔趣乐专用小说下载器  https://www.biqule.com
    :param homeurl: 小说主页  如 https://www.biqule.com/book_72715/
    :param savepath: 保存路径(在同目录下downloads里面路径)
    :return:
    """
    if not os.path.exists("download"):
        os.mkdir("download")
    savepath = f"download/{savename}"
    r = requests.get(homeurl)
    r.encoding = r.apparent_encoding
    ret = r.text
    soup = bt(ret, 'html.parser')
    chapters = soup.find_all("dd")
    soup = bt(str(chapters), 'html.parser')
    arr = []  # chapter titles (used only to count the chapters)
    for child in soup.children:  # every child node under the <dd> tags
        if hasattr(child, 'href') and child.a is not None:
            arr.append(child.get_text())
    num = len(arr)
    print(f"本小说共有 {num} 章节")
    file = open(savepath, 'a', encoding='utf-8')
    downsouplist(homeurl, soup, file, num)


def downsouplist(url, soup, file, num):  # download every chapter linked in the soup object
    index = 1
    for child in soup.children:
        if hasattr(child, 'href') and child.a is not None:
            myurl = url + child.a['href']
            downhtml(myurl, file, index, num)
            index += 1
    file.close()


def downhtml(url, file, index, num):  # download a single chapter page
    r = requests.get(url)
    r.encoding = r.apparent_encoding
    ret = r.text
    soup = bt(ret, 'html.parser')
    contents = soup.find_all(id='content')
    for txtcode in contents:
        content = txtcode.get_text()
        file.write(content + '\n\n')
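    # (Optional, illustrative addition) pause after each chapter request to go easy on the server:
    # time.sleep(0.5)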
    print("已下载:%.3f%%" % float(index / num * 100))  # 爬取进度

def downtest(homeurl="https://www.biqule.com/book_72715/", savename="重生之都市修仙.txt"):  # test the downloader
    time1 = time.time()  # current time in seconds
    download(homeurl,savename)
    time2 = time.time()
    tt = time2 - time1
    print(f'Time elapsed: {tt} seconds')


def mydownload(novelname="魔皇", authorname="八月飞鹰"):
    ret = findnovel(novelname)
    print(ret)
    for i in ret:
        if i[2] == authorname:
            homeurl = i[1]
            savename = novelname+".txt"
            downtest(homeurl,savename)
            break

def readme():
    jiaocheng = """
    =========================== DaShen's BeautifulSoup tutorial ===============================
    soup = bt(ret, 'html.parser')   # build the BeautifulSoup object first; the key method is find_all
    contents = soup.find_all(id='content')
    find_all(name, attrs, text)    # match by tag name, by attributes, or by text

    Usage 1: li = soup.find_all('li')          # find every <li> tag, returned as a list
    Usage 2: li = soup.find_all(id='flask')    # match by id attribute value
    Usage 3: li = soup.find_all(class_='s2')   # match by class value; note the trailing underscore, since class is a Python keyword
    Usage 4: find_class = soup.find(attrs={'class': 's2'})  # pass a dict to the attrs parameter

    Combined: to find <span> tags whose class is s2:
    find_ret = soup.find_all("span", class_="s2")
    ==============================================================================
    """
    print(jiaocheng)
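

def readme_demo():
    # (Added) a hedged, runnable demo of the find_all patterns printed by readme();
    # it parses a small literal HTML snippet, so it works offline.
    html = '<li><span class="s2"><a href="/book_1/">Demo Novel</a></span></li>'
    soup = bt(html, 'html.parser')
    print(soup.find_all('li'))                 # Usage 1: by tag name
    print(soup.find_all(class_='s2'))          # Usage 3: by class value
    print(soup.find(attrs={'class': 's2'}))    # Usage 4: attrs dict
    print(soup.find_all("span", class_='s2'))  # combined: tag name + class
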
def findNovelList(novelname):
    ret = findnovel(novelname)
    for i in ret:
        print(f"小说名称:{i[0]}    作者:{i[2]}         主页:{i[1]}")

if __name__ == '__main__':
    readme()
    # downtest()
    # mydownload()
    # findNovelList("紫阳")
    downtest('https://www.biqule.com/book_47626/',"紫阳帝尊.txt")

 
