Python Novel Crawler

I've been learning Python lately and wrote a small crawler along the way. Without further ado, here is the code. (It surely has rough edges; feedback is welcome!)
It scrapes a novel from the 56shuku (56书库) site.

# -*- coding:utf-8 -*-
"""
Scrape the novel 《牧神记》
Index page: http://www.56shuku.org/files/article/html/138/138911/
"""
import requests
from bs4 import BeautifulSoup
import os
import threading
import random
import time

url_1 = []  # holds the URL of every chapter

# Collect the URL of every chapter from the index page
def get_url(URL):
    res = requests.get(url=URL)
    html = res.content.decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    # each chapter link is an <a> tag inside a <div class="dccss">
    for div in soup.find_all("div", class_="dccss"):
        for a in div.find_all("a"):
            url_1.append(URL + a["href"])  # links are relative, so prepend the index URL
    return url_1
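# A quick sanity check of get_url (hypothetical output; it assumes the site's
# chapter links are relative file names, which the relative-href handling above implies):
#   get_url("http://www.56shuku.org/files/article/html/138/138911/")
#   print(url_1[0])  # e.g. "http://www.56shuku.org/files/article/html/138/138911/<chapter>.html"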

# Fetch one chapter and append its text to a local file
def get_text(url, path="G:/小说/剑来_副本.txt"):
    agent_list = ['Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
                  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
                  'Opera/8.0 (Windows NT 5.1; U; en)',
                  'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
                  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
                  'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
                  'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
                  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
                  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
                  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                  'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
                  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
                  'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
                  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
                  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
                  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
                  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
                  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
                  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)'
                  ]
    headers = {
        "User-Agent": random.choice(agent_list)  # rotate the user agent on every request
    }
    response = requests.get(url=url, headers=headers)
    html = response.content.decode("utf-8")
    soup = BeautifulSoup(html, "lxml")
    name = soup.find("div", align="center")
    name1 = name.h1.string  # chapter title
    text = soup.find_all("div", id="content")
    # replace each run of four non-breaking spaces with a paragraph break
    text = text[0].text.replace("\xa0" * 4, "\n\n")
    with open(path, "a", encoding="utf-8") as f:
        f.write(name1)
        f.write(" " * 2 + text + "\n\n\n")

# Create the output file, download every chapter, and print progress and elapsed time
def load_text(path):
    if os.path.exists(path):
        os.remove(path)  # chapters are appended, so start from a clean file

    get_url(URL)  # URL is the index page defined in the __main__ block
    num = len(url_1)
    start_time = time.time()
    for m, url in enumerate(url_1, start=1):
        print("%d chapters in total, downloading chapter %d" % (num, m))
        get_text(url, path)
    run_time = time.time() - start_time
    print("Download finished!")
    print("This novel took %d seconds to download!" % run_time)


if __name__ == '__main__':
    URL = "http://www.56shuku.org/files/article/html/138/138911/"
    path = r"F:\workspace\小说\牧神记.txt"
    # run the download in a worker thread; with a single thread this is
    # equivalent to calling load_text(path) directly
    thread1 = threading.Thread(target=load_text, args=(path,))
    thread1.start()
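
One thing the script never does, even though it imports random and time, is pause between requests, so it fires off every chapter request back to back. Below is a minimal sketch of a politer fetch helper with a random delay and simple retries; fetch_with_retry, the retries count, and the delay range are hypothetical names of my own, not part of the original script:

import random
import time

import requests

def fetch_with_retry(url, headers=None, retries=3, delay=(1.0, 3.0)):
    # Sketch only: sleep a random interval before each attempt, retry on failure.
    for attempt in range(1, retries + 1):
        time.sleep(random.uniform(*delay))  # throttle so we don't hammer the server
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # treat HTTP errors (e.g. 503) as failures
            return response
        except requests.RequestException as e:
            print("Attempt %d/%d for %s failed: %s" % (attempt, retries, url, e))
    raise RuntimeError("giving up on %s after %d attempts" % (url, retries))

With this in place, the requests.get call inside get_text would simply become response = fetch_with_retry(url, headers=headers).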

A screenshot of the program running is shown below:
[screenshot of the download progress output]
