小说下载脚本

================================

工具准备:

================================
下载与 Chrome 浏览器版本一致的 chromedriver, chromedriver 国内下载镜像:
https://npm.taobao.org/mirrors/chromedriver
将 chromedriver.exe 复制到 python 的scripts目录中, 比如 C:\Anaconda3\Scripts\
并将C:\Anaconda3\Scripts\加到Windows 环境变量PATH 中.  


================================
安装 selenium python 包

================================

pip install selenium

本文共有好多个下载脚本, 是一个不断完善的过程,  所以, 最后一个下载脚本是最通用, 最完美的.

================================
根据章节序号推算单章url地址, 然后下载

================================

from selenium import webdriver
from selenium.webdriver.common.by import By

# Download a novel whose chapter pages are addressed as <home_url><page_id>.html.
home_url = "https://www.jingcaiyuedu6.com/novel/CW8MY3/"
chapter_start = 1
chapter_end = 39
start_page_id = 0  # offset added to the chapter number to get the page id in the URL

# Collect the pieces in a list and join once at the end instead of
# re-copying an ever-growing string on every chapter.
text_parts = ["小说:穿越种田之将门妻", "\n\n\n"]

web = webdriver.Chrome()
try:
    for i in range(chapter_start, chapter_end + 1):
        page_id = i + start_page_id
        url = home_url + str(page_id) + ".html"
        text_parts.append("\n\n\n======================\n第" + str(i) + "章\n")
        web.get(url)
        # The chapter body is looked up by its id, i.e. <div id="content">.
        # find_element(By.ID, ...) replaces find_element_by_id, which no
        # longer exists in Selenium 4.3+.
        content_tag = web.find_element(By.ID, "content")
        text_parts.append(content_tag.text)
finally:
    # quit() ends the whole WebDriver session; close() would only close the
    # current window, and was skipped entirely when a page failed to load.
    web.quit()

full_text = "".join(text_parts)
print(full_text)

================================

从列表页提取单章url, 然后下载单章文本

================================

#========================================
# 方法1: 数字转中文, 有缺陷,比如: 10将转成一零
#========================================
def num_to_char(num):
    """Spell out every decimal digit of ``num`` as a Chinese numeral.

    The conversion is purely digit-by-digit and adds no positional units,
    so 10 becomes "一零", not "十". A character that is not one of the
    digits 0-9 raises ``KeyError``.
    """
    digit_names = dict(zip("0123456789", u"零一二三四五六七八九"))
    return "".join(digit_names[digit] for digit in str(num))


#========================================
# 方法2: 数字转中文, 比较完美
#========================================
# -------------------------------------------------------------------------------
# Name:         num2chinese
# Author:       yunhgu
# Date:         2021/8/24 14:51
# Description:
# -------------------------------------------------------------------------------

# Chinese digit for each value 0-9.
_MAPPING = (u'零', u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九',)
# Unit character for the ones, tens, hundreds and thousands place.
_P0 = (u'', u'十', u'百', u'千',)
_S4, _S8, _S16 = 10 ** 4, 10 ** 8, 10 ** 16
# Inclusive bounds of the supported range.
_MIN, _MAX = 0, 9999999999999999


class NotIntegerError(Exception):
    """Raised when the value to convert is not an ``int``."""


class OutOfRangeError(Exception):
    """Raised when the value to convert lies outside ``[_MIN, _MAX]``."""


class Num2Chinese:
    """Convert integers in ``[_MIN, _MAX]`` to Chinese numerals."""

    def convert(self, number: int):
        """
        Convert ``number`` to its Chinese-numeral string.

        :param number: integer in the inclusive range ``[_MIN, _MAX]``
        :return: chinese number, e.g. ``110`` -> ``一百一十``
        :raises NotIntegerError: if ``number`` is not an ``int`` (``bool`` is rejected as well)
        :raises OutOfRangeError: if ``number`` is outside ``[_MIN, _MAX]``
        """
        return self._to_chinese(number)

    def _to_chinese(self, num):
        """Validate ``num`` and dispatch to the converter for its magnitude."""
        # Check the type itself. The former str(num).isdigit() test let digit
        # strings through to the integer comparisons below (TypeError) and
        # reported negative ints as "not an integer".
        if isinstance(num, bool) or not isinstance(num, int):
            raise NotIntegerError(u'%s is not a integer.' % (num,))
        if num < _MIN or num > _MAX:
            # Both bounds are inclusive, hence the closed bracket.
            raise OutOfRangeError(u'%d out of range[%d, %d]' % (num, _MIN, _MAX))
        if num < _S4:
            return self._to_chinese4(num)
        elif num < _S8:
            return self._to_chinese8(num)
        else:
            return self._to_chinese16(num)

    @staticmethod
    def _to_chinese4(num):
        """Convert ``0 <= num < 10**4`` (one four-digit group).

        Only a leading "一十" of the group is shortened to "十" (10 -> 十,
        15 -> 十五); tens after a higher digit keep the "一"
        (110 -> 一百一十, 1010 -> 一千零一十).
        """
        assert (0 <= num < _S4)
        if num < 10:
            return _MAPPING[num]

        parts = []
        pending_zero = False  # a zero digit was seen after a non-zero one
        for power in range(len(_P0) - 1, -1, -1):
            digit, num = divmod(num, 10 ** power)
            if digit == 0:
                pending_zero = bool(parts)
                continue
            if pending_zero:
                # Any run of inner zeros is written as a single 零.
                parts.append(u'零')
                pending_zero = False
            parts.append(_MAPPING[digit] + _P0[power])

        result = u''.join(parts)
        if result.startswith(u'一十'):
            result = result[1:]
        return result

    def _to_chinese8(self, num):
        """Convert ``0 <= num < 10**8``, using 万 for the upper four digits."""
        assert (num < _S8)
        to4 = self._to_chinese4
        if num < _S4:
            return to4(num)
        high, low = divmod(num, _S4)
        if low == 0:
            return to4(high) + u'万'
        # A lower group with no thousands digit is joined with an extra 零.
        joiner = u'万零' if low < _S4 // 10 else u'万'
        return to4(high) + joiner + to4(low)

    def _to_chinese16(self, num):
        """Convert ``10**8 <= num < 10**16``, using 亿 for the upper eight digits."""
        assert (num < _S16)
        to8 = self._to_chinese8
        high, low = divmod(num, _S8)
        if low == 0:
            return to8(high) + u'亿'
        # A lower part with no ten-millions digit is joined with an extra 零.
        joiner = u'亿零' if low < _S8 // 10 else u'亿'
        return to8(high) + joiner + to8(low)

#========================================
# 从列表页提取单章url, 然后下载单章文本
#========================================
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

# Look up every chapter's URL on the list page by its link text, then
# download the chapter page itself.
num2chinese = Num2Chinese()
list_url = "https://www.baihexs.com/0/54/"
chapter_start = 1
chapter_end = 306

full_text = "小说:掌家小娘子" + "\n\n\n"
print(full_text)

web = webdriver.Chrome()
try:
    for i in range(chapter_start, chapter_end + 1):
        chinese_chapter_id = num2chinese.convert(i)  # Chinese numeral
        # chinese_chapter_id = str(i)  # use this instead for Arabic digits
        chinese_chapter_name = "第" + chinese_chapter_id + "章"
        # Expand a "百十" abbreviation to "百一十". The replace is done
        # unconditionally: the old `if name.find("百十"):` guard was
        # misleading because find() returns -1 (truthy) when nothing matches.
        chinese_chapter_name = chinese_chapter_name.replace("百十", "百一十")

        # Go back to the list page so the chapter link can be located.
        web.get(list_url)
        try:
            url = web.find_element(By.PARTIAL_LINK_TEXT, chinese_chapter_name).get_attribute("href")
        except WebDriverException:
            # Best effort: a chapter whose link cannot be found is skipped.
            # Catching only Selenium errors keeps Ctrl+C working.
            url = ""

        if url:
            web.get(url)
            content_tag = web.find_element(By.XPATH, '''//*[@id="center"]''')
            content = content_tag.text
        else:
            content = "不提供下载"

        chapter_text = "\n\n\n======================\n第" + str(i) + "章\n" + content
        print(chapter_text)
        full_text = full_text + chapter_text
finally:
    # quit() ends the whole WebDriver session even when a page fails;
    # close() would only close the current window.
    web.quit()

================================
每章支持多个分页

作了性能优化

自动输出到文件

增加番外篇下载

代码逻辑优化

================================

#========================================
# 方法1: 数字转中文, 有缺陷,比如: 10将转成一零
#========================================
def num_to_char(num):
    """Spell out every decimal digit of ``num`` as a Chinese numeral.

    The conversion is purely digit-by-digit and adds no positional units,
    so 10 becomes "一零", not "十". A character that is not one of the
    digits 0-9 raises ``KeyError``.
    """
    digit_names = dict(zip("0123456789", u"零一二三四五六七八九"))
    return "".join(digit_names[digit] for digit in str(num))


#========================================
# 方法2: 数字转中文, 比较完美
#========================================
# -------------------------------------------------------------------------------
# Name:         num2chinese
# Author:       yunhgu
# Date:         2021/8/24 14:51
# Description:
# -------------------------------------------------------------------------------

# Chinese digit for each value 0-9.
_MAPPING = (u'零', u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九',)
# Unit character for the ones, tens, hundreds and thousands place.
_P0 = (u'', u'十', u'百', u'千',)
_S4, _S8, _S16 = 10 ** 4, 10 ** 8, 10 ** 16
# Inclusive bounds of the supported range.
_MIN, _MAX = 0, 9999999999999999


class NotIntegerError(Exception):
    """Raised when the value to convert is not an ``int``."""


class OutOfRangeError(Exception):
    """Raised when the value to convert lies outside ``[_MIN, _MAX]``."""


class Num2Chinese:
    """Convert integers in ``[_MIN, _MAX]`` to Chinese numerals."""

    def convert(self, number: int):
        """
        Convert ``number`` to its Chinese-numeral string.

        :param number: integer in the inclusive range ``[_MIN, _MAX]``
        :return: chinese number, e.g. ``110`` -> ``一百一十``
        :raises NotIntegerError: if ``number`` is not an ``int`` (``bool`` is rejected as well)
        :raises OutOfRangeError: if ``number`` is outside ``[_MIN, _MAX]``
        """
        return self._to_chinese(number)

    def _to_chinese(self, num):
        """Validate ``num`` and dispatch to the converter for its magnitude."""
        # Check the type itself. The former str(num).isdigit() test let digit
        # strings through to the integer comparisons below (TypeError) and
        # reported negative ints as "not an integer".
        if isinstance(num, bool) or not isinstance(num, int):
            raise NotIntegerError(u'%s is not a integer.' % (num,))
        if num < _MIN or num > _MAX:
            # Both bounds are inclusive, hence the closed bracket.
            raise OutOfRangeError(u'%d out of range[%d, %d]' % (num, _MIN, _MAX))
        if num < _S4:
            return self._to_chinese4(num)
        elif num < _S8:
            return self._to_chinese8(num)
        else:
            return self._to_chinese16(num)

    @staticmethod
    def _to_chinese4(num):
        """Convert ``0 <= num < 10**4`` (one four-digit group).

        Only a leading "一十" of the group is shortened to "十" (10 -> 十,
        15 -> 十五); tens after a higher digit keep the "一"
        (110 -> 一百一十, 1010 -> 一千零一十).
        """
        assert (0 <= num < _S4)
        if num < 10:
            return _MAPPING[num]

        parts = []
        pending_zero = False  # a zero digit was seen after a non-zero one
        for power in range(len(_P0) - 1, -1, -1):
            digit, num = divmod(num, 10 ** power)
            if digit == 0:
                pending_zero = bool(parts)
                continue
            if pending_zero:
                # Any run of inner zeros is written as a single 零.
                parts.append(u'零')
                pending_zero = False
            parts.append(_MAPPING[digit] + _P0[power])

        result = u''.join(parts)
        if result.startswith(u'一十'):
            result = result[1:]
        return result

    def _to_chinese8(self, num):
        """Convert ``0 <= num < 10**8``, using 万 for the upper four digits."""
        assert (num < _S8)
        to4 = self._to_chinese4
        if num < _S4:
            return to4(num)
        high, low = divmod(num, _S4)
        if low == 0:
            return to4(high) + u'万'
        # A lower group with no thousands digit is joined with an extra 零.
        joiner = u'万零' if low < _S4 // 10 else u'万'
        return to4(high) + joiner + to4(low)

    def _to_chinese16(self, num):
        """Convert ``10**8 <= num < 10**16``, using 亿 for the upper eight digits."""
        assert (num < _S16)
        to8 = self._to_chinese8
        high, low = divmod(num, _S8)
        if low == 0:
            return to8(high) + u'亿'
        # A lower part with no ten-millions digit is joined with an extra 零.
        joiner = u'亿零' if low < _S8 // 10 else u'亿'
        return to8(high) + joiner + to8(low)


def get_sub_page_url(chapter_url, sub_page_count, first_sub_page_url_index, sub_page_id):
    """
    Build the URL of one sub-page of a chapter.

    :param chapter_url: URL of the chapter itself
    :param sub_page_count: number of sub-pages per chapter; 0 means the chapter URL is used as is
    :param first_sub_page_url_index: lowest sub-page id that gets a "_<id>" suffix in its URL
    :param sub_page_id: id of the requested sub-page
    :return: ``chapter_url`` unchanged when ``sub_page_count`` is 0 or ``sub_page_id``
        is below ``first_sub_page_url_index``; otherwise ``chapter_url`` with every
        ".html" replaced by "_<sub_page_id>.html"
    """
    uses_plain_url = sub_page_count == 0 or sub_page_id < first_sub_page_url_index
    if uses_plain_url:
        return chapter_url

    # e.g. https://www.mht99.com/98886/82000964.html
    #   -> https://www.mht99.com/98886/82000964_1.html
    suffix = "_" + str(sub_page_id) + ".html"
    return chapter_url.replace(".html", suffix)


def output(text,file_name):
    """
    Echo ``text`` to the console and append it to a file.

    :param text: string to emit; a newline is added after it in the file
    :param file_name: path of the UTF-8 text file to append to (created if missing)
    :return: None
    """
    print(text)
    with open(file_name, mode='a+', encoding='utf-8') as log_file:
        log_file.write(text + "\n")

def download_chapter(chapter_url, file_name, chapter_webdriver):
    """
    Download the text of one chapter, sub-page by sub-page.

    Reads the module-level settings ``sub_page_count``,
    ``first_sub_page_url_index`` and ``content_tag_xpath``.

    :param chapter_url: URL of the chapter; a falsy value means no link was found
    :param file_name: log file handed to ``output`` for failure messages
    :param chapter_webdriver: Selenium WebDriver used to load the pages
    :return: "不提供下载" when ``chapter_url`` is falsy; otherwise the text of every
        sub-page that could be read, each preceded by a newline
    """
    if not chapter_url:
        return "不提供下载"

    # Collect the pages and join once instead of re-copying a growing string.
    page_texts = []
    for j in range(sub_page_count):
        sub_page_id = j + first_sub_page_url_index - 1
        sub_page_url = get_sub_page_url(chapter_url, sub_page_count, first_sub_page_url_index, sub_page_id)

        try:
            try:
                chapter_webdriver.get(sub_page_url)
            except (WebDriverException, TimeoutException):
                time.sleep(60)  # wait a minute, then try the URL once more
                chapter_webdriver.get(sub_page_url)
            # find_element(By.XPATH, ...) replaces find_element_by_xpath,
            # which no longer exists in Selenium 4.3+.
            content_tag = chapter_webdriver.find_element(By.XPATH, content_tag_xpath)
            page_texts.append("\n" + content_tag.text)
        except Exception:
            # Best effort: log the failed page and carry on with the next one.
            # `except Exception` (not a bare except) lets Ctrl+C through.
            output("####第" + str(sub_page_id) + "页:" + "下载失败", file_name)
    return "".join(page_texts)

# ========================================
# 从列表页提取单章url, 然后下载单章文本
# ========================================
import time
from datetime import datetime

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
from selenium.webdriver.common.by import By

# config
file_path = r"D:\\"
story_name = "农门小娇妻_作者寒月西风"
list_url = "https://dangdu.dangdang.com/catalog/101186.shtml"
chinese_chapter_id_flag = True  # True: Chinese numerals in the link text; False: zero-padded Arabic digits
sub_page_count = 1  # number of pages per chapter
first_sub_page_url_index = 1  # index used in the first suffixed sub-page URL, e.g. https://www.mht99.com/98886/82000964_1.html
chapter_start = 2
chapter_end = 100
content_tag_xpath = '''/html/body/div[2]/div/div[1]/ul/li'''
# Chapter links are found by partial text match, so a search for "001" also
# matches "1001" and two URLs come back. From this chapter number onwards the
# second match is used.
choose_2nd_url_from_chapter_id = 1000
# URLs of extra (side-story) chapters
appendix_chapter_urls = []

# init
num2chinese = Num2Chinese()
file_name = file_path + story_name + ".txt"
full_text = story_name
output(full_text, file_name)
start_time = datetime.now()
output("下载开始时刻:" + start_time.strftime("%c"), file_name)
web = webdriver.Chrome()   # stays on the list page, used to look up chapter links
web2 = webdriver.Chrome()  # loads the chapter pages
try:
    web2.implicitly_wait(60)  # implicit wait of 60 seconds, see https://www.cnblogs.com/mengyu/p/6972968.html

    # Open the list page once; every chapter URL is read from it.
    web.get(list_url)

    for i in range(chapter_start, chapter_end + 1):
        # Build the link text to search for, e.g. "第十二章" or "第012章".
        if chinese_chapter_id_flag:
            chinese_chapter_id = num2chinese.convert(i)
        else:
            chinese_chapter_id = str(i).zfill(3)  # pad to three digits
        chinese_chapter_name = "第" + chinese_chapter_id + "章"
        # Expand a "百十" abbreviation to "百一十". The replace is done
        # unconditionally: the old `if name.find("百十"):` guard was
        # misleading because find() returns -1 (truthy) when nothing matches.
        chinese_chapter_name = chinese_chapter_name.replace("百十", "百一十")

        # Look up the chapter URL on the list page.
        try:
            # find_elements(By.PARTIAL_LINK_TEXT, ...) replaces
            # find_elements_by_partial_link_text, which no longer exists in
            # Selenium 4.3+.
            chapter_urls = [
                url_tag.get_attribute("href")
                for url_tag in web.find_elements(By.PARTIAL_LINK_TEXT, chinese_chapter_name)
            ]
        except Exception as e:
            print(e)
            chapter_urls = []

        if not chapter_urls:
            chapter_url = ""
        elif i < choose_2nd_url_from_chapter_id or len(chapter_urls) == 1:
            chapter_url = chapter_urls[0]
        else:
            chapter_url = chapter_urls[1]

        # Download the chapter.
        chapter_content = download_chapter(chapter_url, file_name, web2)

        # Write the chapter out.
        chapter_title_text = "\n\n\n======================\n" + chinese_chapter_name + "\n"
        chapter_full_text = chapter_title_text + chapter_content
        output(chapter_full_text, file_name)
        full_text = full_text + chapter_full_text

    # Download the extra (side-story) chapters.
    for i, chapter_url in enumerate(appendix_chapter_urls, start=1):
        chinese_chapter_name = "番外:" + str(i)
        print(chinese_chapter_name)

        chapter_content = download_chapter(chapter_url, file_name, web2)

        chapter_title_text = "\n\n\n======================\n" + chinese_chapter_name + "\n"
        chapter_full_text = chapter_title_text + chapter_content
        output(chapter_full_text, file_name)
        full_text = full_text + chapter_full_text

    # print(full_text)
    output("\n\n\n======================\n", file_name)
    end_time = datetime.now()
    output("下载结束时刻:" + end_time.strftime("%c"), file_name)
    total_seconds = (end_time - start_time).total_seconds()
    output("下载耗时:" + str(total_seconds) + "秒", file_name)
finally:
    # quit() ends each WebDriver session even when the download aborts;
    # close() would only close the current window.
    try:
        web.quit()
    finally:
        web2.quit()

================================
selenium  的更多信息

================================

selenium 不仅支持Python, 还支持Java/C#

https://www.selenium.dev/documentation/zh-cn/webdriver/driver_requirements/
https://www.selenium.dev/documentation/zh-cn/selenium_installation/installing_webdriver_binaries/

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值