Had some spare time, so I scraped a full copy of Journey to the West (西游记); sharing the script below.

import requests
import time
from lxml import etree
from fake_useragent import UserAgent

def get_html(url):
    # Send the request with a random User-Agent so it looks less like a bot
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    response = requests.get(url, headers=headers)
    # Let requests guess the encoding so the Chinese text decodes correctly
    response.encoding = response.apparent_encoding
    return response

def be_tree(url):
    # Fetch a page and parse it into an lxml element tree for XPath queries
    r = get_html(url)
    tree = etree.HTML(r.text)
    return tree

def get_mulu_lists(mulu_url):
    # Parse the table-of-contents page: book title, chapter titles, chapter links
    tree = be_tree(mulu_url)
    novel_name = tree.xpath('//h1/span[1]/b/text()')[0]
    cha_urls = tree.xpath('//ul/span/a/@href')
    titles = tree.xpath('//ul/span/a/text()')
    return novel_name, titles, cha_urls

def down_onechapter(novel_name, down_url):
    # Fetch one chapter page and append its paragraphs to the novel's text file
    tree = be_tree(down_url)
    datas = tree.xpath('//div[1]/div/p/text()')
    with open(f'./{novel_name}.txt', 'a', encoding='utf-8') as f:
        for data in datas:
            f.write(data)
        # Write two blank lines so chapters are visually separated in the file
        f.write('\n')
        f.write('\n')
    print('Chapter downloaded')

if __name__ == '__main__':
    start = time.time()
    # Table of contents for Journey to the West; replace the link to download another book
    url = 'https://so.gushiwen.cn/guwen/book_46653FD803893E4FBF8761BEF60CD7D9.aspx'
    base_url = url.split('/guwen')[0]
    novel_name, titles, cha_urls = get_mulu_lists(url)
    for title, cha_url in zip(titles, cha_urls):
        dow_url = base_url + cha_url
        print(title, dow_url)
        # Write the chapter title on its own line before the chapter text
        with open(f'./{novel_name}.txt', 'a', encoding='utf-8') as f:
            f.write(title)
            f.write('\n')
        down_onechapter(novel_name, dow_url)
    print('Full book downloaded')
    end = time.time()
    use_time = int(end - start)
    print(f'Download took {use_time} seconds')
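
To download a different book from gushiwen.cn, only the url variable in __main__ needs to change, as the comment in the code says. The sketch below is an optional hardening of get_html, not part of the original script: it adds a request timeout and a short pause so the loop over chapters doesn't hammer the site. Drop it in place of the original get_html and the rest of the script works unchanged.

import time

import requests
from fake_useragent import UserAgent

def get_html(url):
    # Variant of the get_html above with a timeout and a polite delay added
    # (an assumption on my part, not something the original script does)
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = response.apparent_encoding
    time.sleep(1)  # slow the chapter loop down a little between requests
    return response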

