python 爬虫：下载小说

最新推荐文章于 2024-06-24 18:45:00 发布

yinxinxu

最新推荐文章于 2024-06-24 18:45:00 发布

阅读量383

点赞数

分类专栏： python 文章标签： python 爬虫

本文链接：https://blog.csdn.net/xuyinxin/article/details/91344844

版权

python 专栏收录该内容

10 篇文章 0 订阅

订阅专栏

最近在水滴阅读上看英文原著《绿野仙踪》，于是写了个爬虫，从PP作文网（ppzuowen.com）下载中文版本。

下载到第12章的时候，好像是该章的HTML页面有问题，拿不到下一章的URL。解决办法是跳过第12章，把起始地址改为第13章的地址，就可以继续下载。

#coding=utf-8

import sys
import  io
# Re-wrap stdout so everything printed is encoded as UTF-8.
# NOTE(review): presumably needed so Chinese chapter titles print without
# encode errors on consoles whose default encoding is not UTF-8 -- confirm.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

from bs4 import BeautifulSoup
import urllib.request
import os


# Directory the downloaded chapter .txt files are written into.
base_dir = "D:/python/src/lvyexianzong/"

# Site root used to turn relative chapter links into absolute URLs.
base_url = "https://www.ppzuowen.com/"


def parseURL(url):
    """Download one chapter page, save its text, and return the next page URL.

    The page at *url* is fetched with a browser-like User-Agent, the chapter
    title and first paragraph are written to ``base_dir/<title>.txt``, and
    the pager at the bottom of the page is searched for the "下一" (next)
    link.

    Args:
        url: Absolute chapter URL, or a path relative to ``base_url``.

    Returns:
        The absolute URL of the next chapter, or ``None`` when the page has
        no usable "next" link (which ends the crawl).

    Raises:
        urllib.error.URLError: The page could not be fetched.
        OSError: The connection timed out or failed while reading.
    """
    # Local import keeps this block self-contained; urljoin resolves relative
    # links correctly instead of blindly concatenating strings (which yields
    # "//" for root-relative hrefs).
    from urllib.parse import urljoin

    url = urljoin(base_url, url)
    request = urllib.request.Request(url)
    # Pretend to be a regular browser.
    request.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36")
    # Close the connection deterministically and never hang forever on a
    # stalled server.
    with urllib.request.urlopen(request, timeout=30) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'lxml')

    # Saving is best-effort: a page whose body cannot be parsed is reported
    # and skipped, but the crawl still follows its "next" link.
    _save_chapter(soup, url)

    href = _find_next_href(soup)
    if href is None:
        return None
    # Links on a page are relative to that page, not to the site root.
    return urljoin(url, href)


def _safe_filename(title):
    """Return *title* with characters that are illegal in file names replaced by "_"."""
    illegal = str.maketrans({ch: "_" for ch in '\\/:*?"<>|\r\n\t'})
    return title.translate(illegal)


def _save_chapter(soup, url):
    """Write the chapter title and first paragraph found in *soup* to ``base_dir``."""
    body = soup.find('div', class_="articleBody articleContent1")
    heading = body.find('h2') if body is not None else None
    paragraph = body.find('p') if body is not None else None
    if heading is None or paragraph is None:
        print("parse" + url + " error")
        return

    # .string is None when the <h2> holds more than one child node.
    title = heading.string if heading.string is not None else "unknown"
    content = paragraph.get_text().replace("<br>", "").replace('"', "")

    try:
        with open(base_dir + _safe_filename(title) + ".txt", 'w', encoding='utf-8-sig') as f:
            f.write(" " * 10 + title + "\n" * 2)
            f.write(" " * 4 + content)
    except OSError:
        print("parse" + url + " error")
        return
    print("downloading " + title + "  finished")


def _find_next_href(soup):
    """Return the raw href of the pager's "下一" (next) link, or ``None`` if absent."""
    pager = soup.find('div', class_="www3")
    if pager is None:
        print("Parse error")
        return None

    for span in pager.find_all('span', class_="www4"):
        link = span.find('a')
        if link is None:
            continue
        label = link.string
        href = link.get('href')
        if label is not None and href is not None and "下一" in label:
            return href
    return None
        
    

def main(start_url="https://www.ppzuowen.com/book/lvyexianzong/9419.html"):
    """Crawl chapter pages one after another, starting at *start_url*.

    Each page is handed to ``parseURL``, which saves the chapter and returns
    the link to the following one. The crawl stops when ``parseURL`` returns
    ``None`` or when a link comes back that was already followed.

    Args:
        start_url: URL of the first chapter to download. Override it to
            resume from a later chapter, e.g. after skipping a broken page.
    """
    # Remember every URL already followed so a "next" link that points back
    # to an earlier page cannot trap the crawler in an endless loop.
    visited = set()
    url = start_url
    while url is not None and url not in visited:
        visited.add(url)
        url = parseURL(url)


if __name__ == "__main__":
    # makedirs also creates missing parent directories, and exist_ok avoids
    # the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(base_dir, exist_ok=True)
    main()
    print("Exit!!")

结果：