import urllib.request
import re
from bs4 import BeautifulSoup as bs
def urlopen(url):
    """Fetch *url* and return the raw response body as bytes.

    A browser-like User-Agent header is set because the target site
    rejects the default urllib User-Agent.
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
    # Use a context manager so the HTTP response is always closed,
    # even if read() raises (the original leaked the connection).
    with urllib.request.urlopen(req) as resp:
        return resp.read()
def list1(url):
    """Return the absolute chapter URLs listed on a novel's index page.

    The chapter links all live inside the <div id="list"> element; each
    relative href is prefixed with the site root to make it absolute.
    """
    soup = bs(urlopen(url), 'lxml')
    # find() returns the first matching element directly, instead of the
    # original .div(id=...)[0] list-then-index dance. Local names no
    # longer shadow this function's own name.
    chapter_div = soup.find('div', id="list")
    # Build the absolute URL list in one pass over the anchor tags.
    return ['http://www.biquge.com.tw' + a.attrs['href']
            for a in chapter_div.find_all('a')]
def xia(url):
    """Download every chapter of the novel at *url* into '<title>.txt'.

    The output filename is the novel's <h1> title plus '.txt'. Each
    chapter page's <h1> (chapter name) and <div id="content"> text are
    appended to the file; progress is printed per chapter.
    """
    urllist = list1(url)
    index = bs(urlopen(url), 'lxml')
    # Filename is the novel title taken from the index page's <h1>.
    filename = index.h1.string + '.txt'
    # Open the file once (not once per chapter) and force UTF-8 so the
    # script doesn't crash on platforms whose default codec can't
    # encode the Chinese text.
    with open(filename, 'a', encoding='utf-8') as f:
        for chapter_url in urllist:
            page = bs(urlopen(chapter_url), 'lxml')
            h1 = page.h1.string          # chapter title
            content = page.div(id="content")[0].text
            # Strip non-breaking spaces; a plain str.replace is enough —
            # no regex needed for a single literal character.
            content = content.replace('\xa0', '')
            f.write(h1)       # chapter title
            f.write(content)  # chapter body
            print('已经下载' + h1)
# Guard the script entry point so importing this module does not
# immediately start a full download.
if __name__ == "__main__":
    url = 'http://www.biquge.com.tw/18_18820/'
    xia(url)
# 笔趣阁小说 python3爬虫实例  (Biquge novel — Python 3 crawler example)
# 最新推荐文章于 2024-08-14 08:38:25 发布  (blog-page residue; commented out so the file parses)