Example: A Crawler for Downloading Novels from 17k.com

VIP chapters cannot be downloaded; all other chapters work fine.

import urllib.request
import re
from bs4 import BeautifulSoup as bs

def urlopen(url):
    # Fetch a page with a browser User-Agent header so the site
    # serves the normal page instead of blocking the crawler.
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
    html = urllib.request.urlopen(req)
    html = html.read()
    return html
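The same fetch step can also be written with the third-party requests package. This is only an optional sketch under the assumption that requests is installed; the original script uses nothing but the standard-library urllib:

# Optional alternative to urlopen() using requests (an assumption;
# the original script only uses the standard-library urllib).
import requests

def urlopen_requests(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/63.0.3239.132 Safari/537.36",
    }
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()  # raise on HTTP errors instead of parsing an error page
    return resp.content      # bytes, same return type as the urllib version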

def listurl(url):
    # The book's info page links to the chapter index via its first <dt><a>.
    html = urlopen(url)
    html = bs(html, 'lxml')
    url = html.dt.a.attrs['href']

    # Fetch the chapter index and keep only the slice between the
    # "正文" (main text) marker and the bottom ad banner, so the regex
    # below only sees chapter links.
    html = urlopen(url)
    html = html.decode('utf-8')
    ff = html.find('正文')
    aa = html.find('BAIDU_banner_bottom')
    html = html[ff:aa]

    # Collect every relative chapter URL and make it absolute.
    list1 = []
    lis = re.findall(r'/.*?/.*?/.*?\.html', html)
    for i in lis:
        i = 'http://www.17k.com' + i
        list1.append(i)
    return list1
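A quick way to sanity-check listurl() before downloading anything is to print a few of the URLs it returns. This snippet uses the same example book URL as the end of the post and needs network access:

# Sanity check: list the chapter URLs without downloading any chapters.
chapters = listurl('http://www.17k.com/book/2849619.html')
print(len(chapters), 'chapter links found')
for u in chapters[:3]:
    print(u)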


def xia(url):
    list1 = listurl(url)

    # The <h1><a> text on the info page is the novel's title;
    # use it as the output filename.
    na = urlopen(url)
    na = bs(na, 'lxml')
    na = na.h1.a.string
    na = na + '.txt'

    for i in list1:
        html = urlopen(i)
        html = bs(html, 'lxml')

        # name is the chapter title
        name = html.h1.string
        name = name.strip()

        # content is the chapter body text
        content = html.find_all('div', class_="p")
        content = content[0].text

        # Open with an explicit UTF-8 encoding so Chinese text also writes
        # correctly on Windows; VIP chapters cannot be downloaded this way.
        with open(na, 'a', encoding='utf-8') as f:
            f.write(name)
            f.write(content)
            print('Downloaded ' + name)


url = 'http://www.17k.com/book/2849619.html'
# Just put the link to the novel here

xia(url)
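If 17k.com starts refusing rapid-fire requests, pausing between chapters usually helps. A minimal sketch, assuming a fixed one-second delay is acceptable (the wrapper name polite_urlopen is made up for this example):

import time

def polite_urlopen(url, delay=1.0):
    # Pause before each request so the crawl stays gentle on the server;
    # call this instead of urlopen() inside the loop in xia().
    time.sleep(delay)
    return urlopen(url)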
