Example: A Crawler for Downloading Novels from 17k.com

VIP chapters cannot be downloaded; all other chapters work fine.

import urllib.request
import re
from bs4 import BeautifulSoup as bs

def urlopen(url):
    # Fetch a page with a browser User-Agent header so the site
    # serves the normal page instead of blocking the crawler.
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
    html = urllib.request.urlopen(req)
    html = html.read()
    return html
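The same fetch step can also be written with the third-party requests package. This is only an optional sketch under the assumption that requests is installed; the original script uses nothing but the standard-library urllib:

# Optional alternative to urlopen() using requests (an assumption;
# the original script only uses the standard-library urllib).
import requests

def urlopen_requests(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/63.0.3239.132 Safari/537.36",
    }
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()  # raise on HTTP errors instead of parsing an error page
    return resp.content      # bytes, same return type as the urllib version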

def listurl(url):
    # The book's info page links to the chapter index via its first <dt><a>.
    html = urlopen(url)
    html = bs(html, 'lxml')
    url = html.dt.a.attrs['href']

    # Fetch the chapter index and keep only the slice between the
    # "正文" (main text) marker and the bottom ad banner, so the regex
    # below only sees chapter links.
    html = urlopen(url)
    html = html.decode('utf-8')
    ff = html.find('正文')
    aa = html.find('BAIDU_banner_bottom')
    html = html[ff:aa]

    # Collect every relative chapter URL and make it absolute.
    list1 = []
    lis = re.findall(r'/.*?/.*?/.*?\.html', html)
    for i in lis:
        i = 'http://www.17k.com' + i
        list1.append(i)
    return list1
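A quick way to sanity-check listurl() before downloading anything is to print a few of the URLs it returns. This snippet uses the same example book URL as the end of the post and needs network access:

# Sanity check: list the chapter URLs without downloading any chapters.
chapters = listurl('http://www.17k.com/book/2849619.html')
print(len(chapters), 'chapter links found')
for u in chapters[:3]:
    print(u)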


def xia(url):
    list1 = listurl(url)

    # The <h1><a> text on the info page is the novel's title;
    # use it as the output filename.
    na = urlopen(url)
    na = bs(na, 'lxml')
    na = na.h1.a.string
    na = na + '.txt'

    for i in list1:
        html = urlopen(i)
        html = bs(html, 'lxml')

        # name is the chapter title
        name = html.h1.string
        name = name.strip()

        # content is the chapter body text
        content = html.find_all('div', class_="p")
        content = content[0].text

        # Open with an explicit UTF-8 encoding so Chinese text also writes
        # correctly on Windows; VIP chapters cannot be downloaded this way.
        with open(na, 'a', encoding='utf-8') as f:
            f.write(name)
            f.write(content)
            print('Downloaded ' + name)


url = 'http://www.17k.com/book/2849619.html'
# Just put the link to the novel here

xia(url)
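If 17k.com starts refusing rapid-fire requests, pausing between chapters usually helps. A minimal sketch, assuming a fixed one-second delay is acceptable (the wrapper name polite_urlopen is made up for this example):

import time

def polite_urlopen(url, delay=1.0):
    # Pause before each request so the crawl stays gentle on the server;
    # call this instead of urlopen() inside the loop in xia().
    time.sleep(delay)
    return urlopen(url)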
