# NOTE: VIP (paid) chapters cannot be downloaded; all free chapters work.
import urllib.request
import re
from bs4 import BeautifulSoup as bs
def urlopen(url):
    """Fetch *url* and return the raw response body as bytes.

    A browser User-Agent header is sent because the site rejects
    requests carrying urllib's default agent string.
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36")
    # Use the response as a context manager so the underlying socket is
    # always closed (the original leaked it).
    with urllib.request.urlopen(req) as resp:
        return resp.read()
def listurl(url):
    """Return the full chapter URLs for the book index page at *url*.

    Follows the first <dt><a> link on the index page to the chapter
    list, then collects every chapter link that appears between the
    '正文' (main text) heading and the bottom ad banner.
    """
    index_page = bs(urlopen(url), 'lxml')
    # href of the first <dt><a> is the chapter-list page.
    chapter_list_url = index_page.dt.a.attrs['href']
    page = urlopen(chapter_list_url).decode('utf-8')
    start = page.find('正文')
    end = page.find('BAIDU_banner_bottom')
    section = page[start:end]
    # Chapter links look like /<seg>/<seg>/<id>.html and are site-relative,
    # so prefix the site root.  (Comprehension replaces the manual
    # append loop; distinct names avoid the original's url/html shadowing.)
    return ['http://www.17k.com' + path
            for path in re.findall(r'/.*?/.*?/.*?\.html', section)]
def xia(url):
    """Download every free chapter of the novel at *url* into '<title>.txt'.

    VIP (paid) chapters are not accessible without login and are skipped
    by the site itself.  Output file is named after the novel's title.
    """
    chapters = listurl(url)
    index_page = bs(urlopen(url), 'lxml')
    # <h1><a> on the index page holds the novel's title.
    book_file = index_page.h1.a.string + '.txt'
    # Open once with an explicit UTF-8 encoding (the original reopened the
    # file per chapter and relied on the platform default encoding, which
    # can fail on Chinese text under Windows).
    with open(book_file, 'a', encoding='utf-8') as f:
        for link in chapters:
            page = bs(urlopen(link), 'lxml')
            title = page.h1.string.strip()            # chapter title
            body = page.find_all('div', class_="p")[0].text  # chapter body
            # Newlines keep title and body from running together.
            f.write(title + '\n')
            f.write(body + '\n')
            print('已经下载' + title)
if __name__ == '__main__':
    # Put the novel's index-page link here.  The guard keeps a plain
    # `import` of this module from kicking off a full download.
    url = 'http://www.17k.com/book/2849619.html'
    xia(url)