需要用到 requests 和 bs4(BeautifulSoup)这两个第三方库
先把每一章的url爬出来
第一章url=https://www.zwdu.com/book/50553/22940129.html
最后一章url=https://www.zwdu.com/book/50553/23509728.html
新书。。。。
这里找到编码和下一章的标签,代码直接就写出来了
# coding=gbk
import requests
from bs4 import BeautifulSoup
url='https://www.zwdu.com/book/50553/22940129.html' #第一章
while url!='https://www.zwdu.com/book/50553/23509728.html': #最后一章
res = requests.get(url)
res.encoding = "gbk" #编码
soup = BeautifulSoup(res.text, 'html.parser')
a = soup.select("div[class=bottem1]>a:nth-last-child(2)")
#CSS选择器,找到倒数第二个a
for i in a:
remain = i['href'] #a标签中的herf赋给remian
url = 'https://www.zwdu.com{}'.format(remain) #添加形成完整的url
print(url)
print("url导入结束") #这样就能爬小说了
完整爬小说代码:
# coding=utf-8
import requests
from bs4 import BeautifulSoup

# Download the whole novel: walk the "next chapter" links from the first
# chapter through the last, appending each chapter's title and body text
# to a local file.
FIRST_CHAPTER = 'https://www.zwdu.com/book/50553/22940129.html'
LAST_CHAPTER = 'https://www.zwdu.com/book/50553/23509728.html'

url = FIRST_CHAPTER
# Open the output file ONCE instead of reopening it for every chapter.
with open('练习小说1.txt', 'a', encoding='utf-8') as f:
    while True:
        res = requests.get(url)
        res.encoding = "gbk"  # site pages are GBK-encoded
        soup = BeautifulSoup(res.text, 'html.parser')
        content = soup.find('div', {'id': 'content'})    # chapter body
        bookname = soup.find('div', {'class': 'bookname'})  # holds <h1> title
        f.write(bookname.h1.text + '\r\n')
        # Iterate the children of the content div: convert <br/> tags to
        # newlines and non-breaking spaces to plain spaces.
        # NOTE(review): the original replaced a literal "\xa0"-like char;
        # assuming it targeted &nbsp; residue — confirm against site markup.
        for node in content:
            f.write(str(node).replace("<br/>", "\n").replace('\xa0', ' '))
        print(bookname.h1.text + '---下载成功!')
        # Check AFTER writing, so the final chapter is downloaded too
        # (the original while-condition skipped the last chapter's content).
        if url == LAST_CHAPTER:
            break
        links = soup.select("div[class=bottem1]>a:nth-last-child(2)")
        if not links:
            # No next-chapter link — stop instead of looping forever.
            break
        url = 'https://www.zwdu.com{}'.format(links[0]['href'])
print("---------------小说下载完毕--------------")
这样就好了