分析
书名(保存文件名):
reader_crumb = bs.find_all('div', {'class': 'reader_crumb'})[0]
bookname: str = reader_crumb.find_all('a')[2].text
章节:
chapter = bs.find('div', {'class': 'title_txtbox'}).text
正文:
把p标签挨个遍历后放到list中,最后'\n'.join(list)写入到本地文件
下一章:
找到对应a标签的href然后继续跳转。href: str = a['href']
最后一章判断:
可以看到,在最后一章中,下一章的a标签对应的href为javascript:void(0),可以作为判断条件,结束循环。
源码
最终效果:
import requests
import os
from bs4 import BeautifulSoup
def spiderBook(url: str, save_dir: str = 'E:/Ebooks/') -> None:
    """Crawl a web novel starting from *url* by following "next chapter" links.

    Each chapter is written to its own ``.txt`` file under a per-book folder,
    and also appended to one combined file.  Crawling stops when the site
    marks the next-chapter link with ``javascript:void(0)`` (last chapter).

    :param url: URL of the first chapter page to crawl.
    :param save_dir: base directory under which the book folder is created
        (defaults to the original hard-coded ``E:/Ebooks/``).
    """
    res = requests.get(url)
    res.encoding = 'utf-8'  # the site serves utf-8; force it before .text
    bs = BeautifulSoup(res.text, 'html.parser')

    # Book title: third <a> inside the breadcrumb bar.
    reader_crumb = bs.find_all('div', {'class': 'reader_crumb'})[0]
    bookname: str = reader_crumb.find_all('a')[2].text
    print('找到书名:' + bookname)

    path = save_dir + bookname
    if not os.path.exists(path):
        os.makedirs(path)
        print('建立目录:{0}'.format(bookname))
    pathAllinOne: str = path + '/' + bookname + '(合并).txt'

    while True:
        # Chapter title.
        chapter = bs.find('div', {'class': 'title_txtbox'}).text
        # Strip characters that are illegal in Windows file names, otherwise
        # open() raises OSError for titles containing e.g. ':' or '?'.
        safe_chapter = ''.join(c for c in chapter if c not in '\\/:*?"<>|')
        chapter_path: str = path + '/' + safe_chapter + '.txt'

        # Collect the chapter title plus every paragraph of the body text.
        # NOTE(review): 'acticleBody' matches the site's (misspelled) markup —
        # do not "fix" it to 'articleBody'.
        paragraphs = [chapter]
        for p in bs.find('div', {'class': 'content', 'itemprop': 'acticleBody'}).find_all('p'):
            paragraphs.append(p.text)
        content = '\n'.join(paragraphs)

        # Write the single-chapter file.  Explicit utf-8: the platform default
        # (e.g. cp936 on Windows) would raise UnicodeEncodeError on this text.
        with open(chapter_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print("已保存章节:" + chapter)
        # Append to the combined all-in-one file.
        with open(pathAllinOne, 'a', encoding='utf-8') as f:
            f.write(content)

        # Locate the "next chapter" link in the navigation button box.
        next_href = None
        for a in bs.find('div', {'class': 'chap_btnbox'}).find_all('a'):
            if a.has_attr('class') and 'nextchapter' in a['class']:
                next_href = a['href']
                break
        # 'javascript:void(0)' is the site's marker for the last chapter.
        if next_href is None or 'javascript:void(0)' in next_href:
            return
        res = requests.get(next_href)
        res.encoding = 'utf-8'
        bs = BeautifulSoup(res.text, 'html.parser')
if __name__ == '__main__':
    # Entry point: start crawling from the first chapter's URL.
    start_url = 'http://***.com/chapter/898410/58676024.html'
    spiderBook(start_url)