电视剧日更一集看不够,那就爬原著看看
- 主要使用 `requests` 和 `BeautifulSoup` 模块
import requests
from bs4 import BeautifulSoup
- 准备工作,解析网页小说第一章
# Fetch the first chapter page and parse it into a BeautifulSoup tree (`soup`).
url = 'http://book.zongheng.com/chapter/189169/3431546.html'
# A timeout keeps the script from blocking forever on a stalled connection.
response = requests.get(url, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page as a chapter.
response.raise_for_status()
# Decode the body as UTF-8 explicitly rather than relying on the encoding
# requests guesses from the response headers.
response.encoding = 'utf-8'
html = response.text
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns), so results could differ between machines.
soup = BeautifulSoup(html, 'html.parser')
- 尝试提取第一章的标题和内容
# Pull the chapter text and the chapter title out of the parsed page, then
# print both (title first).
# NOTE(review): 'acticleBody' looks like a misspelling of 'articleBody';
# presumably it mirrors the attribute value used by the site's markup --
# confirm against the page before "correcting" it.
body_node = soup.find('div', attrs={'itemprop': "acticleBody"})
title_node = soup.find('div', class_="title_txtbox")
content = body_node.get_text()
title = title_node.get_text()
for piece in (title, content):
    print(piece)
- 获取所有章节的目录并储存在 list 中,其中目录网址为 http://book.zongheng.com/showchapter/189169.html
# Fetch the table-of-contents page and collect the chapter <li> entries.
# A timeout keeps the script from blocking forever on a stalled connection.
response = requests.get('http://book.zongheng.com/showchapter/189169.html', timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page.
response.raise_for_status()
response.encoding = 'utf-8'
html = response.text
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(html, 'html.parser')

volume_list = soup.find('div', "volume-list")
if volume_list is None:
    raise RuntimeError('no <div class="volume-list"> found on the contents page')

# find_all searches all descendant <div> elements, not only direct children.
inner_divs = volume_list.find_all('div')
if len(inner_divs) < 4:
    raise RuntimeError('contents page has fewer nested <div> elements than expected')

# Index 3 is presumably the <div> that wraps the chapter list of the wanted
# volume -- TODO confirm against the page markup.
# NOTE: the result stays bound to the name `list` (shadowing the builtin)
# because the download loop later in this file reads `list[i]`.
list = inner_divs[3].find_all('li')
- 循环目录并写入txt
# Download each chapter linked from the table of contents (at most the first
# 160 entries) and append its title and text to the output file.
# Slicing instead of indexing through range(0, 160) avoids an IndexError when
# the contents list holds fewer than 160 entries.
for item in list[:160]:
    url = item.a['href']
    # A timeout keeps one stalled request from hanging the whole run.
    response = requests.get(url, timeout=10)
    # Stop on an HTTP error rather than writing an error page into the book.
    response.raise_for_status()
    response.encoding = 'utf-8'
    html = response.text
    # Explicit parser: the result must not depend on which parsers are installed.
    soup = BeautifulSoup(html, 'html.parser')

    div = soup.find('div', itemprop="acticleBody")
    if div is None:
        raise RuntimeError('chapter body not found at ' + url)
    content = div.get_text()

    div = soup.find('div', "title_txtbox")
    if div is None:
        raise RuntimeError('chapter title not found at ' + url)
    title = div.get_text()

    # The file is reopened in append mode for every chapter, so chapters
    # already written are on disk even if a later request fails.
    with open('雪中悍刀行.txt','a+',encoding='utf-8') as f:
        f.write(title)
        f.write(content)
        f.write('\n')
    print("已写入"+title)
1、https://blog.csdn.net/ChenMugao/article/details/107831887
2、https://blog.csdn.net/weixin_39977276/article/details/111078900