以爬取某小说网为例,爬取小说《女总裁的全能兵王》
要引用库requests、BeautifulSoup
import requests
from bs4 import BeautifulSoup
进入目录https://www.17k.com/list/1741975.html
寻找html上对应章节的url
代码
import requests
from bs4 import BeautifulSoup

# Base URL of the site; chapter hrefs in the listing page are site-relative.
url = 'https://www.17k.com'

# Fetch the chapter-listing page of the novel.
r = requests.get("https://www.17k.com/list/1741975.html")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, 'html.parser')

# Find every <dl class="Volume"> block (one per volume of the novel).
dl = soup.find_all('dl', class_="Volume")
# Chapter links (<a target="_blank">) live inside the second volume block.
aTags = dl[1].find_all('a', target="_blank")
for x in aTags:
    # Print each chapter's href (a site-relative path).
    print(x['href'])
打印结果:
进入其中一个章节,寻找小说文本规则
完整代码
import requests
from bs4 import BeautifulSoup

# Base URL of the site; chapter hrefs in the listing page are site-relative.
url = 'https://www.17k.com'

# Fetch the chapter-listing page of the novel.
# r = requests.get("https://www.17k.com/list/3006464.html")
r = requests.get("https://www.17k.com/list/1741975.html")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, 'html.parser')

# Every <dl class="Volume"> block (one per volume of the novel).
dl = soup.find_all('dl', class_="Volume")
print(len(dl))
# Chapter links (<a target="_blank">) live inside the second volume block.
aTags = dl[1].find_all('a', target="_blank")

# Open the output file ONCE in append mode instead of re-opening it for every
# title and every paragraph, and use a context manager so the handle is always
# closed.  encoding='utf-8' avoids UnicodeEncodeError on the platform default
# codec when the novel text contains characters outside it.
with open('D:\\CODE\\pyWordSpace\\XS2.txt', 'a', encoding='utf-8') as f:
    for x in aTags:
        print(x['href'])
        # Fetch this chapter's page.
        r = requests.get(url + x['href'])
        r.encoding = 'utf-8'
        # Parse the chapter page.
        soup = BeautifulSoup(r.text, 'html.parser')
        # <div class="readAreaBox content"> holds the chapter title and body.
        div = soup.find_all('div', class_="readAreaBox content")
        # Write the chapter title (<h1>) followed by a newline.
        h1 = div[0].find_all('h1')
        f.write(h1[0].get_text() + '\n')
        # <div class="p"> wraps the <p> tags carrying the chapter text.
        divp = div[0].find_all('div', class_="p")
        ps = divp[0].find_all('p')
        # Append every paragraph of the chapter to the output file.
        for y in ps:
            f.write(y.get_text() + '\n')
爬取结果: