# Goal: scrape every chapter of the hacker novel and save it to local .txt files
# Target URL: https://www.liaobige.com/dush/44901/
import requests               # sends the HTTP requests and fetches the HTML pages
from bs4 import BeautifulSoup # parses HTML/XML pages to extract data
import time
def Zj():
    # Collect the absolute URL of every chapter from the table of contents
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    resp = requests.get("https://www.liaobige.com/dush/44901/", headers=headers)
    # print("status code:", resp.status_code)
    resp.encoding = resp.apparent_encoding  # convert to the page's real encoding
    soup = BeautifulSoup(resp.text, 'html.parser')  # HTML parser
    base = 'https://www.liaobige.com/dush/44901/'
    zj = []
    for i in soup.select("body div div div span a"):
        zj.append(base + i.get('href'))  # get() reads the href attribute of each link tag
    return zj
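
# Aside: the concatenation above only works while every href is relative to the
# table-of-contents URL. As a hedged sketch (zj_urljoin is a hypothetical name,
# not part of the original script), urllib.parse.urljoin would also handle
# root-relative and absolute hrefs:
from urllib.parse import urljoin

def zj_urljoin():  # hypothetical alternative to Zj(), shown for illustration only
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    base = 'https://www.liaobige.com/dush/44901/'
    resp = requests.get(base, headers=headers)
    resp.encoding = resp.apparent_encoding
    soup = BeautifulSoup(resp.text, 'html.parser')
    # urljoin resolves each href against the base URL instead of blindly concatenating
    return [urljoin(base, a.get('href')) for a in soup.select("body div div div span a")]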
def Bt():
    # Download each chapter and append its text to a .txt file named after its title
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    for link in Zj():
        time.sleep(3)  # throttle requests so the site is not hammered
        resp = requests.get(link, headers=headers)
        if resp.status_code != 200:
            print('Request failed:', resp.status_code)
            continue
        resp.encoding = resp.apparent_encoding  # convert to the page's real encoding
        soup = BeautifulSoup(resp.text, 'html.parser')  # HTML parser
        titles = [h.string for h in soup.select("div h2")]  # chapter title(s) on the page
        for title in titles:
            try:
                with open("D:\\IT\\小说\\" + title + ".txt", "a+", encoding='utf-8') as file:
                    for block in soup.find_all(class_="vcontent"):  # chapter body text
                        file.write(block.get_text())
                print(title)
            except (OSError, TypeError):  # illegal filename characters or a missing title
                print('Malformed chapter page, skipping!')
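
# Aside: Windows forbids characters such as \ / : * ? " < > | in file names, so a
# chapter title containing one of them makes open() raise OSError. A minimal
# sketch (safe_name is a hypothetical helper, not from the original script) that
# strips those characters before building the path:
import re

def safe_name(title):
    # Replace characters Windows does not allow in file names with an underscore
    return re.sub(r'[\\/:*?"<>|]', '_', title)

# Usage inside Bt() would then be:
#     open("D:\\IT\\小说\\" + safe_name(title) + ".txt", "a+", encoding='utf-8')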
if __name__ == '__main__':
    Bt()
Output: