小说网站:无人生还https://www.xs880.com/html/17516.html
第一章内容:无人生还第一章https://www.xs880.com/html/17516/12036575.html
可以发现相对于正常情况下,该网站一个章节分了几页,这增加了一点小麻烦。。。
不过我们可以先查看源码,右键选择查看源码即可。
重要信息皆已被圈出来,我们只需要获取“下一页”,即 <div class="read-page"> 中的 <a href="/html/17516/12036575_2.html">
代码:
import requests
from bs4 import BeautifulSoup
import sys
class download_content(object):
    """Scrape the novel "无人生还" from www.xs880.com.

    Workflow: read the table-of-contents page to collect chapter start
    URLs, follow each page's "next page" link until the navigation loops
    back to the TOC, fetch each page's body text, and append it to a
    local text file.
    """

    def __init__(self):
        # Site root; relative hrefs scraped from pages are joined onto this.
        self.server = "https://www.xs880.com/"
        # First page of chapter one — the crawl starting point.
        self.target = "https://www.xs880.com/html/17516/12036575.html"
        # Table-of-contents page for the whole book.
        self.target2 = "https://www.xs880.com/html/17516.html"
        self.names = []   # one title per downloaded page, parallel to self.urls
        self.nums = 0     # chapter count found on the TOC (was [], but only ever holds len())
        self.urls = []    # every page URL in reading order (+ trailing TOC sentinel)
        self.urls2 = []   # chapter start URLs from the TOC, oldest first

    def get_download_url(self):
        """Fetch the TOC page and fill self.urls2 with chapter start URLs."""
        req = requests.get(url=self.target2)
        req.encoding = 'gbk'  # site serves GBK-encoded pages
        soup = BeautifulSoup(req.text, 'lxml')
        # Chapter links live inside the first <ul class="tlist">.
        # (No need to re-parse str(tag) into a new soup — find_all works on the tag.)
        chapter_list = soup.find_all('ul', class_='tlist')
        links = chapter_list[0].find_all('a')
        self.nums = len(links)
        for link in links:
            self.urls2.append(self.server + link.get('href'))
        # The TOC lists newest-first; reverse into reading order.
        self.urls2.reverse()

    def get_urls(self):
        """Walk the "next page" chain from self.target, filling self.urls and self.names.

        The last page's "next" link points back at the TOC; that sentinel
        URL terminates the loop and remains as the final element of
        self.urls (callers must skip it).
        """
        self.urls.append(self.target)
        # server ends with '/' and the href starts with '/', hence the '//'.
        toc_sentinel = self.server + "/html/17516.html"
        while self.urls[-1] != toc_sentinel:
            req = requests.get(url=self.urls[-1])
            req.encoding = 'gbk'
            soup = BeautifulSoup(req.text, 'lxml')
            # Page title sits in <div class="article-title mt10"><h1>…</h1>.
            # (.text already strips tags, so no replace('<h1>', …) is needed.)
            title_div = soup.find_all('div', class_='article-title mt10')
            title = title_div[0].find_all('h1')[0].text
            self.names.append(title)
            # Navigation bar <div class="read-page">: index 2 is the "下一页" link.
            nav = soup.find_all('div', class_='read-page')
            nav_links = nav[0].find_all('a')
            self.urls.append(self.server + nav_links[2].get('href'))

    def get_contents(self, target):
        """Download one page and return its body text.

        :param target: URL of a single chapter page.
        :return: page text with the 8-NBSP paragraph indents replaced by
            blank lines.
        """
        req = requests.get(url=target)
        req.encoding = 'gbk'
        soup = BeautifulSoup(req.text, 'lxml')
        body = soup.find_all('div', class_="size_1")
        return body[0].text.replace('\xa0' * 8, '\n\n')

    def writer(self, name, path, text):
        """Append one page to *path*: title line, body, then a blank line.

        :param name: page/chapter title.
        :param path: output file path (opened in append mode, UTF-8).
        :param text: page body text.
        """
        # write() instead of writelines() — text is a single string, and the
        # unused write_flag from the original has been dropped.
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.write(text)
            f.write('\n\n')
if __name__ == "__main__":
    dl = download_content()
    dl.get_download_url()
    dl.get_urls()
    print('《无人生还》开始下载:')
    # The last element of dl.urls is the TOC sentinel, not a content page.
    total = len(dl.urls) - 1
    for i in range(total):
        dl.writer(dl.names[i], 'D:\\deng\\无人生还.txt', dl.get_contents(dl.urls[i]))
        # Fixed: the original printed the raw fraction with a '%' sign
        # (e.g. "0.012%"); multiply by 100 and count i+1 pages done so the
        # last page shows 100%. Also dropped the duplicate print() that
        # repeated the same progress line.
        sys.stdout.write(" 已下载:%.3f%%\r" % (100 * (i + 1) / total))
        sys.stdout.flush()
    print('《无人生还》下载完成')