-
目录
1.项目设计
2.代码:编程获取小说指定章节的内容
3.完善代码:获取小说目录,遍历完成小说下载
4.代码优化
5.对比上次模仿练习代码 -
项目设计
1.功能描述
1)下载一部玄幻小说
2)输出:保存到本地txt
3)数据源:小说内容静态存储于网页中,无robots限制
2.结构设计
1)从笔趣网选择一本玄幻小说,先解析获取指定章节小说的内容
2)分析小说目录网页,获取小说目录及网址
3)遍历下载小说所有章节
4)输出到本地文档 -
指定小说章节的内容
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
class spider(object):
    """Download one chapter of a novel from biqukan and keep its text in memory."""

    def __init__(self, url):
        # Chapter page URL to fetch.
        self.url = url
        # Minimal UA header so the site does not reject the default requests UA.
        self.kv = {'user-agent': 'Mozilla/5.0'}
        # Accumulated chapter texts.
        self.lsText = []

    def getHtmlText(self):
        """Fetch self.url and return the page HTML, or an error-message string."""
        try:
            req = requests.get(self.url, headers=self.kv)
            # BUG FIX: raise_for_status is a method; the original referenced it
            # without calling it, so HTTP error codes were silently ignored.
            req.raise_for_status()
            print(req.status_code)
            return req.text
        except Exception as e:
            return "getHtmlText产生异常:{}".format(e)

    def parserHtmlText(self, html):
        """Extract the chapter body from html and append it to self.lsText."""
        # Explicit parser avoids bs4's "no parser specified" warning.
        soup = BeautifulSoup(html, 'html.parser')
        bf = soup.find_all('div', id="content")
        # BUG FIX: the original appended to self.ls, which is never defined
        # (__init__ creates self.lsText) -> AttributeError at runtime.
        self.lsText.append(bf[0].text.replace('\xa0' * 8, '\n').replace('\r', '').replace(' ', ''))

    def writeToFile(self):
        """Write all collected chapter texts to a local txt file."""
        # BUG FIX: f.write(list) raises TypeError -- join the chapters first;
        # also write explicitly as utf-8 so the Chinese text is portable.
        with open("黑铁之堡.txt", 'w', encoding='utf-8') as f:
            f.write('\n'.join(self.lsText))
if __name__ == '__main__':
    # Single-chapter smoke test: fetch one chapter page and print its text.
    url = 'https://www.biqukan.com/2_2892/1254662.html'
    sp = spider(url)
    html = sp.getHtmlText()
    sp.parserHtmlText(html)
    # BUG FIX: the spider stores chapters in lsText; sp.ls does not exist.
    print(sp.lsText)
    # sp.writeToFile()
- 小说目录获取,遍历下载至本地
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
class spider(object):
    """Scrape a whole novel from biqukan: catalogue, chapter bodies, txt output."""

    def __init__(self):
        # Minimal UA header so the site does not reject the default requests UA.
        self.kv = {'user-agent': 'Mozilla/5.0'}
        self.lsText = []      # chapter bodies, in download order
        self.lsUrls = []      # absolute chapter URLs
        self.lsChapters = []  # chapter titles, parallel to lsUrls

    def getHtmlText(self, url):
        """Fetch url and return its HTML, or an error-message string."""
        try:
            req = requests.get(url, headers=self.kv)
            # BUG FIX: raise_for_status is a method; without the call
            # parentheses HTTP error codes were silently ignored.
            req.raise_for_status()
            # Re-decode with the detected encoding so Chinese text is correct.
            req.encoding = req.apparent_encoding
            return req.text
        except Exception as e:
            return "getHtmlText产生异常:{}".format(e)

    def parserHtmlText(self, html):
        """Extract the chapter body from html and append it to self.lsText."""
        soup = BeautifulSoup(html, 'html.parser')
        bf = soup.find_all('div', id="content")
        self.lsText.append(bf[0].text.replace('\xa0', '').replace('\r', '').replace(' ', ''))

    def parserHtmlUrl(self, html, url):
        """Parse the catalogue html and fill self.lsChapters / self.lsUrls."""
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.find_all('div', class_="listmain")
        bf = BeautifulSoup(str(div[0]), 'html.parser')
        # The first 12 anchors are the "latest chapters" duplicates; skip them.
        for i in bf.find_all('a')[12:]:
            self.lsChapters.append(i.string)
            self.lsUrls.append(url + (i.get('href').split('/'))[-1])

    def writeToFile(self):
        """Write the whole book (title, then each chapter) to a local txt file."""
        with open("黑铁之堡.txt", 'w', encoding='utf-8') as f:
            f.write("《黑铁之堡》\n\n\n")
            for num in range(len(self.lsText)):
                f.write(self.lsChapters[num] + '\n')
                f.write(self.lsText[num] + '\n\n\n')
if __name__ == '__main__':
    # Download every chapter into memory, then dump the whole book at once.
    catalog_url = 'https://www.biqukan.com/2_2892/'
    sp = spider()
    sp.parserHtmlUrl(sp.getHtmlText(catalog_url), catalog_url)
    for idx, chapter_url in enumerate(sp.lsUrls):
        print(idx)  # crude progress indicator
        sp.parserHtmlText(sp.getHtmlText(chapter_url))
    sp.writeToFile()
[Finished in 2945.7s]
- 优化
# -*- coding: utf-8 -*-
'''
# 边下载边写入txt
'''
import requests
from bs4 import BeautifulSoup
class spider(object):
    """Scrape a novel chapter by chapter, writing each chapter to txt as it downloads."""

    def __init__(self):
        # Minimal UA header so the site does not reject the default requests UA.
        self.kv = {'user-agent': 'Mozilla/5.0'}
        self.lsText = []      # unused in the streaming variant; kept for interface parity
        self.lsUrls = []      # absolute chapter URLs
        self.lsChapters = []  # chapter titles, parallel to lsUrls

    def getHtmlText(self, url):
        """Fetch url and return its HTML, or an error-message string."""
        try:
            req = requests.get(url, headers=self.kv)
            # BUG FIX: raise_for_status is a method; without the call
            # parentheses HTTP error codes were silently ignored.
            req.raise_for_status()
            # Re-decode with the detected encoding so Chinese text is correct.
            req.encoding = req.apparent_encoding
            return req.text
        except Exception as e:
            return "getHtmlText产生异常:{}".format(e)

    def parserHtmlText(self, html):
        """Return the cleaned chapter body text extracted from html."""
        soup = BeautifulSoup(html, 'html.parser')
        bf = soup.find_all('div', id="content")
        texts = bf[0].text.replace('\xa0', '').replace('\r', '').replace(' ', '')
        return texts

    def parserHtmlUrl(self, html, url):
        """Parse the catalogue html and fill self.lsChapters / self.lsUrls."""
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.find_all('div', class_="listmain")
        bf = BeautifulSoup(str(div[0]), 'html.parser')
        # The first 12 anchors are the "latest chapters" duplicates; skip them.
        for i in bf.find_all('a')[12:]:
            self.lsChapters.append(i.string)
            self.lsUrls.append(url + (i.get('href').split('/'))[-1])

    def writeToFile(self, texts, cnt):
        """Append chapter cnt to the output file (book title written once, before chapter 0).

        BUG FIX: the original opened with 'w' for EVERY chapter, truncating the
        file each call, so the finished file contained only the last chapter
        (plus a fresh title each time). Truncate and write the title only for
        the first chapter, then append.
        """
        mode = 'w' if cnt == 0 else 'a'
        with open("黑铁之堡.txt", mode, encoding='utf-8') as f:
            if cnt == 0:
                f.write("《黑铁之堡》\n\n\n")
            f.write(self.lsChapters[cnt] + '\n')
            f.write(texts + '\n\n')
if __name__ == '__main__':
    # Streaming variant: each chapter is written to disk as soon as it downloads.
    catalog_url = 'https://www.biqukan.com/2_2892/'
    sp = spider()
    sp.parserHtmlUrl(sp.getHtmlText(catalog_url), catalog_url)
    for idx, chapter_url in enumerate(sp.lsUrls):
        print(idx)  # crude progress indicator
        chapter_text = sp.parserHtmlText(sp.getHtmlText(chapter_url))
        sp.writeToFile(chapter_text, idx)
[Finished in 2453.7s]
- 对比之前模仿练习代码
# -*- coding:utf-8 -*-
# 浏览器google
from bs4 import BeautifulSoup
import requests, sys
class downloads(object):
    """Scrape a novel from qu.la: catalogue, chapter bodies, appended txt output."""

    def __init__(self):
        # Site root; chapter hrefs on the catalogue page are relative to it.
        self.server = 'https://www.qu.la/'
        # Catalogue page of the target book.
        self.target = 'https://www.qu.la/book/16431/'
        # Number of chapters found by get_urls().
        self.nums = 0
        # Chapter titles and absolute chapter URLs, filled by get_urls().
        self.names = []
        self.urls = []

    def get_urls(self):
        """Fetch the catalogue page and fill self.names / self.urls / self.nums."""
        req = requests.get(url=self.target)
        # Explicit parser avoids bs4's "no parser specified" warning.
        div_bf = BeautifulSoup(req.text, 'html.parser')
        div = div_bf.find_all('div', id='list')
        a_bf = BeautifulSoup(str(div[0]), 'html.parser')
        # The first 12 anchors are the "latest chapters" duplicates; skip them.
        links = a_bf.find_all('a')[12:]
        self.nums = len(links)
        for a in links:
            self.names.append(a.string)
            self.urls.append(self.server + a.get('href'))

    def downloads(self, content_url):
        """Return the chapter body text at content_url, with layout spaces as newlines."""
        req = requests.get(content_url)
        soup = BeautifulSoup(req.text, 'html.parser')
        div = soup.find_all('div', id='content')
        return div[0].text.replace('\xa0' * 4, '\n')

    def write(self, name, path, content):
        """Append one chapter (title line + body) to the file at path."""
        with open(path, 'a', encoding='utf-8') as fout:
            # BUG FIX: the original glued the title onto the first body line;
            # put the chapter name on its own line.
            fout.write(name + '\n')
            fout.write(content + '\n')
if __name__ == '__main__':
    dl = downloads()
    dl.get_urls()
    sys.stdout.write('开始下载:《**》')
    # Walk titles and URLs in lockstep; capped at 51 chapters for a trial run.
    for i, (name, chapter_url) in enumerate(zip(dl.names, dl.urls)):
        dl.write(name, 'test2.txt', dl.downloads(chapter_url))
        sys.stdout.write('进度:%.3f' % (i / dl.nums))
        sys.stdout.flush()
        if i == 50:
            break
    sys.stdout.write('下载结束:<**>')