I. Preparation
1. Background
Target novel site: 新笔趣阁
2. Scraping steps
Roughly three steps:
- Send the request: work out how to issue the HTTP request and fetch the raw page;
- Parse the data: the response comes back messy, so clean it up;
- Save the data: write it out in whatever format you want.
Sending requests is handled with requests.
For parsing there are XPath, Beautiful Soup, regular expressions, and more; this article uses Beautiful Soup.
Saving: plain text files for now; later posts will move on to docx and xlsx (a docx sketch closes this section).
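Put together, the three steps are only a few lines of code. A minimal sketch, assuming a chapter page whose body sits in a div with id "content" (the URL and the id here are placeholders, not the real site's):

import requests
from bs4 import BeautifulSoup

url = 'https://example.com/chapter-1.html'  # placeholder URL, for illustration only
req = requests.get(url)                     # 1. send the request
req.encoding = 'utf-8'
soup = BeautifulSoup(req.text, 'lxml')      # 2. parse the data
text = soup.find('div', id='content').text  # assumes the chapter body lives in div#content
with open('chapter.txt', 'w', encoding='utf-8') as f:
    f.write(text)                           # 3. save the data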
3. Beautiful Soup
pip install beautifulsoup4 lxml
(The bs4 package on PyPI is only a shim that pulls in beautifulsoup4; lxml is listed too because every script below uses the lxml parser.)
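A quick taste of the API, using a made-up HTML string shaped like the pages we will scrape:

from bs4 import BeautifulSoup

html = '<div id="list"><a href="/1.html">Chapter 1</a><a href="/2.html">Chapter 2</a></div>'
soup = BeautifulSoup(html, 'lxml')
div = soup.find('div', id='list')   # first tag matching the filter
for a in div.find_all('a'):         # every <a> inside that div
    print(a.get('href'), a.string)  # -> /1.html Chapter 1, then /2.html Chapter 2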
4. First attempt
Download part of the chapters of 《斗破苍穹》.
First inspect the page elements and work out how the first chapter's URL is built.
import requests
from bs4 import BeautifulSoup
import sys

def get_contents(server, target):
    """Fetch one chapter and return its cleaned-up text."""
    url = server + target
    req = requests.get(url=url)
    req.encoding = 'utf-8'  # avoid mojibake
    html = req.text
    bf = BeautifulSoup(html, 'lxml')
    texts = bf.find('div', id='content')
    # The site indents paragraphs with four non-breaking spaces; turn them into blank lines
    content = texts.text.replace('\xa0' * 4, '\n\n')
    return content

def get_urls(target):
    """Collect chapter links, chapter titles, and the chapter count from the table of contents."""
    chapters = []
    urls = []
    req = requests.get(url=target, verify=False)  # verify=False skips SSL certificate verification
    req.encoding = 'utf-8'  # avoid mojibake
    html = req.text
    bs = BeautifulSoup(html, 'lxml')
    a = bs.find('div', id='list')
    a = a.find_all('a')[100:200]  # only a 100-chapter slice for this test run
    nums = len(a)
    for each in a:
        urls.append(each.get('href'))
        chapters.append(each.string)
    return urls, chapters, nums

def writer(path, name, text):
    """Append one chapter to the output file."""
    with open(path, 'a', encoding='utf-8') as f:
        f.write(name + '\n')
        f.writelines(text)
        f.write('\n\n')

if __name__ == '__main__':
    server = 'https://www.vbiquge.com'
    target = 'https://www.vbiquge.com/1_1413/'
    book_name = '斗破苍穹.txt'
    urls, chapters, nums = get_urls(target)
    for i in range(nums):
        writer(book_name, chapters[i], get_contents(server, urls[i]))
        sys.stdout.write("Downloaded: {0}/{1}{2}".format(i + 1, nums, '\r'))
        sys.stdout.flush()
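One caveat: with verify=False, requests emits an InsecureRequestWarning on every call. If the progress output gets drowned out, the warning can be silenced (keep in mind it exists for a reason):

import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  # mute verify=False warnings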
- Object-oriented version: the same crawler reorganized into a class.
from bs4 import BeautifulSoup
import requests
import sys

"""
Class description: download the novel 《斗破苍穹》 from 笔趣网
Parameters:
    None
Returns:
    None
Modified:
    2021-02-18
"""
class downloader(object):
    def __init__(self):
        self.server = 'https://www.bqkan.com'
        self.target = 'https://www.bqkan.com/1_1094/'
        self.chapters = []  # chapter titles
        self.urls = []      # chapter links
        self.nums = 0       # chapter count

    def get_download_url(self):
        """Collect the download link of every chapter."""
        req = requests.get(url=self.target)
        # The page source is GBK-encoded. The soup BeautifulSoup returns is already correctly
        # decoded (from the original GB2312 bytes to Unicode); it only looks garbled when
        # printed, because __str__ emits UTF-8 by default, and a GBK console misrenders that.
        req.encoding = 'gb18030'  # make sure the content is not garbled
        div_bf = BeautifulSoup(req.text, 'lxml')
        div = div_bf.find_all('div', class_='listmain')
        a_bf = BeautifulSoup(str(div[0]), 'lxml')
        a = a_bf.find_all('a')
        self.nums = len(a[15:])  # skip the first 15 links
        for each in a[15:]:
            self.chapters.append(each.string)   # chapter title inside the <a> tag
            self.urls.append(each.get('href'))  # link inside the <a> tag

    def get_contents(self, target):
        """Fetch one chapter.
        Parameters:
            target - download link (string)
        Returns:
            texts - chapter content (string)
        """
        url = self.server + target
        req = requests.get(url)
        bf = BeautifulSoup(req.text, 'lxml')
        texts = bf.find_all('div', class_='showtxt')
        # Replace the eight non-breaking spaces between paragraphs with blank lines
        texts = texts[0].text.replace('\xa0' * 8, '\n\n')
        return texts

    def writer(self, name, path, text):
        """Append one chapter to the output file.
        Parameters:
            name - chapter title (string)
            path - output file name, relative to the current directory (string)
            text - chapter content (string)
        Returns:
            None
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')

if __name__ == '__main__':
    dl = downloader()
    dl.get_download_url()
    book_name = '斗破苍穹.txt'
    print('Start downloading:')
    for i in range(dl.nums):
        dl.writer(dl.chapters[i], book_name, dl.get_contents(dl.urls[i]))
        sys.stdout.write("Downloaded: {0:.2%}{1}".format((i + 1) / dl.nums, '\r'))
        sys.stdout.flush()
    print("Download complete")