本文参考:https://blog.csdn.net/hhy1107786871/article/details/88170456
import requests
from lxml import etree
import os
import random
# 设置requests库的重连接次数
requests.adapters.DEFAULT_RETRIES = 5
# Index page of the target novel on xbiquge.
HOST = 'http://www.xbiquge.la/28/28056/'
# Pool of desktop-browser User-Agent strings; one is chosen at random per
# request so the crawler looks less like a bot.
# BUG FIX: the first entry was missing its trailing comma, so Python's
# implicit string-literal concatenation silently fused the first two UAs
# into a single (invalid) entry — the list held 17 items instead of 18.
user_agent = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
# 爬取一本小说
class ScrapyOne(object):
    """Crawl one novel from xbiquge: collect chapter links, download each
    chapter's text, and save every chapter as a txt file under ./<bookname>/."""

    def __init__(self, rootlink):
        super(ScrapyOne, self).__init__()
        # Index-page URL of the novel, e.g. http://www.xbiquge.la/28/28056/
        self.rootlink = rootlink

    def scrapyLink(self):
        """Fetch the index page.

        Returns {'bookname': str, 'links': [url, ...]} on success, or an
        empty dict when no links were found or the request failed.
        """
        try:
            # Randomize the User-Agent per request to look less like a bot.
            header = {"User-Agent": random.choice(user_agent)}
            res = requests.get(self.rootlink, headers=header)
            res.encoding = 'utf-8'
            # Parse the HTML so it can be queried with XPath.
            data = etree.HTML(res.text)
            # Book title from the info block.
            bookname = data.xpath('//*[@id="info"]/h1/text()')[0]
            # Chapter hrefs are relative; keep only the last path segment
            # and re-anchor it on HOST.
            links = [HOST + link.split('/')[-1]
                     for link in data.xpath('//div[@id="list"]//dd/a/@href')]
            if links:
                return {
                    'bookname': bookname,
                    'links': links
                }
            return {}
        except Exception as e:
            print(e)
            # BUG FIX: this path used to return [] while the no-links path
            # returned {}; callers expect a dict, so keep it consistent.
            return {}

    def scrapyText(self, url):
        """Fetch one chapter page.

        Returns {'name': str, 'texts': [str, ...]} on success, or False when
        the page yielded no text or the request failed.
        """
        try:
            header = {"User-Agent": random.choice(user_agent)}
            res = requests.get(url, headers=header)
            res.encoding = 'utf-8'
            data = etree.HTML(res.text)
            # Chapter title.
            chapter_name = data.xpath('//div[@class="bookname"]/h1/text()')[0]
            # The site prefixes every title with "正文卷"; strip it for brevity.
            name = chapter_name.replace('正文卷', '')
            # Collect and clean the chapter body text.
            texts = []
            for text in data.xpath('//div[@id="content"]/text()'):
                # BUG FIX: '&nbsq' was a typo for the '&nbsp;' entity.
                text = text.replace('\r\n', '').replace('&nbsp;', '')
                if text:
                    texts.append(text)
            if texts:
                return {
                    'name': name,
                    'texts': texts
                }
            return False
        except Exception as e:
            print(e)
            return False

    def save(self, bookname, name, texts):
        """Write one chapter to ./<bookname>/<name>.txt (title line first,
        then one line per text fragment). Returns True on success."""
        try:
            if not os.path.exists('./' + bookname):
                os.makedirs('./' + bookname)
            # BUG FIX: removed the redundant f.close() — the with-block
            # already closes the file.
            with open('./%s/%s.txt' % (bookname, name), 'w', encoding='utf-8') as f:
                f.write(name + '\n')
                for text in texts:
                    f.write(text + '\n')
            return True
        except Exception as e:
            print(e)
            return False

    def main(self):
        """Crawl the whole book: links first, then each chapter in order."""
        try:
            bookInfo = self.scrapyLink()
            # BUG FIX: guard the failure case — the original indexed
            # bookInfo['links'] even when scrapyLink had failed.
            if not bookInfo:
                return
            for i, link in enumerate(bookInfo['links']):
                info = self.scrapyText(link)
                if info:
                    # Prefix the index so files sort in reading order.
                    if self.save(bookInfo['bookname'], str(i) + '-' + info['name'], info['texts']):
                        # BUG FIX: '存出成功' was a typo for '存储成功'
                        # (matches the failure message below).
                        print('存储成功', info['name'])
                    else:
                        print('存储失败', info['name'])
        except Exception as e:
            print(e)
if __name__ == "__main__":
    # Crawl the whole book when executed as a script.
    ScrapyOne(HOST).main()
# Merge the per-chapter txt files into one result.txt.
import os

filedir = os.getcwd() + '/xiaogelao'
filenames = os.listdir(filedir)
# BUG FIX: the output file and every per-chapter input file are now closed
# deterministically via context managers — the original never closed the
# inner files (one leaked handle per chapter) and closed the output by hand.
# NOTE(review): os.listdir() order is arbitrary, so chapters may be merged
# out of reading order; sort filenames by their numeric "<i>-" prefix if
# that matters — confirm against the chapter-file naming scheme.
with open('result.txt', 'w', encoding='utf-8') as out:
    for filename in filenames:
        filepath = filedir + '/' + filename
        with open(filepath, encoding='utf-8') as src:
            for line in src:
                # Skip blank lines to keep the merged text compact.
                if line.strip() == '':
                    continue
                out.write(line)
        # Blank separator line between chapters.
        out.write('\n')
成果展示:
遇到的问题:
1)爬取网站源码的过程还算顺利,但下载速度也太慢了,一本书一千多章,花了几个小时。
2)爬取下来是一章章的多个txt的,需要做一个合成文本的操作。
代码有点长,希望日后进一步学习后能改正。
看简介,《小阁老》这篇小说不错,等哪天手上的《轮回乐园》不想看了,就看《小阁老》。