Scraping a Novel with Python

Based on: https://blog.csdn.net/hhy1107786871/article/details/88170456

import requests
from lxml import etree
import os
import random
# Set the retry count for the requests library (but see the note after the script)
requests.adapters.DEFAULT_RETRIES = 5
HOST = 'http://www.xbiquge.la/28/28056/'

user_agent = ["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]

# Crawl one novel
class ScrapyOne(object):
    def __init__(self, rootlink):
        super(ScrapyOne, self).__init__()
        self.rootlink = rootlink

    # Scrape each chapter's link
    def scrapyLink(self):
        try:
            # Pick a random User-Agent for the request header (random.choice() is handy here)
            header = {"User-Agent": random.choice(user_agent)}
            res = requests.get(self.rootlink, headers=header, timeout=10)
            res.encoding = 'utf-8'
            # Parse the HTML so it can be queried with XPath
            data = etree.HTML(res.text)
            # Extract the book title
            bookname = data.xpath('//*[@id="info"]/h1/text()')[0]
            # Collect the link of every chapter
            links = []
            for link in data.xpath('//div[@id="list"]//dd/a/@href'):
                links.append(HOST + link.split('/')[-1])
            if links:
                return {
                    'bookname': bookname,
                    'links': links
                }
            else:
                return {}
        except Exception as e:
            print(e)
            return {}

    # Scrape the text of one chapter
    def scrapyText(self, url):
        try:
            header = {"User-Agent": random.choice(user_agent)}
            res = requests.get(url, headers=header, timeout=10)
            res.encoding = 'utf-8'
            data = etree.HTML(res.text)
            # Extract the chapter title
            chapter_name = data.xpath('//div[@class="bookname"]/h1/text()')[0]
            # Every title is prefixed with "正文卷" (main text); drop it for brevity
            name = chapter_name.replace('正文卷', '')
            # Extract and clean the chapter body
            texts = []
            for text in data.xpath('//div[@id="content"]/text()'):
                # lxml has already decoded "&nbsp;" entities to '\xa0', so strip that character
                text = text.replace('\r\n', '').replace('\xa0', '')
                if text:
                    texts.append(text)
            if texts:
                return {
                    'name': name,
                    'texts': texts
                }
            else:
                return False
        except Exception as e:
            print(e)
            return False

    # Save one chapter
    def save(self, bookname, name, texts):
        try:
            if not os.path.exists('./' + bookname):
                os.makedirs('./' + bookname)
            # The with-statement closes the file automatically
            with open('./%s/%s.txt' % (bookname, name), 'w', encoding='utf-8') as f:
                f.write(name + '\n')
                for text in texts:
                    f.write(text + '\n')
            return True
        except Exception as e:
            print(e)
            return False

    # Main routine
    def main(self):
        try:
            # Get the book title and chapter links
            bookInfo = self.scrapyLink()
            i = 0
            for link in bookInfo['links']:
                # Fetch one chapter's text
                info = self.scrapyText(link)
                if info:
                    if self.save(bookInfo['bookname'], str(i) + '-' + info['name'], info['texts']):
                        print('saved', info['name'])
                    else:
                        print('save failed', info['name'])
                    i += 1
        except Exception as e:
            print(e)
if __name__ == "__main__":
    one = ScrapyOne(HOST)
    one.main()
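
One caveat about the retry line at the top: assigning requests.adapters.DEFAULT_RETRIES after import likely has no effect, because HTTPAdapter binds that default when the module is first loaded. A minimal sketch of a retry setup that does take effect, using requests' documented Session/HTTPAdapter API (the Retry parameters here are illustrative assumptions, not values from the original post):

import random
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Mount adapters that actually retry failed requests (sketch; tune to taste).
session = requests.Session()
retry = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))

# Use session.get() wherever the script above calls requests.get().
res = session.get(HOST, headers={"User-Agent": random.choice(user_agent)}, timeout=10)
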
# Merge the per-chapter txt files into a single result.txt.
import os
filedir = os.getcwd() + '/xiaogelao'
# Sort numerically by the leading chapter index; os.listdir() gives no
# guaranteed order, and a plain string sort puts '10-...' before '2-...'
filenames = sorted(os.listdir(filedir), key=lambda name: int(name.split('-')[0]))
with open('result.txt', 'w', encoding='utf-8') as f:
    for filename in filenames:
        filepath = filedir + '/' + filename
        with open(filepath, encoding='utf-8') as chapter:
            for line in chapter:
                if line.strip() == '':
                    continue
                f.write(line)
                f.write('\n')
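
A note on the text cleanup in scrapyText: lxml decodes HTML entities while parsing, so by the time the XPath text nodes are read, any &nbsp; in the page source has already become the character '\xa0'. That is why the cleanup above strips '\xa0' rather than the literal string. A quick demonstration:

from lxml import etree

# lxml turns "&nbsp;" entities into '\xa0' during parsing.
node = etree.HTML('<div id="content">&nbsp;&nbsp;some text</div>')
text = node.xpath('//div[@id="content"]/text()')[0]
print(repr(text))                       # '\xa0\xa0some text'
print(repr(text.replace('\xa0', '')))   # 'some text'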

Results:

[screenshot of the downloaded chapters omitted]

Problems encountered:

1) Scraping the page source went smoothly, but the download was very slow: the book has over a thousand chapters and the run took several hours (see the sketch after this list).
2) The output is one txt file per chapter, so a separate merge step is needed afterwards (the second script above).
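
For the speed problem, one option is to fetch chapters with a small thread pool instead of one at a time. This is a sketch only: it assumes the site tolerates a handful of parallel connections, and the helper name crawl_concurrently is mine, not from the original post.

from concurrent.futures import ThreadPoolExecutor

def crawl_concurrently(one, book_info, workers=4):
    # Hypothetical helper: pool.map() yields results in input order, so the
    # chapter numbering stays correct even though fetches overlap in time.
    # Keep workers small to stay polite to the site and avoid being blocked.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for i, info in enumerate(pool.map(one.scrapyText, book_info['links'])):
            if info:
                one.save(book_info['bookname'], str(i) + '-' + info['name'], info['texts'])

one = ScrapyOne(HOST)
crawl_concurrently(one, one.scrapyLink())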

The code is a bit long; I hope to improve it as I keep learning.
Judging from its synopsis, 小阁老 looks good. When I get tired of 轮回乐园, I'll give it a read.
