Scraping Novels

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: editor  date: 2018/2/1

import os
import re
import requests
from lxml import etree

url = 'http://www.biquge.com.tw'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}
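
# A minimal sketch (an addition, not part of the original script): the functions
# below issue one HTTP request per category, per novel, and per chapter, so a
# single shared Session keeps TCP connections alive across all of them; swapping
# requests.get(url, headers=header) for session.get(url) anywhere below would
# take advantage of it.
session = requests.Session()
session.headers.update(header)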


# Step 1: get the novel categories (the category folders themselves are created in step 4)
def novelType():
    req = requests.get(url, headers=header)
    req.encoding = 'gbk'
    htm = req.text
    # print(htm)
    # Required lxml step: parse the page into an element tree
    html = etree.HTML(htm)
    dic = {}
    book = html.xpath('//li/a/text()')  # category names
    link = html.xpath('//li/a/@href')   # category links
    for i in range(9):  # assumes the first 9 <li><a> entries are the nav-bar categories
        dic[book[i]] = link[i]  # store the categories and their links as a dict
    return dic
    # Regular-expression alternative:
    # rex = re.findall('<li><a href="(.*?)">(.*?)</a></li>', htm, re.S)
    # for i in rex:
    #     print(i[0], i[1])
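
# Hypothetical illustration (category names and paths are made up; the real ones
# depend on the site's nav bar): novelType() returns a dict shaped like
# {'玄幻小说': '/xuanhuan/', '都市小说': '/dushi/', ...}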


# Step 2: get the novel names and links in each category (novel folders are created in step 4)
def novel():
    biquge = {}
    types = novelType()     # fetch the category dict once, not on every loop pass
    for typeName in types:  # iterate over the category-to-link dict
        dic = {}
        req = requests.get(url + types[typeName], headers=header)   # request the category page
        req.encoding = 'gbk'
        html = etree.HTML(req.text)
        novelLink = html.xpath("//div[@id='newscontent']/div[2]/ul/li/span[1]/a/@href")     # novel links
        novelName = html.xpath("//div[@id='newscontent']/div[2]/ul/li/span[1]/a/text()")    # novel names
        if len(novelName) > 2:  # keep only categories that actually list novels
            # print(novelName)
            for i in range(len(novelName)):
                dic[novelLink[i]] = novelName[i]    # map each novel link to its name
        biquge[typeName] = dic  # key: category name; value: dict of novel links and names
    return biquge


# Step 3: get the chapter links of every novel
def chapter():
    biquge = {}
    for typeName, novelInfo in novel().items():     # iterate over the category -> novels dict
        novelTxt = {}
        if len(novelInfo):
            for novelLink, novelName in novelInfo.items():  # iterate over the novel links
                chapterTxt = {}
                req = requests.get(novelLink, headers=header)   # request the novel's page
                req.encoding = 'gbk'
                ref = re.findall(r'<dd><a href="(.*?)">(.*?)</a></dd>', req.text, re.S)
                for i in range(len(ref)):
                    chapterTxt[ref[i][1]] = ref[i][0]   # map each chapter name to its link
                # print(typeName, novelName)
                novelTxt[novelName] = chapterTxt        # map the novel to its chapters
        biquge[typeName] = novelTxt                     # map the category to its novels
    # print(biquge)
    return biquge
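
# Hypothetical shape of the final structure (all names and paths are made up):
# {'玄幻小说': {'某某小说': {'第一章 开端': '/0_123/1.html', ...}, ...}, ...}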


# Step 4, method 1: save each chapter as its own text file
def novelSpider1():
    for typeName, novelInfo in chapter().items():
        typePath = 'E:/document/novel/{}'.format(typeName)
        os.makedirs(typePath, exist_ok=True)    # create the category folder
        for novelName, chapterInfo in novelInfo.items():
            novelPath = typePath + '/' + novelName
            os.makedirs(novelPath, exist_ok=True)   # create the novel folder
            for chapterName, chapterLink in chapterInfo.items():
                # print(chapterLink)
                req = requests.get(url + chapterLink, headers=header)   # request the chapter page
                req.encoding = 'gbk'
                htm = etree.HTML(req.text)
                content = htm.xpath('//div[@id="content"]/text()')  # chapter text nodes
                for i in range(len(content)):
                    content[i] = content[i].replace(u'\xa0', '')    # first-pass content cleanup
                text = ''.join(content)
                with open(novelPath + '/' + chapterName + '.txt', 'w', encoding='utf-8') as f:
                    f.write(text)   # write the chapter to its own file
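
# With novelSpider1 the on-disk layout is one .txt per chapter, under the
# hard-coded prefix used above:
# E:/document/novel/<category>/<novel>/<chapter>.txt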


# Step 4, method 2: save each novel as a single text file
def novelSpider2():
    for typeName, novelInfo in chapter().items():
        typePath = 'E:/document/novel/{}'.format(typeName)
        os.makedirs(typePath, exist_ok=True)    # create the category folder
        for novelName, chapterInfo in novelInfo.items():
            novelPath = typePath + '/' + novelName + '.txt'     # one file per novel
            for chapterName, chapterLink in chapterInfo.items():
                # print(chapterLink)
                req = requests.get(url + chapterLink, headers=header)
                req.encoding = 'gbk'
                htm = etree.HTML(req.text)
                content = htm.xpath('//div[@id="content"]/text()')
                for i in range(len(content)):
                    content[i] = content[i].replace(u'\xa0', '')
                text = ''.join(content)
                with open(novelPath, 'a+', encoding='utf-8') as f:
                    f.write(chapterName)    # chapter heading
                    f.write(text)
                    f.write('\n')
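
# A minimal sketch (an addition, not in the original script): scraped titles can
# contain characters that Windows forbids in file names (e.g. ? * : " < > |),
# which would make the open()/makedirs() calls above fail. Wrapping every
# novelName/chapterName used in a path with a hypothetical sanitizer like this
# avoids that:
def safeName(name):
    # replace forbidden path characters with underscores and trim whitespace
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()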
                    
# When crawling, use only one of the two step-4 methods
if __name__ == '__main__':
    # novelSpider1()
    novelSpider2()
