Crawling Novels

The script below crawls the novel site http://www.biquge.com.tw in four steps: it collects the novel categories from the front page, the novel list of each category, the chapter list of each novel, and finally the chapter text itself, saved either as one file per chapter (novelSpider1) or one file per novel (novelSpider2).

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: editor, 2018/2/1
import os
import re

import requests
from lxml import etree

url = 'http://www.biquge.com.tw'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}


# Step 1: fetch the novel categories from the front page.
def novelType():
    req = requests.get(url, headers=header, timeout=10)
    req.encoding = 'gbk'                 # the site serves GBK-encoded pages
    html = etree.HTML(req.text)          # parse with lxml
    dic = {}
    book = html.xpath('//li/a/text()')   # category names
    link = html.xpath('//li/a/@href')    # category links
    for i in range(9):                   # the site lists nine categories
        dic[book[i]] = link[i]           # store category name -> link
    return dic
    # Regex alternative:
    # for href, name in re.findall('<li><a href="(.*?)">(.*?)</a></li>', req.text, re.S):
    #     print(href, name)


# Step 2: for each category, collect the novel names and their links.
def novel():
    biquge = {}
    for typeName, typeLink in novelType().items():
        dic = {}
        req = requests.get(url + typeLink, headers=header, timeout=10)  # request the category page
        req.encoding = 'gbk'
        html = etree.HTML(req.text)
        novelLink = html.xpath("//div[@id='newscontent']/div[2]/ul/li/span[1]/a/@href")  # novel links
        novelName = html.xpath("//div[@id='newscontent']/div[2]/ul/li/span[1]/a/text()")  # novel names
        if len(novelName) > 2:           # keep only categories that actually list novels
            for i in range(len(novelName)):
                dic[novelLink[i]] = novelName[i]  # store novel link -> novel name
        biquge[typeName] = dic           # category -> {novel link: novel name}
    return biquge


# Step 3: for each novel, collect its chapter names and links.
def chapter():
    biquge = {}
    for typeName, novelInfo in novel().items():
        novelTxt = {}
        for novelLink, novelName in novelInfo.items():
            chapterTxt = {}
            req = requests.get(novelLink, headers=header, timeout=10)  # request the novel's page
            req.encoding = 'gbk'
            for href, name in re.findall(r'<dd><a href="(.*?)">(.*?)</a></dd>', req.text, re.S):
                chapterTxt[name] = href  # store chapter name -> chapter link
            novelTxt[novelName] = chapterTxt
        biquge[typeName] = novelTxt      # category -> {novel name: {chapter name: link}}
    return biquge


# Step 4, variant 1: save each chapter as its own text file.
def novelSpider1():
    for typeName, novelInfo in chapter().items():
        typePath = 'E:/document/novel/{}'.format(typeName)
        os.makedirs(typePath, exist_ok=True)       # one folder per category
        for novelName, chapterInfo in novelInfo.items():
            novelPath = typePath + '/' + novelName
            os.makedirs(novelPath, exist_ok=True)  # one folder per novel
            for chapterName, chapterLink in chapterInfo.items():
                req = requests.get(url + chapterLink, headers=header, timeout=10)  # request the chapter page
                req.encoding = 'gbk'
                htm = etree.HTML(req.text)
                content = htm.xpath('//div[@id="content"]/text()')             # chapter body
                text = ''.join(part.replace(u'\xa0', '') for part in content)  # strip non-breaking spaces
                with open(novelPath + '/' + chapterName + '.txt', 'w', encoding='utf-8') as f:
                    f.write(text)


# Step 4, variant 2: append all chapters of a novel into a single text file.
def novelSpider2():
    for typeName, novelInfo in chapter().items():
        typePath = 'E:/document/novel/{}'.format(typeName)
        os.makedirs(typePath, exist_ok=True)
        for novelName, chapterInfo in novelInfo.items():
            novelPath = typePath + '/' + novelName + '.txt'
            for chapterName, chapterLink in chapterInfo.items():
                req = requests.get(url + chapterLink, headers=header, timeout=10)
                req.encoding = 'gbk'
                htm = etree.HTML(req.text)
                content = htm.xpath('//div[@id="content"]/text()')
                text = ''.join(part.replace(u'\xa0', '') for part in content)
                with open(novelPath, 'a+', encoding='utf-8') as f:
                    f.write(chapterName)
                    f.write(text)
                    f.write('\n')


# Run only one of the Step 4 variants per crawl.
if __name__ == '__main__':
    # novelSpider1()
    novelSpider2()
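One thing the script glosses over: novel and chapter titles come straight from the page and can contain characters that Windows forbids in file names (\, /, :, *, ?, ", <, >, |), so the mkdir() and open() calls will fail for those titles. A minimal sketch of a sanitizing helper, assuming mapping the forbidden characters to underscores is acceptable; safe_name is my own name, not part of the original script:

import re

def safe_name(name):
    # Hypothetical helper: replace characters Windows forbids in file names.
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

# Example use inside novelSpider1():
#   open(novelPath + '/' + safe_name(chapterName) + '.txt', 'w', encoding='utf-8')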
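The crawl also issues thousands of sequential requests, so a single transient network error aborts the whole run, and the unthrottled loop is unfriendly to the server. A hedged sketch of a shared session with automatic retries plus a short pause between requests; the retry parameters and delay are illustrative choices, not tuned values from the original:

import time
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

url = 'http://www.biquge.com.tw'        # same constant as in the script above
header = {'User-Agent': 'Mozilla/5.0'}  # abbreviated User-Agent for the sketch

session = requests.Session()
retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))

# Drop-in replacement for any requests.get(...) call above:
req = session.get(url, headers=header, timeout=10)
time.sleep(0.5)  # pause between requests to avoid hammering the site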