Web Scraper: Downloading Novels (pending revision)

Web scraping

While studying web scraping further, I found a script for downloading novels from the Biqukan (《笔趣看》) site. The code badly needed maintenance and fixes.
Frequent scraping still triggers HTTP 503 errors; resolving that properly is left for further study, though a common mitigation is sketched below.
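One common mitigation, assuming the 503s come from the server throttling rapid requests, is to back off and retry. A minimal sketch (the helper fetch_with_retry and its parameters are mine, not part of the original script):

from urllib import request, error
import time

def fetch_with_retry(url, headers, retries=5, base_delay=2.0):
	"""Fetch url, backing off exponentially on HTTP 503."""
	for attempt in range(retries):
		try:
			req = request.Request(url=url, headers=headers)
			return request.urlopen(req).read()
		except error.HTTPError as e:
			if e.code != 503:
				raise
			# wait 2 s, 4 s, 8 s, ... before retrying
			time.sleep(base_delay * (2 ** attempt))
	raise RuntimeError("still getting 503 after %d retries" % retries)

The full script, with fixes applied: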

from urllib import request
from bs4 import BeautifulSoup
import collections
import os
import time
import sys

"""
类说明:下载《笔趣看》网小说: url:https://www.biqukan.com/
Parameters:
	target - 《笔趣看》网指定的小说目录地址(string)
Returns:
	无
Modify:
	2017-05-06
"""
# Downloader class
class download(object):
	def __init__(self, target):
		self.__target_url = target
		self.__head = {'User-Agent':'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19',}

	"""
	函数说明:获取下载链接
	Parameters:
		无
	Returns:
		novel_name + '.txt' - 保存的小说名(string)
		numbers - 章节数(int)
		download_dict - 保存章节名称和下载链接的字典(dict)
	Modify:
		2017-05-06
	"""
	def get_download_url(self):
		# request the table-of-contents page with a browser User-Agent
		target_req = request.Request(url = self.__target_url, headers = self.__head)
		target_response = request.urlopen(target_req)
		# the site serves GBK; ignore undecodable bytes
		target_html = target_response.read().decode('gbk','ignore')
		# parse the page and pull out the chapter-list container
		listmain_soup = BeautifulSoup(target_html,'lxml')
		chapters = listmain_soup.find_all('div',class_ = 'listmain')
		# re-parse just the extracted fragment
		download_soup = BeautifulSoup(str(chapters), 'lxml')

		# str(download_soup.dl.dt).split("》") yields, e.g.,
		# ['<dt>《一念永恒', '最新章节列表</dt>'], so [0][5:] is the title

		novel_name = str(download_soup.dl.dt).split("》")[0][5:]
		# chapters after this <dt> marker belong to the main text
		flag_name = "《" + novel_name + "》" + "正文卷"

		download_dict = collections.OrderedDict()
		begin_flag = False
		numbers = 0
		for child in download_soup.dl.children:
			if child != '\n':
				# start collecting once the main-text marker is reached
				if child.string == flag_name:
					begin_flag = True
				if begin_flag and child.a is not None:
					download_url = "https://www.biqukan.com" + child.a.get('href')
					download_name = child.string
					download_dict[download_name] = download_url
					numbers += 1
		return novel_name + '.txt', numbers, download_dict
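	# For reference, the page structure this method assumes (reconstructed
	# from the parsing logic above, not captured from the live site):
	#
	#   <div class="listmain">
	#     <dl>
	#       <dt>《NovelName》最新章节列表</dt>
	#       <dd><a href="/xx/1.html">第1章 ...</a></dd>
	#       ...
	#       <dt>《NovelName》正文卷</dt>
	#       <dd><a href="/xx/1.html">第1章 ...</a></dd>
	#       ...
	#     </dl>
	#   </div>
	#
	# Everything before the "正文卷" <dt> is the recent-chapters block and is
	# skipped via begin_flag.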
	
	"""
	函数说明:爬取文章内容
	Parameters:
		url - 下载连接(string)
	Returns:
		soup_text - 章节内容(string)
	Modify:
		2017-05-06
	"""
	def Downloader(self, url):
		download_req = request.Request(url = url, headers = self.__head)
		download_response = request.urlopen(download_req)
		download_html = download_response.read().decode('gbk','ignore')
		soup_texts = BeautifulSoup(download_html, 'lxml')
		# the chapter body sits in <div id="content" class="showtxt">
		texts = soup_texts.find_all(id = 'content', class_ = 'showtxt')
		soup_text = texts[0].text.replace('\xa0','')
		return soup_text

	"""
	函数说明:将爬取的文章内容写入文件
	Parameters:
		name - 章节名称(string)
		path - 当前路径下,小说保存名称(string)
		text - 章节内容(string)
	Returns:
		无
	Modify:
		2017-05-06
	"""
	def Writer(self, name, path, text):
		write_flag = True
		with open(path, 'a', encoding='utf-8') as f:
			f.write(name + '\n\n')
			for each in text:
				# stop at the first Latin 'h', apparently to cut the trailing
				# "http..." site link; crude, but safe for pure-Chinese text
				if each == 'h':
					write_flag = False
				if write_flag and each != ' ':
					f.write(each)
				if write_flag and each == '\r':
					f.write('\n')
			f.write('\n\n')

if __name__ == "__main__":
	
	#小说地址
	target_url = str(input("请输入小说目录下载地址:\n"))

	#实例化下载类
	d = download(target = target_url)
	name, numbers, url_dict = d.get_download_url()
	if name in os.listdir():
		os.remove(name)
	index = 1

	#下载中
	print("《%s》下载中:" % name[:-4])
	for key, value in url_dict.items():
		print(key,value)
		d.Writer(key, name, d.Downloader(value))
		sys.stdout.write("已下载:%.3f%%" %  float(index/numbers) + '\r')
		sys.stdout.flush()
		index += 1	

	print("《%s》下载完成!" % name[:-4])