Python multithreading: downloading a whole novel in parallel

This post shows how to write a Python crawler that batch-downloads a web novel using multithreading and Redis. In the author's test, a novel of more than ten thousand chapters downloads in roughly four minutes. The walkthrough covers fetching the novel's table of contents, storing chapter records in Redis, using proxy IPs, handling errors, and saving the downloaded content; the final code sample launches the multithreaded download and retries failed tasks.

Introduction

Not sure what to say here, so straight to the point. In my own test, a novel with over ten thousand chapters took about four minutes to crawl in full. Credit where it's due: the novel site held up impressively, unlike my school's academic affairs system, but the less said about that, the better.

Multithreading

Crawling is I/O-bound: threads spend most of their time waiting on network responses rather than computing, so Python's GIL is not the bottleneck and multithreading gives a real speedup.
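To make that concrete, here is a minimal sketch (not from the original post; the URLs are placeholders) of how ThreadPoolExecutor overlaps the wait time of many requests:

import requests
from concurrent.futures import ThreadPoolExecutor

urls = ['http://example.com/page/%d' % i for i in range(10)]  # placeholder URLs

def fetch(url):
    # each thread blocks on network I/O; other threads run in the meantime
    return url, requests.get(url, timeout=5).status_code

with ThreadPoolExecutor(max_workers=5) as pool:
    for url, status in pool.map(fetch, urls):
        print(url, status)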

redis

Redis is used to distribute tasks and to stage the downloaded data, so if you want to run this you will need to install Redis. An earlier version also used MongoDB, but to keep things simple I trimmed it down to Redis only.

To install Redis on Windows, follow any standard install guide up to the step that launches redis-server, then stop there; redis-server must stay running while the crawler works.
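The task-distribution pattern the script below uses is simply a Redis list as a work queue: the producer RPUSHes serialized jobs and workers LPOP them until the list is empty. A minimal standalone sketch; the key name and job tuples here are illustrative, while the real script uses the key 'url_info':

import json
import redis

r = redis.Redis()  # assumes redis-server running on localhost:6379

# producer: enqueue chapter jobs as JSON
for job in [(0, '123.html', 'Chapter 1'), (1, '124.html', 'Chapter 2')]:
    r.rpush('demo_jobs', json.dumps(job))

# worker: pop jobs until the queue is drained
while True:
    raw = r.lpop('demo_jobs')
    if raw is None:
        break
    order, href, title = json.loads(raw)
    print(order, href, title)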

Proxies

Use a proxy only if its quality is decent; forget the free ones (I tried a few free proxies myself and they weren't up to it).
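For reference, requests takes proxies as a dict mapping a URL scheme to a proxy address, which is the shape the script's proxy property returns. A tiny sketch with a placeholder address:

import requests

proxies = {'http': 'http://127.0.0.1:8888'}  # placeholder proxy address
res = requests.get('http://example.com', proxies=proxies, timeout=5)
print(res.status_code)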

Code

'''
Install Redis and start the server.

Downloaded chapters are staged in Redis. With very large novels a single read
does not seem to return everything (a 10,000+ chapter test only returned about
9,000 chapters in one go), so the results are now read back in batches.

You need to supply two values:

1. The URL of the novel's table-of-contents page.
2. The index of the first chapter in that list, since some sites put the
   latest few chapters at the top. Default: 0.

If anything breaks, get in touch. Novel sites share roughly the same page
layout, so this should work on a few of them.
'''

import random
import redis
# import pymongo
import requests
import json
import logging
import threading
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
# from Proxyer import Proxyer

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")

class BookDownload:

    def __init__(self, chapter_link, start_place=0, abbr='http://', proxy=None):
        self.chapter_link = chapter_link  # URL of the table-of-contents page
        self.start_place = start_place    # index of the first chapter in the list
        self.abbr_link = chapter_link     # URL prefix for chapter pages; derived from chapter_link, so abbr no longer needs to be passed
        self.redis_client = redis.Redis()
        self.event = threading.Event()
        self.redis_list = 'url_info'
        self.redis_failed_list = 'failed_url'
        self.redis_cache = 'download'
        self._proxy = proxy
        # self.mongo_collect = pymongo.MongoClient().chapter_3.test3  # MongoDB; configure it yourself
        self.all_chapter = 0
        self.successed_download = 0
        self.session = requests.session()
        self.header = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}

    @property
    def proxy(self):
        # use a proxy only if it is good quality; free proxies are not recommended
        # (your own IP is fine -- the test site barely bans IPs)
        # expects a list or None
        if self._proxy:
            choosed_proxy = random.choice(self._proxy)
            return {'http': 'http://' + choosed_proxy}
        return None

    @proxy.setter
    def proxy(self, value):
        if isinstance(value, list) or value is None:
            self._proxy = value
        else:
            raise ValueError('must be list type or None')

    def get_all_chapter(self):
        res = self.session.get(self.chapter_link, headers=self.header, timeout=5)
        if res.status_code != 200:
            raise Exception("can't access the website")
        soup = BeautifulSoup(res.content.decode(), 'lxml')
        name_list = soup.find('div', attrs={'id': 'list'})
        dl_list = name_list.find('dl')
        wanted_download = dl_list.find_all('dd')[self.start_place:]
        self.all_chapter = len(wanted_download)
        for order, value in enumerate(wanted_download):
            # yields (chapter index, last path segment of the href, chapter title)
            yield order, value.a.get('href').rsplit('/')[-1], value.a.text

    def store_name_in_redis(self):
        """
        Calls get_all_chapter directly and pushes every chapter record
        onto the Redis task list.
        :return:
        """
        for info in self.get_all_chapter():
            try:
                self.redis_client.rpush(self.redis_list, json.dumps(info))
            except Exception as e:
                logging.info(e)

    def requests_one_link(self, detail_link, timeout):
        """
        Fetches and parses one chapter page in a single step.
        :param detail_link:
        :return: the chapter body text, or None on failure
        """
        try:
            res = self.session.get(detail_link, proxies=self.proxy, headers=self.header, timeout=timeout)
            text = res.content.decode()
            soup = BeautifulSoup(text, 'lxml')
            # the replacement pattern was garbled in the original post;
            # normalizing carriage returns is a plausible reconstruction
            zhengwen = soup.find('div', attrs={'id': 'content'}).text.replace('\r', '\n')
            return zhengwen
        except Exception as e:
            # raise e
            return None

    def _clear_redis(self):
        """
        Clears the Redis keys left over from any previous run.
        :return:
        """
        try:
            self.redis_client.delete(self.redis_list)
            self.redis_client.delete(self.redis_failed_list)
            self.redis_client.delete(self.redis_cache)
            # self.redis_client.lpop(self.redis_cache)
        except Exception:
            pass

    def init_work(self):
        self._clear_redis()

    def get_url(self, name):
        """
        Pops one chapter record off the given Redis list.
        :return: (order, after_link, name), or None when the list is empty
        """
        burl_info = self.redis_client.lpop(name)
        if burl_info:
            url_info = json.loads(burl_info)
            order, after_link, name = url_info
            return order, after_link, name
        return None

    def handle(self, order, after_link, name, timeout=2):
        """
        On success, store the chapter and return None; on failure, return the
        record so the callback can push it onto the failed queue.
        :param order:
        :param after_link:
        :param name:
        :return:
        """
        content = self.requests_one_link(self.abbr_link + after_link, timeout)
        if content:
            keys = name + '\n' + content
            # the chapter index is used as the sorted-set score, so chapters stay in reading order
            self.redis_client.zadd(self.redis_cache, {keys: order})
            logging.info('successfully downloaded {}'.format(name))
            # note: this counter is not strictly thread-safe, but it is close enough for a progress tally
            self.successed_download += 1
            # self.mongo_collect.insert_one({'order': order, 'name': name, 'content': content})
            # logging.info('successfully downloaded {}'.format(name))
            # self.successed_download += 1
            return None
        else:
            logging.info('failed to download {}'.format(name))
            return order, after_link, name

    def _callback(self, futures):
        """
        Done-callback: push any failed record onto the failed queue.
        :param futures:
        :return:
        """
        res = futures.result()
        if res:
            try:
                self.redis_client.rpush(self.redis_failed_list, json.dumps(res))
            except Exception as e:
                logging.info(e)

    def start_download(self, Pool: ThreadPoolExecutor):
        while True:
            info = self.get_url(self.redis_list)
            if info:
                futures = Pool.submit(self.handle, *info)
                futures.add_done_callback(self._callback)
            else:
                break
        self.event.set()  # signal the retry thread that enqueuing is done
        Pool.shutdown()   # block until every submitted download has finished

    def failed_download(self):
        """
        Retries chapters that failed on the first pass,
        at most three attempts each.
        :return:
        """
        if self.event.wait():
            while True:
                info = self.get_url(self.redis_failed_list)
                if info:
                    try_times = 3
                    while try_times:
                        if not self.handle(*info, timeout=3):
                            break  # handle returned None: success
                        try_times -= 1
                else:
                    break
        logging.info("=============end download==============")
        logging.info("===all chapter {}=== success download {}=====".format(self.all_chapter, self.successed_download))

    def start_failed_download(self):
        # run the retry pass in its own thread and wait for it to finish
        thread = threading.Thread(target=self.failed_download)
        thread.start()
        thread.join()

    def store_txt(self):
        txt = 'download.txt'
        # count = self.redis_client.zcard(self.redis_cache)
        # read the sorted set back in batches of ~1000; one huge read proved unreliable
        while self.redis_client.zcard(self.redis_cache):
            content = ''
            for x in self.redis_client.zrange(self.redis_cache, 0, 1000):
                content += x.decode() + '\n'
            with open(txt, 'a+', encoding='utf8') as f:
                f.write(content)
            self.redis_client.zremrangebyrank(self.redis_cache, 0, 1000)

if __name__ == '__main__':
    Pool = ThreadPoolExecutor(15)
    bookdownload = BookDownload('http://www.xbiquge.la/54/54101/', 0)
    bookdownload.init_work()
    bookdownload.store_name_in_redis()
    logging.info('=======================start================================')
    bookdownload.start_download(Pool)
    bookdownload.start_failed_download()
    bookdownload.store_txt()
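One design choice worth a note: chapters finish in whatever order the threads happen to complete, but because handle stores each chapter in a Redis sorted set with its chapter index as the score, zrange reads them back in reading order. A small sketch of that property (the key name and chapter texts here are illustrative, not from the original post):

import redis

r = redis.Redis()
# insert out of order; the score (chapter index) fixes the ordering
r.zadd('demo_chapters', {'Chapter 3 ...': 2, 'Chapter 1 ...': 0, 'Chapter 2 ...': 1})
print([m.decode() for m in r.zrange('demo_chapters', 0, -1)])
# ['Chapter 1 ...', 'Chapter 2 ...', 'Chapter 3 ...']
r.delete('demo_chapters')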

Test results

[Screenshots of the test run omitted.]

Original post: https://blog.csdn.net/qq_45667109/article/details/106041255
