My First Web Crawler (Part 2)


1. Using a Thread Pool

Code added:

with ThreadPoolExecutor(max_workers=10) as thread_pool:
    content_chapter_list = thread_pool.map(
        self.__get_content_chapter, link_chapter)

The map() method submits the tasks to the pool and runs them concurrently, with the number of worker threads capped at 10. When the tasks complete, the results come back in the same order as the input list, so the chapters do not get shuffled. The first argument to map() is the function each thread runs; the second is the iterable of arguments passed to that function. It is best not to set the thread count too high, to stay friendly to the target server.
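
To make the order guarantee concrete, here is a minimal standalone sketch (separate from the crawler; the fetch() helper and its sleep times are made up for illustration) showing that map() returns results in input order even when tasks finish out of order:

from concurrent.futures import ThreadPoolExecutor
import time

def fetch(n):
    # later inputs sleep less, so they finish first
    time.sleep(0.1 * (5 - n))
    return n

with ThreadPoolExecutor(max_workers=10) as pool:
    results = list(pool.map(fetch, range(5)))

print(results)  # [0, 1, 2, 3, 4] -- input order, not completion order

Note that if a task raises an exception, map() re-raises it when that task's result is read from the iterator, which is why the crawler below handles errors inside the worker function itself.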

2. Complete Code

from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
import requests
import time
import sys
import threading


class fiction():

    __chapter_download = 0   # number of chapters downloaded so far
    __chapter_total = 0  # total number of chapters

    def __init__(self, name, url_ws, url_lp, encode, attrs_div_lp={}, attrs_div_ct={}):
        self.__name = name  # novel title
        self.__url_ws = url_ws  # base URL of the site
        self.__url_lp = url_lp  # URL of the link (table of contents) page
        self.__attrs_div_lp = attrs_div_lp  # attributes of the div holding the chapter links on the TOC page
        self.__attrs_div_ct = attrs_div_ct  # attributes of the div holding the text on a chapter page
        self.__encode = encode  # encoding used for the chapter pages and the output file
        self.__lock = threading.Lock()  # protects the shared progress counter across worker threads

    def Update(self, name, url_ws, url_lp, encode, attrs_div_lp={}, attrs_div_ct={}):
        '''Reset the parameters.

        All parameters must be reset at the same time, otherwise errors may occur.

        '''
        self.__name = name  # novel title
        self.__url_ws = url_ws  # base URL of the site
        self.__url_lp = url_lp  # URL of the link (table of contents) page
        self.__attrs_div_lp = attrs_div_lp  # attributes of the div holding the chapter links on the TOC page
        self.__attrs_div_ct = attrs_div_ct  # attributes of the div holding the text on a chapter page
        self.__encode = encode

    def __get_Link_chapter(self):
        '''Collect the URL of every chapter from the table-of-contents page.

        Parse the TOC page, locate the div holding the chapter links by its
        attributes, and return the list of <a> tags found inside it.

        '''

        # If the request raises a connection or timeout error, wait 1 s and retry,
        # up to 10 attempts in total
        req_lp = None
        for try_counter in range(10):
            try:
                req_lp = requests.get(self.__url_lp, timeout=10)
                break
            except requests.exceptions.ConnectionError:
                print('Fetching TOC page, ConnectionError: attempt %d' % (try_counter + 1))
            except requests.exceptions.Timeout:
                print('Fetching TOC page, Timeout: attempt %d' % (try_counter + 1))
            except Exception:
                print('Fetching TOC page, other error: attempt %d' % (try_counter + 1))
            time.sleep(1)

        if req_lp is None:
            print('Failed to fetch the TOC page')
            return
        else:
            try:
                req_lp.encoding = req_lp.apparent_encoding
                # Build a BeautifulSoup object, using the lxml parser
                bs_lp = BeautifulSoup(req_lp.text, 'lxml')
                # Find every div tag matching the given attributes
                div_list = bs_lp.find_all('div', attrs=self.__attrs_div_lp)
                # Collect all the <a> tags inside those divs
                link_chapter = []
                for div in div_list:
                    link_chapter += div.find_all('a')
                return link_chapter
            except TypeError:
                print('TOC page parsing error: TypeError')
                return
            # except Exception:
            #     print('TOC page parsing error: other error')
            #     return

    def __get_content_chapter(self, link):
        '''Download the content of one chapter.

        :param link: an <a> tag parsed from the TOC page,
                     containing the chapter name and URL

        '''
        
        name_chapter = link.string
        url_chapter = self.__url_ws + link['href']  # join the site URL and the relative link to get the chapter page URL
        req_ct = None
        for try_counter in range(10):
            try:
                req_ct = requests.get(url_chapter, timeout=10)
                break
            except requests.exceptions.ConnectionError:
                print('Fetching chapter page, ConnectionError: attempt %d' % (try_counter + 1))
            except requests.exceptions.Timeout:
                print('Fetching chapter page, Timeout: attempt %d' % (try_counter + 1))
            except Exception:
                print('Fetching chapter page, other error: attempt %d' % (try_counter + 1))
            time.sleep(1)

        if req_ct is None:
            print('Failed to fetch chapter: ' + name_chapter)
            content_chapter = name_chapter + '\n\n'
        else:
            try:
                req_ct.encoding = self.__encode
                bs_ct = BeautifulSoup(
                    req_ct.text, 'lxml')
                content = bs_ct.find(
                    'div', attrs=self.__attrs_div_ct)
                content = str(content).replace('<br/>', '\n').replace('\xa0', ' ')
                content = BeautifulSoup(content, 'lxml').get_text()
                content_chapter = name_chapter + '\n\n' + content + '\n\n'
            except TypeError:
                print('Chapter page parsing error: TypeError ' + name_chapter)
                content_chapter = name_chapter + '\n\n'
            except Exception:
                print('Chapter page parsing error: other error ' + name_chapter)
                content_chapter = name_chapter + '\n\n'

        # Update the shared progress counter; the lock keeps the read-modify-write
        # safe when several worker threads finish at the same time
        with self.__lock:
            self.__chapter_download += 1
            progress = self.__chapter_download / self.__chapter_total * 100
        sys.stdout.write('Download progress: %.1f%%\r' % progress)
        sys.stdout.flush()
        return content_chapter

    def write(self, path_save):
        '''Write the downloaded book to the given directory.

        :param path_save: directory in which to save the file

        '''
        path_save = path_save + '\\' + self.__name + '.txt'
        link_chapter = self.__get_Link_chapter()
        if link_chapter is None:
            return
        self.__chapter_total = len(link_chapter)
        self.__chapter_download = 0  # reset the progress counter in case the object is reused
        # Start the thread pool
        with ThreadPoolExecutor(max_workers=10) as thread_pool:
            content_chapter_list = thread_pool.map(
                self.__get_content_chapter, link_chapter)

        with open(path_save, 'w+', encoding=self.__encode) as file:
            for content_chapter in content_chapter_list:
                file.write(content_chapter)
        print('<<' + self.__name + '>> download finished')


if __name__ == '__main__':
    start = time.time()
    f = fiction(name='雪中悍刀行',
                url_ws='http://www.xbiquge.la',
                url_lp='http://www.xbiquge.la/0/745/',
                attrs_div_lp={'id': 'list'},
                attrs_div_ct={'id': 'content'},
                encode='utf-8')
    f.write(r'C:\Users\HP\Desktop\pytxt')
    stop = time.time()
    print('Elapsed time: %ds' % (stop - start))
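
A note on the design choice: map() is used because it hands back results in the same order as the chapter list, which is exactly what a single output file needs. If order did not matter (say, each chapter were saved to its own file), the same pool could be driven with submit() and as_completed(), which yields each result as soon as it is ready. A minimal sketch, assuming a hypothetical download() function:

from concurrent.futures import ThreadPoolExecutor, as_completed

def download(url):
    # placeholder for a real request; here it just returns the URL length
    return len(url)

urls = ['http://example.com/a', 'http://example.com/b', 'http://example.com/c']

with ThreadPoolExecutor(max_workers=10) as pool:
    future_to_url = {pool.submit(download, u): u for u in urls}
    for future in as_completed(future_to_url):
        print(future_to_url[future], '->', future.result())  # printed in completion order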
