Python: scraping a site's help documentation with lxml and requests ----- the multithreaded version

For the multithreaded version, start from how the script already works: every fetch is driven by a categoryId, so we can build a task queue around those ids. A producer pushes each id onto the queue (Queue.put), and consumers pull ids off the queue (Queue.get) to read the document content and download its images.
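As a minimal sketch of that producer/consumer pattern (the produce/consume helpers and the sample ids are illustrative only, not part of the final script):

import threading
from queue import Queue, Empty

task_queue = Queue()

def produce(ids):
    # Producer: push every categoryId onto the task queue
    for i in ids:
        task_queue.put(i)

def consume():
    # Consumer: drain the queue; get_nowait() raises Empty
    # instead of blocking forever once the queue runs dry
    while True:
        try:
            category_id = task_queue.get_nowait()
        except Empty:
            break
        print('handling', category_id)

produce([101, 102, 103])
workers = [threading.Thread(target=consume) for _ in range(2)]
for w in workers:
    w.start()
for w in workers:
    w.join()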

That means reworking the earlier script. First define a queue, Q_example = Queue(). In the categoryId function, put every id that is collected into Q_example. Then rework download() into a consumer function: drop the loop that fetched the ids one by one, and have each call handle a single id handed to it. The final implementation is as follows:

# coding:utf-8
"""
author:@
"""
import os
import threading
from queue import Queue
import urllib.request as ur

import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "X-Requested-With": "XMLHttpRequest",
}
"获取 文档iD"
Q_example=Queue()
def categoryId():
    "获取文档对应的id"
    document_page=requests.get("http://help.tongtool.com/category/buildCategoryTreeHavingChildren")
    ducument_result = document_page.json()
    erp_help=ducument_result.get('datas')[0].get('childrenList')[0]  #get('childrenList')[0]为ERP [1]为listing []

    "得到帮助文档内容"
    help_infos=erp_help.get('childrenList')[0].get('childrenList')
    categoryIds=[]
    for i in help_infos:
        "获取一级菜单下面的子信息"
        sub_info=i.get('childrenList')  #list
        for j in sub_info:
            categoryIds.append(j.get('id'))
            Q_example.put(j.get('id'))
    return  categoryIds
def consumer(category_id):
    "Handle a single category id pulled from the queue"
    page = requests.get(
        "http://help.tongtool.com/docs/listDocsPage?categoryId={0}&pageSize=20&pageNum=1".format(category_id),
        headers=headers)
    result = page.json()
    try:
        contents = result.get('datas').get('list')
    except AttributeError:
        # 'datas' missing from the response; log the url and give up
        print(page.url)
        return
    for content in contents:
        # A category can hold several documents; handle them one by one
        title = content.get('title')
        if not os.path.exists('d://tmp//{0}'.format(title)):
            try:
                os.makedirs('d://tmp//{0}'.format(title))
            except OSError:
                # Some titles contain characters that are invalid in a
                # directory name; keep only the part before the colon
                title = title.split(":")[0]
                os.makedirs('d://tmp//{0}'.format(title))

        content_text = content.get('content')
        html = etree.HTML(content_text)
        # Extract the text content
        html_contents = html.xpath("//span/text()")
        file_text = '{0}.txt'.format(title)
        all_contents = ""
        for html_content in html_contents:
            all_contents = all_contents + html_content + '\n'
        with open('d:\\tmp\\{0}'.format(file_text), 'w', encoding='utf-8') as file:
            file.write(all_contents)
        # Extract the help images and store them in the per-title folder
        html_pages = html.xpath("//img/@src")
        for img_src in html_pages:
            filename = img_src.split('/')[-1]
            print('about to download: ' + img_src)
            if 'help' not in img_src:
                print('skipping link: ' + img_src)
                continue
            if "http" not in img_src:
                img_src = "http:" + img_src
            ur.urlretrieve(img_src, os.path.join("d://tmp", title, filename))

def download():
    "Original single-threaded version, kept for the timing comparison"
    categoryIds = categoryId()
    for category_id in categoryIds:
        page = requests.get(
            "http://help.tongtool.com/docs/listDocsPage?categoryId={0}&pageSize=20&pageNum=1".format(category_id),
            headers=headers)
        result = page.json()
        contents = result.get('datas').get('list')
        for content in contents:
            # A category can hold several documents; handle them one by one
            title = content.get('title')
            if not os.path.exists('d://tmp//{0}'.format(title)):
                try:
                    os.makedirs('d://tmp//{0}'.format(title))
                except OSError:
                    # Invalid characters in the title; keep the part before the colon
                    title = title.split(":")[0]
                    os.makedirs('d://tmp//{0}'.format(title))

            content_text = content.get('content')
            html = etree.HTML(content_text)
            # Extract the text content
            html_contents = html.xpath("//span/text()")
            file_text = '{0}.txt'.format(title)
            all_contents = ""
            for html_content in html_contents:
                all_contents = all_contents + html_content + '\n'
            with open('d:\\tmp\\{0}'.format(file_text), 'w', encoding='utf-8') as file:
                file.write(all_contents)
            # Extract the help images and store them in the per-title folder
            html_pages = html.xpath("//img/@src")
            for img_src in html_pages:
                filename = img_src.split('/')[-1]
                print('about to download: ' + img_src)
                if 'help' not in img_src:
                    print('skipping link: ' + img_src)
                    continue
                if "http" not in img_src:
                    img_src = "http:" + img_src
                ur.urlretrieve(img_src, os.path.join("d://tmp", title, filename))

if __name__ == '__main__':
    import time
    start_time = time.strftime("%H:%M:%S", time.localtime())
    print(time.strftime("%H:%M:%S", time.localtime()))
    ids = categoryId()  # fills Q_example as a side effect
    print(Q_example.qsize())
    while Q_example.qsize() != 0:
        print('length is ', Q_example.qsize())
        threads = []
        q_length = Q_example.qsize()
        # Spawn at most 10 threads per batch; on the last batch there may
        # be fewer than 10 ids left, so cap the count at the queue size
        if q_length > 10:
            q_i = 10
        else:
            q_i = q_length
        for i in range(q_i):
            value = Q_example.get()
            print('value is ', value)
            t = threading.Thread(target=consumer, args=(value,))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:
            t.join()
    end_time = time.strftime("%H:%M:%S", time.localtime())
    print('start time : ', start_time)
    print('end time : ', end_time)

The running times of the two versions compare as follows (the work is almost entirely network I/O, which is why threads give a real speedup despite Python's GIL):

Multithreaded: start time : 10:50:03, end time : 10:50:28, 25s in total

Single-threaded: start time : 10:36:49, end time : 10:37:56, 67s in total

One part of the code deserves a note. The first version simply wrote for i in range(10), but the script would then never finish. It turned out that on the last batch, when fewer than 10 ids were left, Q_example.get() blocked waiting for data that would never arrive, so those extra threads hung forever. Capping the batch size at the queue length fixes it:

        if q_length > 10:
            q_i = 10
        else:
            q_i = q_length
        for i in range(q_i):
            value = Q_example.get()
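
Another way to avoid the hang altogether, as a sketch assuming consumer() and categoryId() are defined as above (the helper name run_with_pool is illustrative), is to let concurrent.futures.ThreadPoolExecutor manage a fixed pool of 10 workers. The pool hands ids to consumer directly, so no thread ever blocks on an empty queue:

from concurrent.futures import ThreadPoolExecutor

def run_with_pool():
    # categoryId() also returns the id list directly, so the pool can
    # map consumer over it without touching Q_example at all
    ids = categoryId()
    with ThreadPoolExecutor(max_workers=10) as pool:
        # list() forces the lazy map iterator so worker exceptions surface here
        list(pool.map(consumer, ids))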

 
