Python: scraping a site's help documentation with lxml and requests ----- the multithreaded version

For the multithreaded version, start from how the script already works: every fetch is driven by a categoryId, so we can build a task queue around those ids. A producer pushes each id onto the queue (Queue.put), and consumers pull ids off the queue (Queue.get) to read the document content and download its images.
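As a minimal sketch of that producer/consumer pattern (the produce/consume helpers and the sample ids are illustrative only, not part of the final script):

import threading
from queue import Queue, Empty

task_queue = Queue()

def produce(ids):
    # Producer: push every categoryId onto the task queue
    for i in ids:
        task_queue.put(i)

def consume():
    # Consumer: drain the queue; get_nowait() raises Empty
    # instead of blocking forever once the queue runs dry
    while True:
        try:
            category_id = task_queue.get_nowait()
        except Empty:
            break
        print('handling', category_id)

produce([101, 102, 103])
workers = [threading.Thread(target=consume) for _ in range(2)]
for w in workers:
    w.start()
for w in workers:
    w.join()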

That means reworking the earlier script. First define a queue, Q_example = Queue(). In the categoryId function, put every id that is collected into Q_example. Then rework download() into a consumer function: drop the loop that fetched the ids one by one, and have each call handle a single id handed to it. The final implementation is as follows:

# coding:utf-8
"""
author:@
"""
import os
import threading
from queue import Queue
import urllib.request as ur

import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "X-Requested-With": "XMLHttpRequest",
}
"获取 文档iD"
Q_example=Queue()
def categoryId():
    "获取文档对应的id"
    document_page=requests.get("http://help.tongtool.com/category/buildCategoryTreeHavingChildren")
    ducument_result = document_page.json()
    erp_help=ducument_result.get('datas')[0].get('childrenList')[0]  #get('childrenList')[0]为ERP [1]为listing []

    "得到帮助文档内容"
    help_infos=erp_help.get('childrenList')[0].get('childrenList')
    categoryIds=[]
    for i in help_infos:
        "获取一级菜单下面的子信息"
        sub_info=i.get('childrenList')  #list
        for j in sub_info:
            categoryIds.append(j.get('id'))
            Q_example.put(j.get('id'))
    return  categoryIds
def consumer(category_id):
    "Handle a single category id pulled from the queue"
    page = requests.get(
        "http://help.tongtool.com/docs/listDocsPage?categoryId={0}&pageSize=20&pageNum=1".format(category_id),
        headers=headers)
    result = page.json()
    try:
        contents = result.get('datas').get('list')
    except AttributeError:
        # 'datas' missing from the response; log the url and give up
        print(page.url)
        return
    for content in contents:
        # A category can hold several documents; handle them one by one
        title = content.get('title')
        if not os.path.exists('d://tmp//{0}'.format(title)):
            try:
                os.makedirs('d://tmp//{0}'.format(title))
            except OSError:
                # Some titles contain characters that are invalid in a
                # directory name; keep only the part before the colon
                title = title.split(":")[0]
                os.makedirs('d://tmp//{0}'.format(title))

        content_text = content.get('content')
        html = etree.HTML(content_text)
        # Extract the text content
        html_contents = html.xpath("//span/text()")
        file_text = '{0}.txt'.format(title)
        all_contents = ""
        for html_content in html_contents:
            all_contents = all_contents + html_content + '\n'
        with open('d:\\tmp\\{0}'.format(file_text), 'w', encoding='utf-8') as file:
            file.write(all_contents)
        # Extract the help images and store them in the per-title folder
        html_pages = html.xpath("//img/@src")
        for img_src in html_pages:
            filename = img_src.split('/')[-1]
            print('about to download: ' + img_src)
            if 'help' not in img_src:
                print('skipping link: ' + img_src)
                continue
            if "http" not in img_src:
                img_src = "http:" + img_src
            ur.urlretrieve(img_src, os.path.join("d://tmp", title, filename))

def download():
    "Original single-threaded version, kept for the timing comparison"
    categoryIds = categoryId()
    for category_id in categoryIds:
        page = requests.get(
            "http://help.tongtool.com/docs/listDocsPage?categoryId={0}&pageSize=20&pageNum=1".format(category_id),
            headers=headers)
        result = page.json()
        contents = result.get('datas').get('list')
        for content in contents:
            # A category can hold several documents; handle them one by one
            title = content.get('title')
            if not os.path.exists('d://tmp//{0}'.format(title)):
                try:
                    os.makedirs('d://tmp//{0}'.format(title))
                except OSError:
                    # Invalid characters in the title; keep the part before the colon
                    title = title.split(":")[0]
                    os.makedirs('d://tmp//{0}'.format(title))

            content_text = content.get('content')
            html = etree.HTML(content_text)
            # Extract the text content
            html_contents = html.xpath("//span/text()")
            file_text = '{0}.txt'.format(title)
            all_contents = ""
            for html_content in html_contents:
                all_contents = all_contents + html_content + '\n'
            with open('d:\\tmp\\{0}'.format(file_text), 'w', encoding='utf-8') as file:
                file.write(all_contents)
            # Extract the help images and store them in the per-title folder
            html_pages = html.xpath("//img/@src")
            for img_src in html_pages:
                filename = img_src.split('/')[-1]
                print('about to download: ' + img_src)
                if 'help' not in img_src:
                    print('skipping link: ' + img_src)
                    continue
                if "http" not in img_src:
                    img_src = "http:" + img_src
                ur.urlretrieve(img_src, os.path.join("d://tmp", title, filename))

if __name__ == '__main__':
    import time
    start_time = time.strftime("%H:%M:%S", time.localtime())
    print(time.strftime("%H:%M:%S", time.localtime()))
    ids = categoryId()  # fills Q_example as a side effect
    print(Q_example.qsize())
    while Q_example.qsize() != 0:
        print('length is ', Q_example.qsize())
        threads = []
        q_length = Q_example.qsize()
        # Spawn at most 10 threads per batch; on the last batch there may
        # be fewer than 10 ids left, so cap the count at the queue size
        if q_length > 10:
            q_i = 10
        else:
            q_i = q_length
        for i in range(q_i):
            value = Q_example.get()
            print('value is ', value)
            t = threading.Thread(target=consumer, args=(value,))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:
            t.join()
    end_time = time.strftime("%H:%M:%S", time.localtime())
    print('start time : ', start_time)
    print('end time : ', end_time)

The running times of the two versions compare as follows (the work is almost entirely network I/O, which is why threads give a real speedup despite Python's GIL):

Multithreaded: start time : 10:50:03, end time : 10:50:28, 25s in total

Single-threaded: start time : 10:36:49, end time : 10:37:56, 67s in total

One part of the code deserves a note. The first version simply wrote for i in range(10), but the script would then never finish. It turned out that on the last batch, when fewer than 10 ids were left, Q_example.get() blocked waiting for data that would never arrive, so those extra threads hung forever. Capping the batch size at the queue length fixes it:

        if q_length > 10:
            q_i = 10
        else:
            q_i = q_length
        for i in range(q_i):
            value = Q_example.get()
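
Another way to avoid the hang altogether, as a sketch assuming consumer() and categoryId() are defined as above (the helper name run_with_pool is illustrative), is to let concurrent.futures.ThreadPoolExecutor manage a fixed pool of 10 workers. The pool hands ids to consumer directly, so no thread ever blocks on an empty queue:

from concurrent.futures import ThreadPoolExecutor

def run_with_pool():
    # categoryId() also returns the id list directly, so the pool can
    # map consumer over it without touching Q_example at all
    ids = categoryId()
    with ThreadPoolExecutor(max_workers=10) as pool:
        # list() forces the lazy map iterator so worker exceptions surface here
        list(pool.map(consumer, ids))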

 
