For multi-threading, let's start from the script itself. All scraping is keyed on a categoryId, so we can build a task queue: a producer pushes the ids onto the queue (Queue.put), while consumers pull ids off it (Queue.get()) to read and download the files.
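The pattern itself is small. Here is a minimal, self-contained sketch of the idea; the names producer, worker and task_queue are illustrative only and do not appear in the final script:

# coding:utf-8
import threading
from queue import Queue, Empty

task_queue = Queue()

def producer():
    # The producer pushes task ids onto the queue
    for task_id in range(5):
        task_queue.put(task_id)

def worker():
    # Each consumer pulls ids until the queue is drained
    while True:
        try:
            task_id = task_queue.get_nowait()
        except Empty:
            break
        print('handling id', task_id)

producer()
threads = [threading.Thread(target=worker) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()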
This means reworking the earlier script. First, define a queue with Q_example = Queue(); in the categoryId function, put every id we fetch into Q_example; then rework download() so that it no longer loops over all the ids itself, and each call instead receives a single id (the consumer() function below). The final implementation looks like this:
# coding:utf-8
"""
author:@
"""
import os
import threading
import urllib.request as ur
from queue import Queue

import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "X-Requested-With": "XMLHttpRequest",
}

# Shared task queue: categoryId() produces ids, consumer() consumes them
Q_example = Queue()
def categoryId():
    """Fetch the category ids of the help documents (the producer side)."""
    document_page = requests.get("http://help.tongtool.com/category/buildCategoryTreeHavingChildren")
    document_result = document_page.json()
    # childrenList[0] is the ERP section, [1] is the Listing section
    erp_help = document_result.get('datas')[0].get('childrenList')[0]
    # Contents of the help documents
    help_infos = erp_help.get('childrenList')[0].get('childrenList')
    categoryIds = []
    for i in help_infos:
        # Collect the sub-entries under each first-level menu
        sub_info = i.get('childrenList')  # list
        for j in sub_info:
            categoryIds.append(j.get('id'))
            Q_example.put(j.get('id'))
    return categoryIds
def consumer(id):
    """Download the documents of a single category id (the consumer side)."""
    page = requests.get(
        "http://help.tongtool.com/docs/listDocsPage?categoryId={0}&pageSize=20&pageNum=1".format(id),
        headers=headers)
    result = page.json()
    try:
        contents = result.get('datas').get('list')
    except AttributeError:
        # No document list came back for this id; log the url and skip it
        print(page.url)
        return
    for content in contents:
        # Read the entries one by one when a category has several
        title = content.get('title')
        if not os.path.exists('d://tmp//{0}'.format(title)):
            try:
                os.makedirs('d://tmp//{0}'.format(title))
            except OSError:
                # The title is not a valid path name; truncate it
                title = title.split(":")[0]  # one entry fails here, handle it this way for now
                os.makedirs('d://tmp//{0}'.format(title))
        content_text = content.get('content')
        html = etree.HTML(content_text)
        # Parse the text content
        html_contents = html.xpath("//span/text()")
        file_text = '{0}.txt'.format(title)
        all_contents = ""
        for html_content in html_contents:
            all_contents = all_contents + html_content + '\n'
        with open('d:\\tmp\\{0}'.format(file_text), 'w', encoding='utf-8') as file:
            file.write(all_contents)
        # Parse the help images
        html_pages = html.xpath("//img/@src")
        # Store the images in the folder created above
        for page in html_pages:
            filename = page.split('/')[-1]
            print('about to download: ' + page)
            if 'help' not in page:
                print('skipping link: ' + page)
                continue
            if "http" not in page:
                page = "http:" + page
            ur.urlretrieve(page, os.path.join("d://tmp", title, filename))
def download():
    """The original single-threaded version, kept for the timing comparison."""
    categoryIds = categoryId()
    for id in categoryIds:
        page = requests.get(
            "http://help.tongtool.com/docs/listDocsPage?categoryId={0}&pageSize=20&pageNum=1".format(id),
            headers=headers)
        result = page.json()
        contents = result.get('datas').get('list')
        for content in contents:
            # Read the entries one by one when a category has several
            title = content.get('title')
            if not os.path.exists('d://tmp//{0}'.format(title)):
                try:
                    os.makedirs('d://tmp//{0}'.format(title))
                except OSError:
                    # The title is not a valid path name; truncate it
                    title = title.split(":")[0]  # one entry fails here, handle it this way for now
                    os.makedirs('d://tmp//{0}'.format(title))
            content_text = content.get('content')
            html = etree.HTML(content_text)
            # Parse the text content
            html_contents = html.xpath("//span/text()")
            file_text = '{0}.txt'.format(title)
            all_contents = ""
            for html_content in html_contents:
                all_contents = all_contents + html_content + '\n'
            with open('d:\\tmp\\{0}'.format(file_text), 'w', encoding='utf-8') as file:
                file.write(all_contents)
            # Parse the help images
            html_pages = html.xpath("//img/@src")
            # Store the images in the folder created above
            for page in html_pages:
                filename = page.split('/')[-1]
                print('about to download: ' + page)
                if 'help' not in page:
                    print('skipping link: ' + page)
                    continue
                if "http" not in page:
                    page = "http:" + page
                ur.urlretrieve(page, os.path.join("d://tmp", title, filename))
if __name__ == '__main__':
    import time
    start_time = time.strftime("%H:%M:%S", time.localtime())
    print(time.strftime("%H:%M:%S", time.localtime()))
    ids = categoryId()
    print(Q_example.qsize())
    while Q_example.qsize() != 0:
        print('length is ', Q_example.qsize())
        threads = []
        q_length = Q_example.qsize()
        # Take at most 10 ids per batch; the last batch may be smaller
        if q_length > 10:
            q_i = 10
        else:
            q_i = q_length
        for i in range(q_i):
            value = Q_example.get()
            print('value is ', value)
            t = threading.Thread(target=consumer, args=(value,))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:
            t.join()
    end_time = time.strftime("%H:%M:%S", time.localtime())
    print('start time : ', start_time)
    print('end time : ', end_time)
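As an aside, the standard library also offers a higher-level way to run the consumers than batching threads by hand. This sketch, which is not the approach used above, feeds the id list that categoryId() already returns into concurrent.futures.ThreadPoolExecutor:

from concurrent.futures import ThreadPoolExecutor

ids = categoryId()
# The pool keeps at most 10 worker threads busy and hands each one id at a time
with ThreadPoolExecutor(max_workers=10) as pool:
    pool.map(consumer, ids)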
The running times of the two versions compare as follows:
multi-threaded: start time : 10:50:03  end time : 10:50:28, 25 s in total
single-threaded: start time : 10:36:49  end time : 10:37:56, 67 s in total
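Subtracting wall-clock strings by hand is error-prone, so measuring the elapsed seconds directly is also an option. A small sketch, not part of the script above, using time.perf_counter():

import time

start = time.perf_counter()
# ... run the multi-threaded download here ...
elapsed = time.perf_counter() - start
print('took {0:.1f}s'.format(elapsed))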
The part of the code below originally read simply for i in range(10), but with that version the script never finished. It turned out that on the last pass the queue held fewer than 10 ids, so Queue.get(), which blocks by default, had nothing to return and hung there forever.
if q_length > 10:
    q_i = 10
else:
    q_i = q_length
for i in range(q_i):
    value = Q_example.get()
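An alternative to pre-computing the batch size is to make the get itself non-blocking. This is a sketch of that variant, not what the script above does: Queue.get_nowait() (equivalent to get(block=False)) raises queue.Empty instead of hanging once the queue runs dry:

from queue import Queue, Empty

Q_example = Queue()
batch = []
for i in range(10):
    try:
        # Raises Empty instead of blocking forever when the queue is exhausted
        batch.append(Q_example.get_nowait())
    except Empty:
        break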