# 第1关:多线程、多进程爬虫 (Level 1: multi-threaded / multi-process crawler)
import requests
from lxml import etree
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from multiprocessing import Pool
import os
import threading
import psutil
# Request-header spoofing: present a desktop Chrome User-Agent so the
# image server treats our crawler like an ordinary browser.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
}
def downimg(img_src):
    """Download one image from the local image server and save it as
    ``step1/images/<name>.jpg``, printing progress plus the thread/process
    IDs doing the work.

    Parameters
    ----------
    img_src : str
        Image path scraped from the index page (e.g. ``/imgs/1.jpg``).
    """
    start_time = time.time()
    # File name = last path segment with its extension stripped.
    name = img_src.split('/')[-1].split('.')[0]
    img_url = "http://127.0.0.1:8080" + img_src
    # Send the spoofed User-Agent and bound the wait: without a timeout a
    # dead server would hang this worker thread forever.
    img = requests.get(img_url, headers=header, timeout=10)
    dir_path = 'step1/images'
    # exist_ok avoids the check-then-create race of the exists()/makedirs()
    # pair, which can raise FileExistsError when several download threads
    # hit this line at the same time.
    os.makedirs(dir_path, exist_ok=True)
    img_path = dir_path + '/' + name + '.jpg'  # final storage path of the image
    print(img_url, name + '.jpg', '开始下载。。。')
    # current_thread() replaces the deprecated currentThread() alias.
    thread = threading.current_thread()
    process = psutil.Process(os.getpid())
    print("线程ID:%s, 进程ID:%s"
          % (thread.ident, process.pid))
    #********** Begin *********#
    # Save the image bytes to disk.
    with open(img_path, 'wb') as file:
        file.write(img.content)
    #********** End *********#
    # Report the per-image elapsed time (previously computed but never used).
    finisTime = time.time() - start_time
    print(name + '.jpg', 'done in %.2fs' % finisTime)
def parsePage():
    """Fetch the image index page, extract every image ``src``, and download
    them concurrently — one thread per image — then wait for all of them.
    """
    url = "http://127.0.0.1:8080/imgs/"
    # timeout keeps the crawler from blocking forever on an unreachable server.
    response = requests.get(url=url, headers=header, timeout=10)
    html_content = response.text
    #********** Begin *********#
    # Parse the listing page and pull the src attribute of every thumbnail.
    html = etree.HTML(html_content)
    item_list = html.xpath("//div[@class='box']/div/a/img/@src")
    print(item_list)
    s_time = time.time()
    #********** End *********#
    # Sequential (non-threaded) variant, kept for comparison:
    # for item in item_list:
    #     downimg(item)
    #********** Begin *********#
    # Threaded variant: spawn one worker per image, start them all, then
    # join so we only return once every download has finished.
    threads = [threading.Thread(target=downimg, args=(item,))
               for item in item_list]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    #********** End *********#
    # Report the total wall-clock time the threaded downloads took
    # (s_time was previously started but never read).
    print('total: %.2fs' % (time.time() - s_time))
# Script entry point: crawl the index page and download all listed images.
if __name__ == '__main__':
    parsePage()