基本流程
-
创建 PoolManager 实例,后续所有请求都通过它发送
req = urllib3.PoolManager()
-
建立连接,构造http请求
data = req.request('GET', 'http://www.weimeitupian.com/page/{}'.format(page))
-
分析网页源代码,用正则表达式抓取图片地址和标题(alt):
item = re.findall(r'</a></div>-->.*?<img src="(.*?)" alt="(.*?)" class="thumb" />', data.data.decode(), re.S)
-
清洗数据并分类(跳过地址为空的条目,按图片的 alt 文本分文件夹保存)
完整代码:
import re, urllib3, os
def save_image(items):
    """Download every image in *items* into a per-title folder under ./images.

    Args:
        items: iterable of ``(src, alt)`` pairs as captured by the page
            regex. ``src`` is the image URL (absolute or relative to the
            site) and ``alt`` is used as the name of the sub-folder.

    Entries whose ``src`` is empty are skipped. The function relies on the
    module-level ``req`` object (created by the script entry point) to
    perform the HTTP requests, and returns ``None``.
    """
    # Function-scope import so the block does not depend on the file header.
    from urllib.parse import urljoin

    for item in items:
        if not item[0]:
            continue
        # NOTE(review): the alt text is used as a directory name unsanitized;
        # confirm the site never emits path separators or ".." in it.
        image_path = './images/{}'.format(item[1])
        # makedirs also creates the parent ./images folder and tolerates an
        # existing directory, unlike the exists()+mkdir() pair it replaces.
        os.makedirs(image_path, exist_ok=True)
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # against the site root without producing a doubled slash.
        image_url = urljoin('http://www.weimeitupian.com/', item[0])
        # Blocking network I/O: this call waits for the full response.
        image_content = req.request('GET', image_url)
        with open('{}/{}'.format(image_path, item[0].split('/')[-1]), 'wb') as f:
            f.write(image_content.data)
if __name__ == '__main__':
    # Shared connection pool; save_image() reads this module-level name.
    req = urllib3.PoolManager()
    page_url_template = 'http://www.weimeitupian.com/page/{}'
    # Captures (src, alt) for every thumbnail image on a listing page.
    thumb_pattern = r'</a></div>-->.*?<img src="(.*?)" alt="(.*?)" class="thumb" />'
    for page in (1, 2, 3):
        print('正在下载第{}页的数据...'.format(page))
        response = req.request('GET', page_url_template.format(page))
        html = response.data.decode()
        matches = re.findall(thumb_pattern, html, re.S)
        save_image(matches)
使用多线程(每张图片的下载放到单独的线程中执行):
import re
import urllib3
import os
import time
from threading import Thread
def image_request(url, item, image_path):
    """Fetch *url* and write the response body to a file in *image_path*.

    Args:
        url: address of the image to request.
        item: regex match tuple; the last ``/``-separated segment of
            ``item[0]`` becomes the file name.
        image_path: directory the file is written into.

    Uses the module-level ``req`` object for the HTTP request.
    """
    # Blocking network I/O: waits until the whole response has arrived.
    response = req.request('GET', url)
    file_name = item[0].split('/')[-1]
    target = '{}/{}'.format(image_path, file_name)
    with open(target, 'wb') as out:
        out.write(response.data)
def save_image(items):
    """Download every image in *items* concurrently, one thread per image.

    Args:
        items: iterable of ``(src, alt)`` pairs as captured by the page
            regex. ``src`` is the image URL (absolute or relative to the
            site) and ``alt`` is used as the name of the sub-folder under
            ``./images``.

    Entries whose ``src`` is empty are skipped. Each download runs
    ``image_request`` in its own thread; the function waits for all of
    them before returning ``None``, so a caller that measures elapsed time
    around this call sees the real download time rather than only the time
    needed to start the threads.
    """
    # Function-scope import so the block does not depend on the file header.
    from urllib.parse import urljoin

    workers = []
    for item in items:
        if not item[0]:
            continue
        # NOTE(review): the alt text is used as a directory name unsanitized;
        # confirm the site never emits path separators or ".." in it.
        image_path = './images/{}'.format(item[1])
        # makedirs also creates the parent ./images folder and tolerates an
        # existing directory, unlike the exists()+mkdir() pair it replaces.
        os.makedirs(image_path, exist_ok=True)
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # against the site root without producing a doubled slash.
        image_url = urljoin('http://www.weimeitupian.com/', item[0])
        # Create and start one worker thread per image.
        t = Thread(target=image_request, args=(image_url, item, image_path))
        t.start()
        workers.append(t)
    # Wait for every download of this batch to finish before returning.
    for t in workers:
        t.join()
if __name__ == '__main__':
    started = time.time()
    # Shared connection pool; image_request() reads this module-level name.
    req = urllib3.PoolManager()
    page_url_template = 'http://www.weimeitupian.com/page/{}'
    # Captures (src, alt) for every thumbnail image on a listing page.
    thumb_pattern = r'</a></div>-->.*?<img src="(.*?)" alt="(.*?)" class="thumb" />'
    for page in (1, 2, 3):
        print('正在下载第{}页的数据...'.format(page))
        response = req.request('GET', page_url_template.format(page))
        html = response.data.decode()
        matches = re.findall(thumb_pattern, html, re.S)
        save_image(matches)
    # Elapsed wall-clock time up to the point the loop above returned.
    finished = time.time()
    print('耗时{}s'.format(finished - started))