# Baidu image scraper using a process pool (百度图片获取, 进程池)
# coding=utf-8
import multiprocessing
from selenium import webdriver
import requests
import os
import time
def geturls(key, scrollnum):
    """Search Baidu Images for *key* and collect the image URLs.

    Opens Chrome via selenium, types *key* into the search box (a trailing
    '\\n' in *key* submits the form), scrolls to the bottom *scrollnum*
    times to trigger AJAX loading of more results, then reads the
    'data-objurl' attribute of every '.imgitem' element.

    Parameters
    ----------
    key : str
        Search text sent to the search box.
    scrollnum : int
        Number of scroll-to-bottom actions (each loads one more batch).

    Returns
    -------
    list[list[str]]
        The URLs split into roughly 10 chunks, ready for Pool.map.
    """
    browser = webdriver.Chrome()
    try:
        browser.get('http://image.baidu.com/')
        browser.maximize_window()
        searchbox = browser.find_element_by_id('kw')
        searchbox.send_keys(key)
        for _ in range(scrollnum):
            time.sleep(1)  # let the previous batch render before scrolling again
            browser.execute_script(
                "window.scrollTo(0,document.body.scrollHeight);")
        browser.implicitly_wait(10)
        elements = browser.find_elements_by_class_name('imgitem')
        # BUG FIX: collect into a list directly. The original joined URLs
        # into a comma-separated string and re-split it; its `urls[:-1]`
        # was a no-op expression, so the final split produced a trailing
        # '' entry. Also drop None (missing attribute) entries, which
        # would have crashed the string concatenation.
        urls = [e.get_attribute('data-objurl') for e in elements]
        urls = [u for u in urls if u]
    finally:
        browser.close()  # always release the browser, even on error
    print(len(urls))
    # Split into ~10 chunks. BUG FIX: guard against a zero chunk size —
    # int(0.1 * len(urls)) is 0 for fewer than 10 URLs, and range() with
    # step 0 raises ValueError.
    n = max(1, int(0.1 * len(urls)))
    return [urls[i:i + n] for i in range(0, len(urls), n)]
def save_img(urls, root='I:/pics/佟丽娅/'):
    """Download every image URL in *urls* into the *root* directory.

    The file name is the last path segment of the URL plus a '.jpg'
    suffix. Existing files are skipped; any download error is reported
    and the loop continues with the next URL.

    Parameters
    ----------
    urls : list[str]
        Image URLs to fetch.
    root : str, optional
        Destination directory (created if missing). Defaults to the
        original hard-coded path so existing callers are unaffected.
    """
    # Hoisted out of the loop; exist_ok also avoids the race the old
    # exists()/mkdir() pair had between check and creation.
    os.makedirs(root, exist_ok=True)
    for url in urls:
        path = root + url.split('/')[-1] + '.jpg'
        if os.path.exists(path):
            print('图片已存在!')
            continue
        try:
            # Timeout added so a dead server cannot hang a pool worker
            # forever (requests.get blocks indefinitely by default).
            r = requests.get(url, timeout=10)
            with open(path, 'wb') as f:
                f.write(r.content)
            # NOTE: the redundant f.close() inside the with-block was
            # removed; the context manager already closes the file.
        except Exception:
            # Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.
            print('爬取失败')
if __name__ == '__main__':
    # Time the whole scrape: gather URL chunks first, then download each
    # chunk in its own worker process.
    t0 = time.time()
    # Two arguments: the search text (the '\n' submits the form) and the
    # number of AJAX scroll loads.
    url_chunks = geturls('佟丽娅\n', 2)
    time.sleep(1)
    workers = multiprocessing.Pool()
    workers.map(save_img, url_chunks)
    workers.close()
    workers.join()
    print(time.time() - t0)