Preface
By crawling ten thousand Google Images results, this article offers a quick hands-on introduction to the two major crawling tools, Selenium and Requests. The approach is to first use Selenium to locate the image links on the Google Images results page and save them locally, then use the requests library to download the images from those locally saved links.
I. Locating image links with selenium
1. Reading in the keywords for the images to crawl
Code for reading in the keyword CSV file:
def load_date(file_name):
    with open(file_name) as f:
        reader = csv.reader(f)
        header_row = next(reader)  # skip the header row
        content = []
        for row in reader:
            print(row)
            content.append(row)
    return content  # return the list of keyword rows from the file
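For reference, the keyword file (the 1.csv used in the full code below) is expected to have a header row followed by one keyword per row; a hypothetical call looks like this (the sample contents shown in the comments are my assumption, not taken from the original):

# Hypothetical layout of 1.csv: a header row, then one keyword per row
# keyword
# barber
# bakery
content = load_date(r'C:\Users\HP\Desktop\graduate student\1.csv')
print(content)  # e.g. [['barber'], ['bakery'], ...]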
2. Locating elements and extracting links
Initializing the webdriver object (excerpted; the options object is configured in the full code in section 3):
# Use a proxy IP
options.add_argument('--proxy-server=http://127.0.0.1:10809')
wd = webdriver.Chrome(service=Service(r'D:\Study\chromedriver.exe'), options=options)
wd.implicitly_wait(5)
First open the Google Images site and type in any keyword.
Using the browser's built-in developer tools, find the search input box via XPath or a CSS selector (we need to type each keyword into it to search for images), as well as the link of each image, as shown in the figure:
It is easy to see that the input box can be located with:
element = wd.find_element(By.XPATH, "//*[@id='REsRA']")
and each image can be located with:
elements = wd.find_elements(By.CSS_SELECTOR, "#islrg > div.islrc > div > a img")
While scrolling down you will notice that, without clicking "Show more results", Google Images displays 400 images; here we skip that button and simply grab 400 images per keyword. However, Selenium must not jump straight to the bottom of the page, or a large number of image links will be missing (their src comes back empty); instead, scroll down a little at a time and collect the links gradually. The code is as follows:
for i in range(32):
    # Get the initial page height
    # js = "return action=document.body.scrollHeight"
    # height = wd.execute_script(js)
    # print(height)  # height is 3512
    # Scroll the page down one step
    wd.execute_script('window.scrollBy(0, 1000)')
    sleep(5)
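As a variant, instead of hard-coding 32 iterations you can keep stepping until the viewport reaches the bottom of the page; the following is only a sketch of that idea (scroll_in_steps, step, pause, and max_rounds are my own names, not from the original code):

from time import sleep

def scroll_in_steps(wd, step=1000, pause=2, max_rounds=100):
    # Scroll down in small increments so lazily loaded images have time to
    # render, and stop once the viewport reaches the bottom of the page.
    for _ in range(max_rounds):
        wd.execute_script(f'window.scrollBy(0, {step})')
        sleep(pause)
        at_bottom = wd.execute_script(
            'return window.pageYOffset + window.innerHeight'
            ' >= document.body.scrollHeight - 1')
        if at_bottom:
            break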
During scraping, multithreading can speed things up:
def multi_thread(content):
    print("multi_thread begin")
    threads = []
    # Create one thread per keyword to fetch the image links of 25 keywords
    for i in range(min(25, len(content))):  # guard against fewer than 25 rows
        threads.append(
            Thread(target=craw, args=(",".join(content[i]),))
        )
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    print("multi_thread end")
Finally, save the links captured by Selenium to local .txt files.
3. Full code
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from time import sleep
import time
import csv
from threading import Thread


def load_date(file_name):
    with open(file_name) as f:
        reader = csv.reader(f)
        header_row = next(reader)  # skip the header row
        content = []
        for row in reader:
            print(row)
            content.append(row)
    return content


def multi_thread(content):
    print("multi_thread begin")
    threads = []
    # Create one thread per keyword to fetch the image links of 25 keywords
    for i in range(min(25, len(content))):  # guard against fewer than 25 rows
        threads.append(
            Thread(target=craw, args=(",".join(content[i]),))
        )
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    print("multi_thread end")


def craw(key_word):
    prefs = {
        'profile.default_content_setting_values': {
            'notifications': 2  # suppress Chrome notifications
        },
        'credentials_enable_service': False,  # disable Chrome's save-password prompt
        'profile.password_manager_enabled': False  # disable the built-in password manager
    }
    # Create a configuration object
    options = webdriver.ChromeOptions()
    options.add_experimental_option('prefs', prefs)
    options.add_experimental_option('excludeSwitches', ['enable-automation'])  # hide the "Chrome is being controlled by automated software" banner
    # Headless-mode options
    # options.add_argument('--no-sandbox')  # work around the "DevToolsActivePort file doesn't exist" error
    # options.add_argument('--disable-dev-shm-usage')
    # options.add_argument('--disable-gpu')  # Google's docs mention this flag to avoid a bug
    # options.add_argument('--headless')
    # Use a proxy IP
    options.add_argument('--proxy-server=http://127.0.0.1:10809')
    wd = webdriver.Chrome(service=Service(r'D:\Study\chromedriver.exe'), options=options)
    wd.implicitly_wait(5)
    url = "https://www.google.com/search?q=barber&sxsrf=ALiCzsYUXHmAB-W1-C92ipNPeH_WRVUb0A:1657352656150&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiGiNnip-v4AhVOl2oFHbZEDr4Q_AUoAnoECAMQBA&biw=1536&bih=754&dpr=1.25"
    wd.get(url=url)
    sleep(5)
    element = wd.find_element(By.XPATH, "//*[@id='REsRA']")
    element.clear()
    element.send_keys(key_word + '\n')
    sleep(2)
    # print(wd.page_source)
    # Essential for crawling dynamic sites: keep scrolling down so that the data gets loaded and rendered
    for i in range(32):
        # Get the initial page height
        # js = "return action=document.body.scrollHeight"
        # height = wd.execute_script(js)
        # print(height)  # height is 3512
        # Scroll the page down one step
        wd.execute_script('window.scrollBy(0, 1000)')
        sleep(5)
    sleep(5)
    elements = wd.find_elements(By.CSS_SELECTOR, "#islrg > div.islrc > div > a img")
    print("elements's len:")
    print(len(elements))
    f = open(key_word + ".txt", "a", encoding="utf-8")
    for i in range(min(400, len(elements))):  # guard against fewer than 400 results
        print(i)
        href = elements[i].get_attribute('src')
        if href is None:
            print(href)
            continue
        print(href)
        f.write(href + '\n')
        print('\n')
        if i % 50 == 0:
            sleep(10)
    f.close()
    wd.close()


if __name__ == '__main__':
    file_name = r'C:\Users\HP\Desktop\graduate student\1.csv'
    content = load_date(file_name)
    sleep(2)
    # Convert the list to a string
    # str_content = ",".join(content[0])
    # print(str_content)
    start = time.time()
    multi_thread(content)
    end = time.time()
    print("cost:", end - start)
    # f = open(str_content+'.txt','a',encoding='utf-8')
    # for i in range(25):
    #     f.write(",".join(content[i])+'\n')
    # f.close()
The result is shown in the figure below:
Notice that not all of the saved links are image URLs; some are browser-cached data (base64-encoded data: URIs). These cannot be fetched with the requests library and have to be decoded instead; the details are in Part II.
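For a quick illustration, a data: link embeds the image bytes directly in base64 after the comma, so it is decoded rather than downloaded; a minimal sketch (the sample URI below is made up):

import base64

link = "data:image/jpeg;base64,/9j/4AAQSkZJRg=="  # hypothetical sample
if link.startswith("data:image"):
    payload = link.split(",")[1]  # the part after the comma is base64
    img_bytes = base64.b64decode(payload)
    print(len(img_bytes), "bytes decoded")  # these bytes would go into a .jpg file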
II. Downloading images with requests
1. Collecting the .txt files in the folder
Filter out the .txt files with the os module plus filter and a lambda:
# Read the .txt files in the folder
path = r"C:\Users\HP\PycharmProjects\untitled\Selenium\WeekTask"
# path1 = r"C:\Users\HP\PycharmProjects\untitled\Selenium\WeekTask\图库"
files = os.listdir(path)
print(files)
txt_files = list(filter(lambda x: x[-4:] == '.txt', files))
print(txt_files)
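Equivalently, the standard-library glob module can express the same filtering (shown only as an alternative):

import glob
import os

path = r"C:\Users\HP\PycharmProjects\untitled\Selenium\WeekTask"
# Same result as the filter/lambda approach above
txt_files = [os.path.basename(p) for p in glob.glob(os.path.join(path, '*.txt'))]
print(txt_files)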
2. Downloading the images
The code is as follows:
# Download the images and save them into per-keyword folders
def getpicture(file):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        'Referer': 'https://www.google.com/'
    }
    with open(file, 'r', encoding='utf-8') as fc:
        count = len(fc.readlines())
    print(count)
    # Create a folder named after the keyword if it does not exist yet
    folder = f'{path1}/{file.title().split(".")[0]}'
    if not os.path.exists(folder):
        os.makedirs(folder)
    i = 0
    with open(file, 'r', encoding='utf-8') as f:
        for url in f:
            url = url.strip()  # drop the trailing newline
            print(url)
            if not url:
                continue
            if "data:image/jpeg" in url:
                # Note: the data-URI prefix before the comma must be stripped
                url = url.split(',')[1]
                print(url)
                img = base64.b64decode(url)
                with open(os.path.join(folder, str(i) + '.jpg'), "wb") as fh:
                    fh.write(img)
                i += 1
                continue
            i += 1
            proxy = {
                "http": "http://127.0.0.1:10809",
                "https": "http://127.0.0.1:10809"
            }
            # proxy = proxy_random()
            response = requests.get(url=url, headers=headers, proxies=proxy, verify=False)
            with open(os.path.join(folder, str(i) + '.jpg'), "wb") as f1:
                f1.write(response.content)
            # Pause briefly after each image to avoid getting the IP banned
            sleep(0.25)
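One side note: requests.get is called with verify=False, so urllib3 emits an InsecureRequestWarning for every request; if that noise is unwanted, it can be silenced like this (optional):

import urllib3

# Suppress the InsecureRequestWarning triggered by verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)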
3. Full code and crawling results
import requests
import os
import linecache
import random
import time
from time import sleep
import base64
from threading import Thread


def multi_thread(content):
    print("multi_thread begin")
    threads = []
    # Create the threads, one per keyword file
    for i in range(min(25, len(content))):  # guard against fewer than 25 files
        threads.append(
            Thread(target=getpicture, args=("".join(content[i]),))
        )
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    print("multi_thread end")


# Pick a random proxy IP
def proxy_random():
    a = random.randrange(1, 150)
    # Read line a from the file 1.txt
    proxy = linecache.getline(r'C:\Users\HP\PycharmProjects\untitled\Requests\1.txt', a).rstrip('\n')
    proxies = {
        "http": "http://" + proxy,
        "https": "http://" + proxy,
    }
    print(proxies)
    return proxies


# Download the images and save them into per-keyword folders
def getpicture(file):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        'Referer': 'https://www.google.com/'
    }
    with open(file, 'r', encoding='utf-8') as fc:
        count = len(fc.readlines())
    print(count)
    # Create a folder named after the keyword if it does not exist yet
    folder = f'{path1}/{file.title().split(".")[0]}'
    if not os.path.exists(folder):
        os.makedirs(folder)
    i = 0
    with open(file, 'r', encoding='utf-8') as f:
        for url in f:
            url = url.strip()  # drop the trailing newline
            print(url)
            if not url:
                continue
            if "data:image/jpeg" in url:
                # Note: the data-URI prefix before the comma must be stripped
                url = url.split(',')[1]
                print(url)
                img = base64.b64decode(url)
                with open(os.path.join(folder, str(i) + '.jpg'), "wb") as fh:
                    fh.write(img)
                i += 1
                continue
            i += 1
            proxy = {
                "http": "http://127.0.0.1:10809",
                "https": "http://127.0.0.1:10809"
            }
            # proxy = proxy_random()
            response = requests.get(url=url, headers=headers, proxies=proxy, verify=False)
            with open(os.path.join(folder, str(i) + '.jpg'), "wb") as f1:
                f1.write(response.content)
            # Pause briefly after each image to avoid getting the IP banned
            sleep(0.25)


if __name__ == '__main__':
    # Read the .txt files in the folder
    path = r"C:\Users\HP\PycharmProjects\untitled\Selenium\WeekTask"
    path1 = r"C:\Users\HP\PycharmProjects\untitled\Selenium\WeekTask\图库"
    files = os.listdir(path)
    print(files)
    txt_files = list(filter(lambda x: x[-4:] == '.txt', files))
    print(txt_files)
    # Test on a single keyword file
    # getpicture(txt_files[0])
    start = time.time()
    multi_thread(txt_files)
    end = time.time()
    print("cost:", end - start)
The final results are as follows:
Summary
That wraps up today's content. This article used Selenium and Requests to crawl a large number of Google Images. The key points to remember: the image page takes time to load while scrolling, so simulating the scroll step by step with Selenium is essential, as is understanding the difference between ordinary image URLs and browser-cached (base64-encoded) images.