在一些实际项目中，经常需要对网页进行截图，这里分享一个通过 scrapy + selenium 实现网页截图的方法。
首先我们需要安装selenium库
pip install selenium
当我们使用 selenium 进行截图时，通常使用的方法是：
get_screenshot_as_file(截图保存路径)
# -*- coding:utf-8 -*-
# Minimal example: open a page in Chrome and save a screenshot of it.
from selenium import webdriver
from time import sleep

driver = webdriver.Chrome()
try:
    driver.get('http://www.baidu.com/')
    # Give the page time to finish rendering BEFORE capturing it; sleeping
    # after the capture cannot influence what ends up in the image.
    sleep(2)
    driver.get_screenshot_as_file("./截图.png")
finally:
    # Always shut the browser down, even when loading or capturing failed.
    driver.quit()
那么如何将 scrapy 与 selenium 结合起来使用呢？
首先我们创建一个类 然后定义好 登录帐号密码 各种组件xpath路径等变量
class BjobOrderRecordSpider(scrapy.Spider):
    """Spider whose class attributes hold the login data, XPath locators and
    file-system paths used by the screenshot workflow."""

    # Spider name.
    name = 'bjob_record_spider'

    # --- Login -----------------------------------------------------------
    # Page that is opened first.
    login_url = 'http://xxx.cn/b2badmin/order/orderList'
    # Account credentials (placeholders).
    username = 'xxxx'
    password = 'xxx'
    # User-name input box.
    username_input_eml = '//*[@id="__layout"]/div/div[1]/div/div/div/div[2]/div/div[2]/form[1]/div[1]/div/div[1]/input'
    # Password input box.
    password_input_eml = '//*[@id="__layout"]/div/div[1]/div/div/div/div[2]/div/div[2]/form[1]/div[2]/div/div[1]/input'
    # Login button.
    login_eml = '//*[@id="__layout"]/div/div[1]/div/div/div/div[2]/div/div[2]/form[1]/div[3]/div/button'

    # --- Spreadsheet source ----------------------------------------------
    # Excel workbook listing what to screenshot.
    load_execl_data = 'F:\\xxxxx.xlsx'
    # Sheet inside that workbook.
    sheet1 = '办公用品及低值易耗品'
    # Header text for the screenshot file name
    # (presumably a column of the sheet above -- TODO confirm).
    img_save_path_name = '文件名'
    # Header text for the EC order number
    # (presumably a column of the sheet above -- TODO confirm).
    img_save_order_sn_name = 'EC单号'

    # --- Order list page locators ----------------------------------------
    # Input box for the order number to search for.
    qgdid_eml = '/html/body/div/div/div[3]/section/div/div[1]/div/div[1]/div[1]/form/div[1]/div/div[2]/input'
    # Search button.
    seach_eml = '//*[@id="app"]/div/div[3]/section/div/div[1]/div/div[1]/div[2]/button[1]'
    # Label span that getScreenShot clicks after the search has run and
    # before it opens the detail link.
    seach_q_eml = '/html/body/div/div/div[3]/section/div/div[2]/section/div[1]/div/div[1]/div/label[1]/span'
    # Detail link in the sixth column of the result table; the page renders
    # it either as a <span> (detail_eml) or as a <div> (detail_eml2).
    detail_eml = '/html/body/div[1]/div/div[3]/section/div/div[2]/section/div[2]/div[1]/div[3]/table/tbody/tr/td[6]/div/span/span/span/span/div/span'
    detail_eml2 = '/html/body/div[1]/div/div[3]/section/div/div[2]/section/div[2]/div[1]/div[3]/table/tbody/tr/td[6]/div/span/span/span/span/div/div'

    # --- Output ------------------------------------------------------------
    # Root directory for saved screenshots (ends with a backslash).
    shot_dir = 'F:\\xxxx\\网页截图\\xxxx\\'
然后再 初始化一个 selenium浏览器 对象
def __init__(self, *args, **kwargs):
    """Open the MySQL pool and start the Selenium browser used for screenshots.

    Args:
        *args: Forwarded unchanged to the parent spider constructor.
        **kwargs: Forwarded to the parent spider constructor, except for the
            optional ``browser_name`` entry, which is consumed here.
            ``'Chrome'`` starts Chrome; any other value (default
            ``'Firefox'``) starts Firefox.
    """
    self.pool3 = self.mysql_connection3()

    # Previously a hard-coded local made the Chrome branch unreachable;
    # it is now selectable while Firefox stays the default.
    browser_name = kwargs.pop('browser_name', 'Firefox')

    if browser_name == 'Chrome':
        chrome_options = ChromeOptions()
        # Works around the "DevToolsActivePort file doesn't exist" error.
        chrome_options.add_argument('--no-sandbox')
        # Chromium expects "--window-size=<width>,<height>"; the former
        # "window-size=1920x1080" spelling is not the documented format.
        chrome_options.add_argument('--window-size=1920,1080')
        # Disable GPU acceleration (recommended by Google docs to dodge a bug).
        chrome_options.add_argument('--disable-gpu')
        # Start with a maximized window.
        chrome_options.add_argument('--start-maximized')
        # Override the User-Agent request header.
        # NOTE(review): the embedded double quotes may end up inside the
        # header value because no shell strips them -- TODO confirm.
        chrome_options.add_argument(
            '--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"')
        # Incognito (private) mode.
        chrome_options.add_argument('--incognito')
        # Other switches that can be enabled when needed:
        #   --disable-infobars            hide the "controlled by automation" bar
        #   --disable-javascript          turn JavaScript off
        #   --ignore-certificate-errors   ignore TLS certificate errors
        #   --disable-software-rasterizer
        #   --disable-extensions

        # ``options=`` replaces the deprecated ``chrome_options=`` keyword,
        # which newer Selenium releases no longer accept. A specific
        # chromedriver binary can be chosen via executable_path/Service.
        self.browser = webdriver.Chrome(options=chrome_options)
    else:
        firefox_options = FirefoxOptions()
        # NOTE(review): the switches below were copied from the Chrome
        # setup and are Chromium flags; whether Firefox honours any of
        # them is not established here -- TODO confirm or remove.
        firefox_options.add_argument('--no-sandbox')
        firefox_options.add_argument('--window-size=1920x1080')
        firefox_options.add_argument('--disable-gpu')
        firefox_options.add_argument('--start-maximized')
        firefox_options.add_argument('--disable-infobars')
        firefox_options.add_argument('--incognito')
        # Other switches that can be enabled when needed:
        #   --headless       no visible window; required on display-less Linux
        #   --hide-scrollbars
        #   --blink-settings=imagesEnabled=false   skip images for speed
        #   --user-agent="..."                     override the User-Agent
        # An existing profile can be supplied through webdriver.FirefoxProfile.

        # ``options=`` replaces the deprecated ``firefox_options=`` keyword,
        # which newer Selenium releases no longer accept.
        self.browser = webdriver.Firefox(options=firefox_options)

    self.driver = None  # placeholder for a selenium instance
    self.cookies = None  # used to store cookies
    super(BjobOrderRecordSpider, self).__init__(*args, **kwargs)
之后便是业务代码 如 登录、获取订单列表 获取订单详情 截图 等模块的封装
def start_requests(self):
    """Log in through the Selenium browser, then run the screenshot job.

    Returns:
        list: Always empty. All work happens in the browser, so there are
        no Scrapy requests to schedule; Scrapy iterates the return value,
        and the previous implicit ``None`` made it raise ``TypeError``.
    """
    self.browser.get(self.login_url)

    # find_element('xpath', ...) is used instead of find_element_by_xpath,
    # which was removed in Selenium 4.3; this form works on 3.x and 4.x.
    username = self.browser.find_element('xpath', self.username_input_eml)
    password = self.browser.find_element('xpath', self.password_input_eml)
    username.clear()
    username.send_keys(self.username)
    password.clear()
    password.send_keys(self.password)

    # Simulate a click on the login button.
    self.browser.find_element('xpath', self.login_eml).click()
    # Fixed pause to let the post-login page load.
    time.sleep(2)

    self.getScreenShot()
    return []
# 截图详细步骤
def getScreenShot(self):
    """Search every order returned by the database and screenshot its detail page.

    For each row: skip it when ``win`` is not ``"xx"`` or when its first
    screenshot already exists; otherwise type the order number into the
    search box, run the search, open the detail link and hand the newly
    opened window(s) to ``screenshotByhandles``.
    """
    # The stray trailing quote in the original statement made it invalid SQL.
    bjob_order_list = self.select_all3("select * from xxxxx", ())

    for i_num, loadd in enumerate(bjob_order_list, 1):
        print("截图进度:", i_num)
        typeState = str(loadd["type"])
        company_name = loadd["company_name"]
        qgdId_e = loadd["sc_order"]
        if loadd["win"] != "xx":
            print("非xx %s" % qgdId_e)
            continue
        order_sn = loadd["img_name"]

        # shot_dir already ends with a backslash, so no separator is added
        # after it. Tabs/newlines are removed the same way shotToFile does,
        # so the directory created here is the one the file is written to.
        target_dir = '%s%s\\%s' % (self.shot_dir, company_name, typeState)
        target_dir = target_dir.replace('\t', '').replace('\n', '')
        os.makedirs(target_dir, exist_ok=True)

        # Name of the first screenshot shotToFile would produce for this
        # order; when it is present the order is treated as already done.
        first_shot = '%s\\%s-1_01.png' % (target_dir, order_sn)
        first_shot = first_shot.replace('\t', '').replace('\n', '').strip()
        if os.path.exists(first_shot):
            print("文件已经存在", first_shot)
            continue

        time.sleep(1)
        # Fill in the order number. find_element('xpath', ...) replaces
        # find_element_by_xpath, which Selenium 4.3 removed.
        qgdId = self.browser.find_element('xpath', self.qgdid_eml)
        qgdId.clear()
        qgdId.send_keys(str(qgdId_e).strip())

        # Run the search and wait for the result list.
        self.clickElement(self.seach_eml)
        time.sleep(3)
        # Extra click required before the detail link can be used.
        self.clickElement(self.seach_q_eml)
        time.sleep(1)

        # The detail link comes in two markup variants; use whichever exists
        # and skip the order when neither is on the page.
        if self.isElementPresent(self.detail_eml):
            self.clickElement(self.detail_eml)
        elif self.isElementPresent(self.detail_eml2):
            self.clickElement(self.detail_eml2)
        else:
            continue
        time.sleep(3)

        windows = self.browser.current_window_handle  # handle of the list page
        time.sleep(2)
        all_handles = self.browser.window_handles  # every open window handle
        if len(all_handles) >= 2:
            self.screenshotByhandles(all_handles, windows, order_sn, company_name, typeState)
        # Return to the list page for the next order.
        self.browser.switch_to.window(windows)
# 切换网页句柄
def screenshotByhandles(self, all_handles, windows, order_sn, company_name, typeState,
                        window_height=2048):
    """Screenshot both tabs of every window other than the originating one.

    Args:
        all_handles: Iterable of browser window handles.
        windows: Handle of the originating window; it is skipped.
        order_sn: Order identifier used in the file name.
        company_name: Sub-directory component passed on to ``shotToFile``.
        typeState: Sub-directory component passed on to ``shotToFile``.
        window_height: Height given to ``set_window_size`` before capturing.
            Defaults to the previously hard-coded 2048. Pass ``None`` to use
            the page's ``scrollHeight`` instead.
    """
    shot_index = 0
    for handle in all_handles:
        if handle == windows:
            continue
        shot_index += 1
        s = str(shot_index)

        self.browser.switch_to.window(handle)  # focus the newly opened page
        width = self.browser.execute_script("return document.documentElement.scrollWidth")
        height = window_height
        if height is None:
            # Only query the page when the caller asked for its real height.
            height = self.browser.execute_script("return document.documentElement.scrollHeight")
        time.sleep(1)
        # Resize the window to the page width and the chosen height.
        self.browser.set_window_size(width, height)

        # "Details" tab.
        self.clickElement('//*[@id="tab-first"]')
        self.shotToFile(order_sn, s, company_name, typeState, 1)
        # "Item list" tab.
        self.clickElement('//*[@id="tab-second"]')
        self.shotToFile(order_sn, s, company_name, typeState, 2)
# 截图
def shotToFile(self, order_sn, s, company_name, typeState, type=1, max_attempts=3):
    """Save a screenshot of the current window and close it after the last tab.

    The file is written to
    ``<shot_dir><company_name>\\<typeState>\\<order_sn>-<s>_0<type>.png``
    with tabs and newlines removed from the path.

    Args:
        order_sn: Order identifier used in the file name.
        s: Running window index (string) used in the file name.
        company_name: First sub-directory below ``shot_dir``.
        typeState: Second sub-directory below ``shot_dir``.
        type: Tab number; when it equals 2 the current window is closed
            afterwards.
        max_attempts: How many times to try saving before giving up. The
            original retried through unbounded recursion and passed its
            arguments in the wrong order when doing so.
    """
    pathscreenshot = '%s%s\\%s\\%s-%s_0%s.png' % (
        self.shot_dir, company_name, typeState, order_sn, s, str(type))
    pathscreenshot = str(pathscreenshot).replace('\t', '').replace('\n', '').strip()

    for attempt in range(1, max_attempts + 1):
        try:
            # Selenium reports an I/O failure by returning False.
            saved = self.browser.get_screenshot_as_file(pathscreenshot)
        except Exception as e:  # best-effort: report, then retry
            print(e)
            saved = False
        if saved is not False:
            break
        if attempt < max_attempts:
            time.sleep(2)
    else:
        print("截图保存失败", pathscreenshot)

    if type == 2:
        # Close the detail window even when saving failed, so stale windows
        # do not pile up and get picked up by the next order.
        try:
            self.browser.close()
        except Exception as e:  # best-effort, as in the original
            print(e)
# 判断元素标签是否存在
def isElementPresent(self, by):
    """Report whether an element matching an XPath exists on the current page.

    Args:
        by: XPath expression to look up (despite the name, this is the
            locator value, not a locator strategy).

    Returns:
        bool: ``True`` when the lookup succeeds, ``False`` when it raises
        ``NoSuchElementException``.
    """
    try:
        # find_element('xpath', ...) replaces find_element_by_xpath, which
        # was removed in Selenium 4.3; this form works on 3.x and 4.x.
        self.browser.find_element('xpath', by)
    except NoSuchElementException:
        # The element is not on the page.
        return False
    return True
# 获取execl数据
def loadExeclData(self, path, sheetname="Sheet1"):
    """Load data from an Excel sheet through the ``ExcelData`` helper.

    Args:
        path: Absolute path of the workbook.
        sheetname: Name of the sheet to read.

    Returns:
        Whatever ``ExcelData(path, sheetname, 0).readExcel(1)`` returns.
    """
    reader = ExcelData(path, sheetname, 0)
    return reader.readExcel(1)