Python Selenium结合使用的一个小例子

之前对 Python 感兴趣,主要是平常喜欢爬一些小黄图……首先,我的环境是 Windows;Selenium 使用 pip install selenium 安装即可,PhantomJS 不需要安装,直接下载压缩包解压即可。
先看下主要的效果图
效果图拿去
先看下一个简单的demo_1.py

from selenium import webdriver
# Start a PhantomJS browser from the locally unpacked executable.
driver = webdriver.PhantomJS(executable_path="C:/D-Dir/phantomjs-2.1.1-windows/bin/phantomjs.exe")
try:
    # Load the page, read its title and save a screenshot next to the script.
    driver.get("http://www.csdn.net")
    data = driver.title
    driver.save_screenshot('csdn.png')
    print(data)
finally:
    # Always shut the browser down, even if loading or the screenshot fails;
    # otherwise the PhantomJS process is left running after the script ends.
    driver.quit()

很简单地访问http://www.csdn.net,然后保存网页为图片到本地

看下一般的demo_2.py

import time

from selenium import webdriver

# User-Agent string identifying a desktop Chrome 57 browser on Windows.
UA = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'
# Header dict carrying the User-Agent.
# NOTE(review): nothing in the code shown for this demo references header_info,
# and the driver below is created with only executable_path -- confirm whether
# the User-Agent was meant to be applied to the browser.
header_info = {
    'user-agent': UA,
}
# Local path of the chromedriver executable.
chromedriver_path = 'C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe'
# Module-level Chrome WebDriver used by test_2 and test_3.
driver = webdriver.Chrome(executable_path=chromedriver_path)
# Search-results page loaded by test_2 (keyword passed in the kw query parameter).
url = 'https://www.duitang.com/search/?kw=文豪野犬&type=feed'


def test_2():
    """Print the image URLs of the first results page, then of every further page.

    Loads the module-level ``url`` in the module-level ``driver``, scrolls to
    the bottom of the page ten times, prints the page title, writes the page
    source to ``test.html`` and prints the ``src`` of the ``img`` inside every
    element with class ``mbpho``. It then reads the largest numeric link text
    inside the ``woo-pager`` element and calls ``test_3`` for pages 2..N.

    Returns:
        None.
    """
    driver.get(url=url)
    time.sleep(1)
    # Scroll to the bottom of the page ten times, pausing one second each time.
    for _ in range(10):
        driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        time.sleep(1)
    title = driver.title
    print('driver.title===%s' % title)
    content = driver.page_source
    with open('test.html', 'w', encoding='utf-8') as f:
        f.write(content)
    mbpho_list = driver.find_elements_by_class_name('mbpho')
    for count, mbp in enumerate(mbpho_list, start=1):
        src = mbp.find_element_by_tag_name('img').get_attribute('src')
        print('当前是第%d页,,,%d张图片,地址是%s' % (1, count, src))
    # find_element_* raises NoSuchElementException when nothing matches, so a
    # truthiness check on its result can never detect a missing pager. The
    # plural find_elements_* returns an empty list instead.
    pagers = driver.find_elements_by_class_name('woo-pager')
    page_total = 1
    if pagers:
        for a in pagers[0].find_elements_by_tag_name('a'):
            page_str = a.text
            try:
                page = int(page_str)
            except ValueError as e:
                # Non-numeric pager links are reported and skipped.
                print(e)
                continue
            page_total = max(page_total, page)
    print('获取到的最大页数是==%d' % page_total)
    if page_total < 2:
        return
    for page_num in range(2, page_total + 1):
        temp_url = 'https://www.duitang.com/search/?kw=%E6%96%87%E8%B1%AA%E9%87%8E%E7%8A%AC&type=feed#!s-p' + str(
            page_num)
        test_3(page_num, temp_url)


def test_3(page_num, url):
    """Load one results page and print the ``src`` of every ``mbpho`` image.

    Args:
        page_num: Page number, used only in the printed message.
        url: Address of the page to load in the module-level ``driver``.
    """
    driver.get(url=url)
    time.sleep(1)

    # Scroll to the bottom ten times, waiting one second after each scroll.
    scroll_js = 'window.scrollTo(0,document.body.scrollHeight)'
    for _ in range(10):
        driver.execute_script(scroll_js)
        time.sleep(1)

    items = driver.find_elements_by_class_name('mbpho')
    for index, item in enumerate(items, start=1):
        img_src = item.find_element_by_tag_name('img').get_attribute('src')
        print('当前是第%d页,,,%d张图片,地址是%s' % (page_num, index, img_src))


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    test_2()

说下运行结果吧:读取 demo 所访问网址里的所有图片地址,然后打印出来。这里需要注意的是使用了

   for i in range(10):
        driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        time.sleep(1)

循环滚动到浏览器底部

最后看下有问题的demo_3.py

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# User-Agent string identifying a desktop Chrome 57 browser on Windows.
UA = (
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/57.0.2987.110 Safari/537.36'
)
header_info = {'user-agent': UA}

# Local path of the PhantomJS executable.
PhantomJS = 'C:/D-Dir/phantomjs-2.1.1-windows/bin/phantomjs.exe'

# Command-line switches handed to the PhantomJS process.
service_args = [
    '--load-images=no',  # do not load images
    '--disk-cache=yes',  # enable the disk cache
    '--ignore-ssl-errors=true',  # ignore HTTPS/SSL errors
]

# Module-level PhantomJS WebDriver used by test_2.
browser = webdriver.PhantomJS(executable_path=PhantomJS, service_args=service_args)
url = 'https://www.duitang.com/search/?kw=文豪野犬&type=feed'


def test_2():
    """Load the search page in PhantomJS and wait for an ``mbpho`` element.

    Opens the module-level ``url`` in the module-level ``browser``, sleeps five
    seconds, prints the page source, then waits up to ten seconds for an
    element with class ``mbpho`` to be present and prints a status message
    based on ``is_ele_exist``.

    The browser is always shut down before returning, including when the wait
    raises, so no PhantomJS process is left behind.

    Returns:
        None.
    """
    try:
        browser.get(url=url)
        time.sleep(5)
        print(browser.page_source)
        # Block until at least one element with class 'mbpho' is in the DOM;
        # until() raises if none appears within ten seconds.
        WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'mbpho')))
        # NOTE(review): the failure message is printed when the element IS
        # found; the messages look carried over from a login example --
        # confirm the intended meaning before relying on them.
        if is_ele_exist(browser):
            print("登录失败")
            return None
        print("登陆成功")
    finally:
        # Previously the browser was quit only on the branch above.
        browser.quit()


def is_ele_exist(browser):
    """Return True if the page contains at least one element with class ``mbpho``.

    Args:
        browser: Object exposing ``find_elements_by_class_name``.

    Returns:
        True when the lookup yields a non-empty result; False when it yields
        nothing or the lookup itself raises.
    """
    try:
        # find_elements_* returns an empty list rather than raising when
        # nothing matches, so the result must be checked for emptiness.
        mbpho_list = browser.find_elements_by_class_name('mbpho')
    except Exception:
        # Keep the original best-effort contract (lookup failure means "not
        # present") without also swallowing KeyboardInterrupt/SystemExit.
        return False
    return bool(mbpho_list)


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    test_2()

这里获取出来的 browser.page_source 并没有图片数据,用了 WebDriverWait 的 until 还是不行;如果有大神知道,请告知我怎么解决一下,谢谢……

展开阅读全文

没有更多推荐了,返回首页