1、基本运用:用 PhantomJS 打开页面并保存渲染后的源码
#!/usr/bin/python
#encoding=utf-8
import sys
from selenium import webdriver
# Python 2 idiom: make UTF-8 the interpreter-wide default string encoding.
reload(sys)
sys.setdefaultencoding("utf-8")

# Start a headless PhantomJS browser from an explicit binary location.
driver = webdriver.PhantomJS(executable_path='/home/lhy/phantomjs-1.9.8-linux-x86_64/bin/phantomjs')
try:
    driver.get("http://item.jd.com/2914823.html")
    # Read the rendered page once and reuse it for both outputs.
    page_source = driver.page_source
    print(page_source)
    # Encode explicitly so the bytes on disk do not depend on the
    # process-wide default encoding.
    with open("aaaa1.txt", "wb") as fo:
        fo.write(page_source.encode("utf-8"))
finally:
    # Always shut PhantomJS down, even if loading or writing fails.
    driver.quit()
2、抓取下拉加载的页面
#coding=utf-8
import requests
import re
import time
from pyquery import PyQuery as pq
from lxml import etree
from bs4 import BeautifulSoup
import sys
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# Python 2 idiom: make UTF-8 the interpreter-wide default string encoding.
reload(sys)
sys.setdefaultencoding("utf-8")
# Module-level accumulator; getUrlsFromFile appends every URL it reads here.
urls=[]
def getHtml2(url):
    """Fetch *url* with ``requests`` and return the raw response body.

    The request is sent as a POST carrying an old MSIE User-Agent header.
    The response body is also written to ``phonesinfo1.txt`` in the current
    working directory.

    :param url: address to request.
    :return: the response body (``r.content``).
    """
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; windows NT)'
    headers = {'User-Agent': user_agent}
    # NOTE(review): POST is what the original code sends; confirm that a
    # plain GET is not what was intended for fetching a page.
    r = requests.post(url, headers=headers)
    content = r.content
    # The context manager closes the file even if the write fails.
    with open("phonesinfo1.txt", "wb") as fo:
        fo.write(content)
    return content
def getHtml(url):
    """Render *url* in headless PhantomJS, scroll down, and return the source.

    The browser identifies itself with an old MSIE User-Agent. After the page
    loads, a script sets the vertical scroll offset to 1000px so that
    scroll-triggered content gets a chance to appear before the source is read.

    :param url: address of the page to load.
    :return: ``driver.page_source`` captured after the scroll script has run.
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/4.0 (compatible; MSIE 5.5; windows NT)"
    )
    # Alternatives: webdriver.PhantomJS(executable_path=...) to point at a
    # specific binary, or webdriver.Chrome() for a visible browser.
    driver = webdriver.PhantomJS(desired_capabilities=dcap)
    try:
        driver.get(url)
        js = "document.body.scrollTop=1000"  # set the scroll offset to 1000px
        driver.execute_script(js)
        # NOTE: implicitly_wait only configures the timeout used by later
        # element lookups; it does not pause here. If the scrolled-in content
        # needs time to load, an explicit wait or time.sleep() is required.
        driver.implicitly_wait(30)
        html = driver.page_source
    finally:
        # Always terminate the PhantomJS process, even when loading fails.
        driver.quit()
    return html
def getPqHtml(html):
    """Wrap *html* in a PyQuery object and return it."""
    return pq(html)
def getUrlsFromFile(fileUrl):
    """Load URLs, one per line, from a text file into the module-level ``urls``.

    Every line is stripped of surrounding whitespace, echoed to stdout and
    appended to ``urls``. Blank lines are skipped so that no empty URL is
    queued.

    :param fileUrl: path of the URL list file. When empty or ``None`` the
        historical default ``'phoneurl.txt'`` is used.
    """
    # Honour the caller's path; the original ignored it and always read
    # 'phoneurl.txt', which stays as the fallback.
    path = fileUrl or 'phoneurl.txt'
    with open(path, 'r') as f:
        for line in f:
            url_one = line.strip()
            if not url_one:
                continue
            print(url_one)
            urls.append(url_one)
# Render the local test page through getHtml, save the result, and echo it.
url = "http://localhost:8080/pro/html.html"
text = getHtml(url)
# Encode explicitly so the bytes on disk do not depend on the process-wide
# default encoding; the context manager closes the file even on error.
with open("taobao2.txt", "wb") as fo:
    fo.write(text.encode("utf-8"))
print(text)
html 页面(上面脚本抓取的测试页 http://localhost:8080/pro/html.html)
<html>
<head>
<!-- Test page for scroll-triggered content: the hidden div in the body is
     revealed once the page has been scrolled down to 300px or more. -->
</head>
<body style="height:5000px">
<!-- Hidden on load; the onscroll handler below switches it to display:block. -->
<div id="top_div" style="display:none">ffffffffffffffffffffff</div>
<script>
//document.body.scrollTop=10000;
// On every scroll event, show #top_div once the vertical offset reaches 300px.
window.onscroll = function(){
// Take the offset from documentElement, falling back to body when that is 0.
var t = document.documentElement.scrollTop || document.body.scrollTop;
var top_div = document.getElementById( "top_div" );
if( t >= 300 ) {
// alert(t);
top_div.style.display = "block";
}
// The div is never hidden again: the else branch below is disabled.
// else { top_div.style.display = "none";}
}
</script>
</body>
</html>
3、模拟登陆
# coding=utf-8
# (No space before "=": "coding = utf-8" is not recognised as an encoding
# declaration.)
from selenium import webdriver

# Log in to the local demo site through a Firefox window.
browser = webdriver.Firefox()
browser.get("http://localhost:8080/pro")
browser.find_element_by_name("password").clear()  # clear the password box first
browser.find_element_by_name("username").send_keys("test")  # fill in the user name
browser.find_element_by_name("password").send_keys("123")  # fill in the password
# Read the captcha value from the text of the <span> inside .yzm-img.
yzm = browser.find_element_by_class_name("yzm-img").find_element_by_tag_name("span").text
yzm = yzm.replace(' ', '')  # strip the spaces
browser.find_element_by_class_name("yzm-sr").send_keys(yzm)  # type the captcha
browser.find_element_by_id("tijiao").click()  # click the button to submit the form
print(browser.current_url)
# After a successful login the cookies are kept in the browser object, so a
# page that requires authentication can be opened directly.
browser.get("http://localhost:8080/pro/test.jsp")
print(browser.page_source)
#browser.quit()  # uncomment to close the browser when finished
4、百度搜索
# coding=utf-8
# (No space before "=": "coding = utf-8" is not recognised as an encoding
# declaration.)
from selenium import webdriver

# Run a Baidu search for "selenium" through a Firefox window.
browser = webdriver.Firefox()
browser.get("http://www.baidu.com")
# Look the search box up once and reuse the element.
search_box = browser.find_element_by_id("kw")
search_box.clear()
search_box.send_keys("selenium")
browser.find_element_by_id("su").click()
print(browser.current_url)  # URL of the page reached after the click
#browser.quit()  # uncomment to close the browser when finished