selenium 是一个用于 Web 自动化测试的软件包,既可以用来自动测试 Web 应用,也可以当作简单的爬虫制作工具。
这是一个简单的 demo,用于爬取 Google Play 商店中某一个类别下的应用:
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from time import sleep
import sqlite3
import sys
def Conn_DB(db_name='app_info.db'):
    """Open a connection to the SQLite database used to store scraped apps.

    Args:
        db_name: Path of the SQLite database file, passed straight to
            ``sqlite3.connect``.

    Returns:
        The open ``sqlite3.Connection``.

    Raises:
        sqlite3.Error: If the database cannot be opened. The error is
            printed first and then re-raised.
    """
    try:
        return sqlite3.connect(db_name)
    except sqlite3.Error as e:
        # Re-raise the real error: previously the handler fell through to
        # ``return conn`` with ``conn`` unbound, which replaced the useful
        # sqlite3 error with an UnboundLocalError.
        print("Conn Error  %s" % (e,))
        raise
def Get_Category(root_address):
    """Return the category name encoded in a category page URL.

    The category is the last path segment of the URL, e.g.
    ``.../category/TRAVEL_AND_LOCAL?hl=en`` yields ``'TRAVEL_AND_LOCAL'``.

    Args:
        root_address: URL of the category page.

    Returns:
        The last non-empty path segment, cut at the first space if it
        contains one. An empty string is returned for an empty input.
    """
    # Drop the query string before looking at the path, so that a '/' inside
    # the query (e.g. '?next=/store/apps') is not mistaken for a separator.
    path = root_address.split('?', 1)[0]
    # Ignore trailing slashes so '.../GAME/' still resolves to 'GAME'.
    segment = path.rstrip('/').rsplit('/', 1)[-1]
    return segment.split(' ', 1)[0]
def Login_Google(browser, category_root_address, email='', password=''):
    """Open the category page and submit the Google sign-in form.

    Signing in is needed so that the per-app information can be read later.

    Args:
        browser: Selenium WebDriver instance used to drive the page.
        category_root_address: URL of the category page to load first.
        email: Account e-mail typed into the ``Email`` field. Defaults to an
            empty string, matching the placeholder the script shipped with.
        password: Account password typed into the ``Passwd`` field. Defaults
            to an empty string for the same reason.

    Note:
        'Login Success' is printed once the form has been submitted; the
        function does not check that the sign-in was actually accepted.
    """
    browser.get(category_root_address)

    # ``find_element(by, value)`` with a plain locator-strategy string works
    # on both old and current Selenium releases; the ``find_element_by_*``
    # shortcuts were removed in Selenium 4.3.
    login_link = browser.find_element('id', 'gb_70')
    webdriver.ActionChains(browser).move_to_element(login_link).click(login_link).perform()

    email_field = browser.find_element('name', 'Email')
    email_field.send_keys(email)

    password_field = browser.find_element('name', 'Passwd')
    password_field.send_keys(password)
    # Pressing RETURN in the password field submits the form.
    password_field.send_keys(Keys.RETURN)
    print('Login Success')
def Load_All_Apps(browser, scroll_rounds=13, pause=2.5):
    """Scroll the category page until all apps are loaded and count them.

    Each round scrolls to the bottom of the page, then back to the middle,
    and clicks the "show more" button when it is not hidden.

    Args:
        browser: Selenium WebDriver instance showing the category page.
        scroll_rounds: Number of scroll rounds to perform. The default of 13
            was chosen empirically by the original author.
        pause: Seconds to wait after each scroll so new content can load.

    Returns:
        The number of ``button.price`` elements on the fully loaded page.

    NOTE(review): the source had lost its indentation; the "show more" check
    is assumed to run inside every scroll round, and the button's style text
    is assumed to be printed every round - confirm against the original.
    """
    for times in range(scroll_rounds):
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(pause)
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.5);")
        sleep(pause)
        print(times)

        # The button is treated as hidden only when its inline style is
        # exactly 'display: none;'; anything else triggers a click.
        show_more_button = browser.execute_script("return document.querySelector('#show-more-button')['style']['cssText'];")
        if show_more_button != 'display: none;':
            browser.execute_script("document.querySelector('#show-more-button').click();")
            print('click button')
        print(show_more_button)

    # Return to the top of the page before counting the install buttons.
    browser.execute_script("window.scrollTo(0, 0);")
    number = browser.execute_script("return document.querySelectorAll('button.price').length;")
    print(number)
    return number
def Click_Install_Button(browser, category_root_address):
    """Open each app's install dialog and record its location permissions.

    For every second ``button.price`` element (indices 1, 3, 5, ...) the
    button is clicked, the permission descriptions in the dialog are scanned,
    and apps that request a location permission are inserted into the
    ``app_info`` table of the database opened by ``Conn_DB()``.

    The stored ``get_geo_ways`` value holds one letter per matching
    permission: 'p' for precise location and 'a' for approximate location.

    Args:
        browser: Selenium WebDriver instance showing the fully loaded
            category page.
        category_root_address: URL of the category page; its category name
            is stored with every row.

    NOTE(review): only odd button indices are visited - presumably each app
    renders two price buttons; confirm against the page markup.
    NOTE(review): the source had lost its indentation; the nesting below was
    reconstructed from the statements and comments.
    """
    get_permissions_code = """var permissions = document.querySelectorAll('.perm-description');
var precise_locaton = 'precise location (GPS and network-based)';
var approximate_location = 'approximate location (network-based)';
var ways = '';
for (var perm in permissions) {
if (permissions[perm].innerHTML == precise_locaton) {
ways += 'p';
} else if (permissions[perm].innerHTML == approximate_location) {
ways += 'a';
}
}
return ways;"""
    get_button_list_code = """return document.querySelectorAll('button.price');"""
    get_app_address_code = """var app_address_list = document.querySelectorAll("h2 a");var list = [];
for (var i = 0; i < app_address_list.length; i++) {list.push(app_address_list[i]['href']);} return list;"""
    # Placeholders let sqlite3 do the quoting: app names come from the page
    # and may contain apostrophes, which broke the string-formatted SQL.
    # 'categroy' is misspelled but kept, since it must match the table.
    insert_sql = u"""insert into app_info (categroy, name, link, get_geo_ways) values (?, ?, ?, ?)"""

    button_list = browser.execute_script(get_button_list_code)
    numbers_of_button = len(button_list)
    count = 0
    sleep(3)

    category = Get_Category(category_root_address)
    address_list = browser.execute_script(get_app_address_code)

    number_of_i_want = 0
    conndb = Conn_DB()
    try:
        db_cursor = conndb.cursor()
        for index in range(1, numbers_of_button, 2):
            try:
                webdriver.ActionChains(browser).move_to_element(button_list[index]).click(button_list[index]).perform()
                sleep(3.5)
                count += 1
            except IndexError:
                print("Out of index")
                break

            # Reset per app so the error report below never hits an unbound
            # name or shows the previous app's permissions.
            perms = None
            try:
                print("Count  %s" % count)
                perms = browser.execute_script(get_permissions_code)
                sleep(2)
                appname = browser.execute_script("return document.querySelector('.purchase-header .title').innerHTML;")
                # ``count`` is 1-based, so the matching link is at count - 1.
                app_address = address_list[count - 1]
                print(u"App id is:  %s Perm is:  %s Address is:  %s" % (appname, perms, app_address))
                if perms:
                    db_cursor.execute(insert_sql, (category, appname, app_address, perms))
                    conndb.commit()
                    number_of_i_want += 1
            except Exception as e:
                # Best effort: report the failure and move on to the next
                # app. As before, the cancel click is skipped in this case.
                print("Error for  %s Number is  %s Pers is %s" % (e, count, perms))
                continue

            # Close the install dialog before moving to the next button.
            browser.execute_script("document.querySelector('#purchase-cancel-button').click();")
            sleep(1)

        print("compary  %s %s I want : %s" % (count, numbers_of_button, number_of_i_want))
        db_cursor.close()
    finally:
        # Always release the database, even if the scrape aborts midway.
        conndb.close()
if __name__ == '__main__':
    # Scrape the "Travel & Local" category, then write a completion marker.
    root_address = 'https://play.google.com/store/apps/category/TRAVEL_AND_LOCAL?hl=en'
    driver = webdriver.Chrome()
    try:
        Login_Google(driver, root_address)
        Load_All_Apps(driver)
        Click_Install_Button(driver, root_address)
    finally:
        # Shut the browser down even when a step fails, so no Chrome
        # process is left running.
        driver.quit()
    # ``open`` replaces the Python 2-only ``file`` builtin, and the ``with``
    # block closes the handle even on error. The marker is only written
    # after a run that finished without raising.
    with open("./res.txt", "w") as fd:
        fd.write("over")