Python 100 Days — Day 23: Selenium and Proxies
1.selenium的基本设置
from selenium import webdriver
from selenium. webdriver import ChromeOptions
# Chrome options that make the browser look less like an automated test
# client and speed up page loads by blocking images.
options = ChromeOptions()
# Remove the "Chrome is being controlled by automated test software" banner.
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Preference value 2 = block image loading.
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

b = webdriver.Chrome(options=options)
b.get('https://www.jd.com/')
2.selenium交互
import time
from selenium import webdriver
from selenium. webdriver import ActionChains
# Module-level driver shared by the demo functions below.
b = webdriver.Chrome()
def jing_dong():
    """Open jd.com, fill in the login form, and try to drag the slider captcha.

    Uses the module-level driver ``b``. Depends on JD's login-page DOM, so
    the CSS selectors may break when the site changes.
    """
    b.get('https://www.jd.com/')
    # Open the login page, then switch to the account/password tab.
    login_btn = b.find_element_by_css_selector('.user_login')
    login_btn.click()
    user_btn = b.find_element_by_css_selector('.login-tab.login-tab-r')
    user_btn.click()
    # Fill in the credentials and submit.
    user_name = b.find_element_by_css_selector('#loginname')
    password = b.find_element_by_css_selector('#nloginpwd')
    user_name.send_keys('aaa')
    password.send_keys('123456')
    login_btn = b.find_element_by_css_selector('.login-btn')
    login_btn.click()
    # Drag the slider captcha. Fix: drag_and_drop_by_offset() performs its own
    # click-and-hold + release, so preceding it with a separate click_and_hold()
    # double-pressed the slider; and reusing one ActionChains replays every
    # previously queued action on the next perform(). Build an explicit
    # press/move/release chain on a fresh ActionChains each attempt instead.
    slider = b.find_element_by_css_selector('.JDJRV-slide-btn')
    ActionChains(b).click_and_hold(slider).move_by_offset(100, 0).release().perform()
    time.sleep(3)
    # Retry once in case the first drag was rejected by the captcha.
    slider = b.find_element_by_css_selector('.JDJRV-slide-btn')
    ActionChains(b).click_and_hold(slider).move_by_offset(100, 0).release().perform()
def scroll():
    """Scroll jd.com toward the bottom with an injected JavaScript timer."""
    b.get('https://jd.com')
    # The timer scrolls 200px every 300ms and cancels itself once the
    # scroll position passes the page height.
    js = """
    height = 100
    //添加定时器,每隔300毫秒滚动200像素
    t = setInterval(function(){
        max = document.body.scrollHeight
        window.scrollTo(0, height)
        height += 200
        if(height > max){
            clearInterval(t)
        }
    }, 300)
    """
    b.execute_script(js)
if __name__ == '__main__':
    # Run the scrolling demo only when executed as a script.
    scroll()
3.网易邮箱(嵌套页面)
from selenium import webdriver
from selenium. webdriver import ChromeOptions
# 163 mail: the login form lives inside an <iframe>, so the driver must
# switch into that frame before the form elements become findable.
options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
b = webdriver.Chrome(options=options)
b.get('https://mail.163.com/')

login_frame = b.find_element_by_css_selector('#loginDiv>iframe')
b.switch_to.frame(login_frame)

# Inside the frame: fill in the credentials and submit.
user_name = b.find_element_by_name('email')
password = b.find_element_by_name('password')
login_btn = b.find_element_by_id('dologin')
user_name.send_keys('y_t209')
password.send_keys('123456')
login_btn.click()
4.等待
import time
from selenium import webdriver
from selenium. webdriver import ChromeOptions
from selenium. webdriver. common. keys import Keys
from selenium. webdriver. support. ui import WebDriverWait
from selenium. webdriver. support import expected_conditions as EC
from selenium. webdriver. common. by import By
# 51job: search for "python" jobs, then page through the results, dumping
# each result page's source to stdout.
options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
b = webdriver.Chrome(options=options)
b.get('https://www.51job.com/')

# Renamed from 'input' — the original shadowed the builtin.
search_box = b.find_element_by_id('kwdselectid')
search_box.send_keys('python')
search_box.send_keys(Keys.ENTER)

for _ in range(10):
    print(b.page_source)
    time.sleep(2)
    # Wait (up to 10s) until the "next page" button is clickable.
    wait = WebDriverWait(b, 10)
    # Renamed from 'next' — the original shadowed the builtin.
    next_btn = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'next')))
    try:
        next_btn.click()
    except Exception:
        # Narrowed from a bare 'except:' (which also swallowed
        # KeyboardInterrupt/SystemExit); back off briefly and keep going.
        time.sleep(1)
5.获取代理
import requests
def get_ip():
    """Fetch proxy addresses from the mogumiao API.

    Returns:
        A list of non-empty 'host:port' strings on success, otherwise None
        (after printing a failure message).
    """
    url = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=6226c130427f487385ad7b5235bc603c&count=5&expiryDate=0&format=2&newLine=3'
    response = requests.get(url)
    if response.status_code == 200:
        # An API error comes back as a JSON object; a success is a plain
        # newline-separated address list. startswith() is safe on an empty
        # body, unlike the original response.text[0] which raised IndexError.
        if response.text.startswith('{'):
            print('获取ip失败')
        else:
            return [x for x in response.text.split('\n') if x]
    else:
        print('请求失败')
def use_proxy():
    """Fetch an anjuke.com listing page through proxies from get_ip()."""
    ips = get_ip()
    if not ips:
        # get_ip() already printed the specific reason.
        print('获取ip失败!')
        return
    proxy = {'http': ips[0], 'https': ips[1]}
    print(proxy)
    response = requests.get('https://cd.fang.anjuke.com/loupan/all/p1/', proxies=proxy)
    if response.status_code == 200:
        print(response.text)
    else:
        print('请求失败!', response)
# NOTE(review): runs on import; consider a __name__ == '__main__' guard
# like the other sections.
use_proxy()
6.正则数据解析
import requests
import re
def get_data():
    """Download the Douban Top250 page and pass the HTML to analysis_data()."""
    url = 'https://movie.douban.com/top250'
    # Douban rejects requests without a browser-like User-Agent.
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    }
    response = requests.get(url, headers=header)
    if response.status_code != 200:
        print('请求失败')
        return
    analysis_data(response.text)
def analysis_data(data):
    """Extract movie entries from Douban Top250 HTML with a regex.

    Args:
        data: Raw HTML text of a Douban Top250 list page.

    Returns:
        A list of (title, rating, vote-count span, quote) string tuples,
        one per matched <li> entry (empty list when nothing matches).
    """
    print(data)
    print('\n\n')
    # (?s) makes '.' match newlines so each non-greedy '.+?' can span a
    # whole multi-line <li> entry.
    re_str = r'(?s)<li>.+?<span class="title">(.+?)</span>.+?<span class="rating_num" property="v:average">(.+?)</span>.+?<span>(.+?)</span>.+?<span class="inq">(.+?)</span>.+?</li>'
    result = re.findall(re_str, data)
    print(result)
    # Return the matches (original printed only) so callers can reuse them;
    # backward-compatible since the original implicitly returned None.
    return result
if __name__ == '__main__':
    # Run the scraper only when executed as a script.
    get_data()