# === selenium使用代理IP (Selenium: using a proxy IP) ===
# Launch Chrome routed through an HTTP proxy and open the Douban Top250 page.
from selenium.webdriver import Chrome, ChromeOptions

proxy_options = ChromeOptions()
# Send all browser traffic through the proxy (replace host/port with a real one).
proxy_options.add_argument('--proxy-server=http://代理服务器:端口')

b = Chrome(options=proxy_options)
b.get('https://movie.douban.com/top250')
# === selenium基本配置 (Selenium: basic configuration) ===
# Common Chrome options: hide the automation banner and block image loading.
from selenium.webdriver import Chrome, ChromeOptions

base_options = ChromeOptions()
# Drop the "Chrome is being controlled by automated software" info bar.
base_options.add_experimental_option('excludeSwitches', ['enable-automation'])
# 2 = block images, which speeds up page loads considerably.
base_options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

b = Chrome(options=base_options)
b.get('http://jd.com/')
# === selenium等待 (Selenium: waits) ===
# Selenium waits: implicit waits plus explicit WebDriverWait conditions.
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

"""
Implicit wait:
- Without it, looking up an element that does not exist raises immediately.
- With it, lookups keep retrying until the element appears or the timeout
  elapses (only then does it raise).
- It is set once per browser instance and applies to every element lookup
  made through that browser.
"""
options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

b = Chrome(options=options)
b.implicitly_wait(5)

# find_element_by_id was removed in Selenium 4; use find_element(By.ID, ...).
input_tag = b.find_element(By.ID, 'key')
input_tag.send_keys('钱包')

"""
Explicit wait:
1) Create a wait object: WebDriverWait(driver, timeout_seconds)
2) Add a wait condition:
   wait.until(condition)     - block until the condition holds
   wait.until_not(condition) - block until the condition stops holding
   Common conditions:
   presence_of_element_located(locator)               - element appears in the DOM
   text_to_be_present_in_element_value(locator, text) - element's value attribute contains text
   text_to_be_present_in_element(locator, text)       - element's text content contains text
   Locators are (By.xxx, value) tuples.
"""
wait = WebDriverWait(b, 10)
wait.until(EC.text_to_be_present_in_element_value((By.ID, 'key'), '电脑'))
# === 招聘岗位 (Job-listings scraper) ===
from json import loads
from time import sleep
import requests
from re import *
def get_ip():
    """Fetch one proxy address ("host:port") from the provider, retrying until it succeeds.

    The provider returns a plain "ip:port" line on success and a JSON error
    object (a body starting with '{') when extraction fails, e.g. when
    rate-limited. Returns the proxy address with surrounding whitespace removed.
    """
    # NOTE: the original URL contained "&mr=1®ions=" — an HTML-entity mangle
    # of "&mr=1&regions=" (&reg -> ®); restored here.
    url = 'http://d.jghttp.alicloudecs.com/getip?num=1&type=1&pro=&city=0&yys=0&port=11&time=2&ts=0&ys=0&cs=0&lb=4&sb=0&pb=4&mr=1&regions='
    while True:
        # Always bound network calls; a hung provider would otherwise block forever.
        response = requests.get(url, timeout=10)
        # Strip the trailing newline so the value can be used directly in proxies={...};
        # startswith also avoids an IndexError on an empty body (text[0] did not).
        text = response.text.strip()
        if text.startswith('{'):
            # JSON body => extraction failed; wait briefly and retry.
            print('提取失败,重试!')
            sleep(1)
            continue
        return text
def get_film():
    """Scrape the first page of 51job '数据分析' search results through a fresh proxy.

    Keeps requesting with a new proxy until the response contains the embedded
    window.__SEARCH_RESULT__ JSON payload, then hands that JSON to analsis_data.
    (Despite the name, this fetches job listings, not films.)
    """
    while True:
        ip = get_ip()
        try:
            # NOTE: the original URL contained "cotype=99°reefrom=99" — an
            # HTML-entity mangle of "&degreefrom=99" (&deg -> °); restored here.
            res = requests.get(
                url='https://search.51job.com/list/090200,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=&u_atoken=f992d683-e218-4b59-90ca-999de757cfb1&u_asession=01DGhgp4GFue9ucfJyf691VYhKTh7UHfrQhDVI5RdlbFchfNtoQU-AcOmfentpnK68X0KNBwm7Lovlpxjd_P_q4JsKWYrT3W_NKPr8w6oU7K_VFx8LMrlNUgQ_YhK6KzMdymCvuFU2gNCRIRJqGpb9omBkFo3NEHBv0PZUm6pbxQU&u_asig=05kTpl17A0roe5q06Aol0_P36AOy7Eqz1d3LxkXzeaVbFw3KdrBBLW5jP1elge9z41GtUEV5ptT_z0mlrjoyByEP-5w-PTS07JWhZrysYY8XCPGRCK4VAdT25xB_aW4I4lt4Bw_nOfA_XxqeBfqFLiAgEhsTJlw1KZ6-xIhK9LPnz9JS7q8ZD7Xtz2Ly-b0kmuyAKRFSVJkkdwVUnyHAIJzb0sIWqcjUeIyFsSHWLy9BTZVxECgJBFMPyNy5Min01nb4DvBakBj6x1SID70OM96u3h9VXwMyh6PgyDIVSG1W_FA_w2zcqz2S9VVtUlVwXufrRIiRBNyR_yo_llgqb7GZd243sLXCBidAEQMxG8aprXmP2xkUiEYWRHqvk51g0WmWspDxyAEEo4kbsryBKb9Q&u_aref=%2BqMXmjdyh9fWYrYKojZD3GivLqk%3D',
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
                },
                proxies={
                    'http': ip,
                    'https': ip
                },
                timeout=10
            )
        except requests.RequestException:
            # Dead/expired proxy or timeout: grab a new proxy and retry
            # instead of letting the exception kill the loop.
            continue
        # Dots escaped so the pattern matches the literal "window.__SEARCH_RESULT__".
        data_job = search(r'window\.__SEARCH_RESULT__ = (.+?)</script>', res.text)
        if data_job:
            analsis_data(data_job.group(1))
            break
def analsis_data(json_data):
    """Parse the 51job search-result JSON and print each job's name and salary.

    json_data: the JSON text captured from window.__SEARCH_RESULT__.
    """
    for job in loads(json_data)['engine_jds']:
        print(job['job_name'], job['providesalary_text'])
if __name__ == '__main__':
    # Run the 51job scraper only when executed as a script, not on import.
    get_film()