一. selenium的基本设置
from selenium import webdriver
from selenium.webdriver import ChromeOptions

# Build the options object for this browser session.
options = ChromeOptions()
# Drop the "controlled by automated test software" switch: some sites use
# it to detect and block automation.
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Block image loading (value 2 = block) so pages render faster.
options.add_experimental_option(
    "prefs", {"profile.managed_default_content_settings.images": 2}
)

# Launch Chrome with the options above and open Baidu.
b = webdriver.Chrome(options=options)
b.get('https://www.baidu.com/')
二. selenium的交互
1.京东页面的模拟登录,但是有反爬机制,所以没有登录成功
import time
from selenium import webdriver
from selenium.webdriver import ActionChains

# Shared browser instance used by jing_dong() and scroll() below.
# NOTE: creating the driver here means a Chrome window opens at import time.
b = webdriver.Chrome()
def jing_dong():
    """Simulate logging in to JD.com and dragging the slider captcha.

    NOTE(review): JD has anti-bot protection, so the drag is detected and
    the login does not actually succeed (as the original notes stated).
    Uses the module-level driver ``b``.
    """
    # Open the JD home page.
    b.get('https://www.jd.com/')
    # Click the login entry on the home page.
    login_btn = b.find_element_by_css_selector('.user_login')
    login_btn.click()
    # Switch to the account/password login tab.
    user_btn = b.find_element_by_css_selector('.login-tab.login-tab-r')
    user_btn.click()
    # Fill in the account and password inputs.
    user_name = b.find_element_by_css_selector('#loginname')
    password = b.find_element_by_css_selector('#nloginpwd')
    user_name.send_keys('aaa')
    password.send_keys('123456')
    # Submit the login form; this pops up the slider captcha.
    login_btn = b.find_element_by_css_selector('.login-btn')
    login_btn.click()
    # Drag the slider with a single action chain.
    # BUG FIX: the original called perform() three times on the same
    # ActionChains; Selenium does not clear the queued actions between
    # perform() calls, so each call replayed all previous actions
    # (click_and_hold ran again inside the drag, etc.). Queue the whole
    # gesture once — press, drag 150px right, short pause, release —
    # and perform() it once.
    slider = b.find_element_by_css_selector('.JDJRV-slide-btn')
    action = ActionChains(b)
    action.click_and_hold(slider).move_by_offset(150, 0).pause(0.5).release()
    action.perform()
def scroll():
    """Scroll the JD home page downward, 100px per second, up to 15000px.

    Uses the module-level driver ``b``.
    """
    b.get('https://jd.com')
    # b.execute_script('alert("...")')  # demo: run arbitrary JS in the page
    height = 100
    while height < 15000:
        # BUG FIX: the original passed 'windows.scrollTo(...)'. There is
        # no 'windows' global in the browser, so the script raised a
        # ReferenceError and the page never scrolled; the global is
        # 'window'.
        b.execute_script(f'window.scrollTo(0, {height})')
        height += 100
        time.sleep(1)
    # Alternative approach: let the page scroll itself with a JS timer
    # (every 200 ms scroll down another 200px, stopping at the bottom):
    # js = """
    # height = 100
    # max = document.body.scrollHeight
    # t = setInterval(function(){
    #     window.scrollTo(0, height)
    #     height += 200
    #     if(height > max){
    #         clearInterval(t)
    #     }
    # }, 200)
    # """
    # b.execute_script(js)
# Run the JD login demo when this file is executed directly.
if __name__ == '__main__':
    jing_dong()
2.163邮箱的模拟登录
from selenium import webdriver
from selenium.webdriver import ChromeOptions

# Session options: hide the automation banner so the site is less likely
# to flag the scripted browser.
options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])

b = webdriver.Chrome(options=options)
b.get('https://mail.163.com/')

# The login form lives inside a nested page (an <iframe>), so the driver
# has to switch into that frame before its elements can be located.
# Step 1: grab the iframe element itself.
login_frame = b.find_element_by_css_selector('#loginDiv>iframe')
# Step 2: switch the driver's context into the nested page.
b.switch_to.frame(login_frame)
# Step 3: now the form fields inside the frame are reachable.
user_name = b.find_element_by_name('email')
password = b.find_element_by_name('password')
login_btn = b.find_element_by_id('dologin')
user_name.send_keys('')
password.send_keys('')
三. selenium的等待模块
# BUG FIX: this snippet used Keys, WebDriverWait, EC, By, time, webdriver
# and ChromeOptions without importing any of them; all imports added here
# so the snippet runs standalone.
import time

from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Browser setup: hide the automation banner.
options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
b = webdriver.Chrome(options=options)

# Search 51job for 'python' positions.
b.get('https://www.51job.com/')
# Renamed from 'input' so the builtin is not shadowed.
search_box = b.find_element_by_id('kwdselectid')
search_box.send_keys('python')
search_box.send_keys(Keys.ENTER)

# Page through 10 result pages.
for _ in range(10):
    # print(b.page_source)
    time.sleep(1)
    # Explicit wait: block (up to 10s) until the 'next' button is present,
    # instead of looking it up immediately and failing while the results
    # are still loading. Renamed from 'next' so the builtin is not shadowed.
    wait = WebDriverWait(b, 10)
    next_btn = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'next')))
    next_btn.click()
四. 获取代理ip
1.requests中设置代理ip
import requests
# 该代理是自己买的,所以已经用不了了
# ===================获取代理ip=============
def get_ip():
    """Fetch proxy IPs from the Mogu proxy API (paid key, now expired).

    Returns:
        A list of non-empty 'host:port' strings on success, or None when
        the HTTP request fails or the API reports an error.
    """
    url = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=6226c130427f487385ad7b5235bc603c&count=5&expiryDate=0&format=2&newLine=3'
    response = requests.get(url)
    if response.status_code != 200:
        print('请求失败')
        return None
    # On error the API answers with a JSON object ('{...}') instead of a
    # newline-separated ip list. startswith() also survives an empty
    # body, where the original response.text[0] raised IndexError.
    if response.text.startswith('{'):
        print('获取ip失败')
        return None
    return [x for x in response.text.split('\n') if x]
def use_proxy():
    """Fetch proxy IPs and request an Anjuke listing page through them."""
    ips = get_ip()
    if not ips:
        print('获取ip失败!')
        return
    # BUG FIX: the original indexed ips[1] unconditionally, which raised
    # IndexError whenever the API returned a single ip; fall back to
    # ips[0] for https in that case.
    proxy = {
        'http': ips[0],
        'https': ips[1] if len(ips) > 1 else ips[0],
    }
    print(proxy)
    response = requests.get('https://cd.fang.anjuke.com/loupan/all/p1/', proxies=proxy)
    if response.status_code == 200:
        print(response.text)
    else:
        print("请求失败", response)
# Run the requests-with-proxy demo when this file is executed directly.
if __name__ == '__main__':
    use_proxy()
2.selenium中设置代理ip
from selenium import webdriver
from selenium.webdriver import ChromeOptions
import requests
def get_ip():
    """Fetch proxy IPs from the Mogu proxy API (paid key, now expired).

    Returns:
        A list of non-empty 'host:port' strings on success, or None when
        the HTTP request fails or the API reports an error.
    """
    url = 'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=6226c130427f487385ad7b5235bc603c&count=5&expiryDate=0&format=2&newLine=3'
    response = requests.get(url)
    if response.status_code != 200:
        print('请求失败')
        return None
    # On error the API answers with a JSON object ('{...}') instead of a
    # newline-separated ip list. startswith() also survives an empty
    # body, where the original response.text[0] raised IndexError.
    if response.text.startswith('{'):
        print('获取ip失败')
        return None
    return [x for x in response.text.split('\n') if x]
options = ChromeOptions()
ips = get_ip()
if ips:
    # BUG FIX: the Chrome flag is '--proxy-server'; the original passed
    # the misspelled '--proxy-serve', which Chrome ignores, so requests
    # went out without any proxy at all.
    options.add_argument(f'--proxy-server=http://{ips[0]}')
    b = webdriver.Chrome(options=options)
    # BUG FIX: domain corrected from 'anjuku.com' (typo) to 'anjuke.com',
    # the site used by the requests-based example in this file.
    b.get('https://cd.fang.anjuke.com/')
else:
    print('请求失败')
五. 正则的数据解析
import re
import requests
def get_data():
    """Download the first Douban Top-250 page and hand it to the parser."""
    # A browser User-Agent is required: Douban rejects the default
    # python-requests UA.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    }
    response = requests.get('https://movie.douban.com/top250', headers=headers)
    if response.status_code != 200:
        print('请求失败')
        return
    analysis_data(response.text)
def analysis_data(data):
    """Extract movie titles from Douban Top-250 listing HTML.

    Args:
        data: HTML text of the listing page.

    Returns:
        List of captured titles (the first <span class="title"> after
        each <li>), which is also printed.
    """
    # BUG FIX: the original pattern ended with '.+?<li>'. Because
    # re.findall returns non-overlapping matches, that tail consumed the
    # NEXT item's opening <li>, so every second movie was skipped — and
    # the final item (with no following <li>) could never match at all.
    # Dropping the trailing <li> fixes both problems.
    re_str = r'(?s)<li>.*?<span class="title">(.+?)</span>'
    result = re.findall(re_str, data)
    print(result)
    return result
# Run the Douban scrape-and-parse demo when this file is executed directly.
if __name__ == '__main__':
    get_data()