python爬虫代理和selenium
1.代理ip的使用
1.1 获取蘑菇代理中的代理ip
def get_ip():
    """Fetch a batch of proxy IPs from the Mogu proxy API.

    Returns:
        list[str] | None: 'ip:port' strings on success, or None when the
        API reports a failure.
    """
    response = requests.get('http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=775206edf3dc4329ba04568b75a66a30&count=4&expiryDate=0&format=2&newLine=3')
    # Success is plain 'ip:port' lines; failure is a JSON object, so a
    # leading '{' means extraction failed.  startswith() also avoids the
    # IndexError the original text[0] check raised on an empty body.
    if response.text.startswith('{'):
        print('提取ip失败')
        return None
    # Filter out empty strings produced by the trailing newline.
    return [x for x in response.text.split('\n') if x]
1.2 使用代理IP
def get_net_data():
    """Fetch the Douban Top250 page through freshly acquired proxy IPs.

    Prints the page source on success; prints a message and returns early
    when no proxy IPs could be obtained.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
    }
    ips = get_ip()
    if not ips:
        print('ip获取失败,等10秒后重新获取')
        return
    # proxies maps scheme -> 'ip:port'.  Fall back to the first IP for
    # https when the API returned fewer than two entries (the original
    # indexed ips[1] unconditionally, which could raise IndexError).
    proxies = {
        'http': ips[0],
        'https': ips[1] if len(ips) > 1 else ips[0]
    }
    response = requests.get('https://movie.douban.com/top250', headers=headers, proxies=proxies)
    print(response.text)
1.3 优化代理使用
在代理获取中加一个循环和睡眠，直到成功获取到代理IP才继续执行
import requests
import time
def get_ip():
    """Pull a batch of proxy IPs from the Mogu API.

    Returns a list of 'ip:port' strings, or None when the API replies
    with an error payload.
    """
    reply = requests.get('http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=775206edf3dc4329ba04568b75a66a30&count=4&expiryDate=0&format=2&newLine=3')
    body = reply.text
    # Error responses are JSON objects, so they begin with '{'.
    if body[0] == '{':
        print('提取ip失败')
        return None
    # Discard empty lines left by the trailing newline.
    return list(filter(None, body.split('\n')))
def get_net_data():
    """Fetch the Douban Top250 page, retrying proxy acquisition until it works.

    Blocks until get_ip() returns a non-empty list (sleeping 10 s between
    attempts), then requests the page through the obtained proxies and
    prints its HTML source.
    """
    # Keep asking for proxy IPs until the API hands some back.
    while True:
        ips = get_ip()
        if ips:
            break
        time.sleep(10)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
    }
    # Guard against a batch with fewer than two IPs (the original indexed
    # ips[1] unconditionally, which could raise IndexError).
    proxies = {
        'http': ips[0],
        'https': ips[1] if len(ips) > 1 else ips[0]
    }
    url = 'https://movie.douban.com/top250'
    response = requests.get(url, headers=headers, proxies=proxies)
    print(response.text)
# Script entry point: run the crawl only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    get_net_data()
2.selenium的基本功能
2.1 创建浏览器对象(如果是全局变量,程序结束浏览器不会关闭,局部变量会自动关闭)
from selenium.webdriver import Chrome,Firefox
# Create a Chrome browser object and navigate it to a URL.
# NOTE(review): as a module-level (global) variable the browser stays open
# after the script ends; a function-local browser closes automatically.
b=Chrome()
b.get('https://www.baidu.com/')
# Firefox is driven through the same webdriver interface.
c=Firefox()
c.get('https://www.baidu.com/')
2.2 获取网页源代码
print(b.page_source)
2.3 关闭浏览器
b.close()
3.selenium常规交互
# 1. Create the browser.
b = Chrome()
# 2. Open the job-listing site.
b.get('https://www.51job.com/')
# 3. Locate the search input box via CSS selector (the commented line shows
#    the equivalent find_element_by_id call).
# s_input = b.find_element_by_id('kwdselectid')
search_input = b.find_element_by_css_selector('#kwdselectid')
print(search_input)
# 4. Type the search keyword into the input box.
search_input.send_keys('数据分析')
# Press Enter inside the input box to submit the search.
search_input.send_keys(Keys.ENTER)
# 5. Dump the result page's HTML.
print(b.page_source)
# 6. Locate the "next page" button.  Renamed from `next` so the local
#    variable no longer shadows the builtin next().
next_button = b.find_element_by_css_selector('.next')
# 7. Click it, then give the page a moment to load before reading it.
next_button.click()
time.sleep(1)
print(b.page_source)
4.selenium常规配置
def get_ip():
    """Fetch proxy IPs from the Mogu API; return a list or None on failure."""
    reply = requests.get('http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=775206edf3dc4329ba04568b75a66a30&count=4&expiryDate=0&format=2&newLine=3')
    text = reply.text
    # JSON error payloads start with '{'; plain ip:port lines do not.
    if text[0] == '{':
        print('提取ip失败')
        return None
    # Collect the non-empty lines.
    result = []
    for line in text.split('\n'):
        if line:
            result.append(line)
    return result
# Retry until the proxy API returns at least one usable IP.
while True:
    ips=get_ip()
    if ips:
        break
    time.sleep(5)
# 1. Create a Chrome configuration (options) object.
options=ChromeOptions()
# 1.1 Remove the "controlled by automated test software" banner.
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# 1.2 Disable image loading (value 2 = block) to speed up page fetches.
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# 1.3 Route browser traffic through the first acquired proxy IP.
options.add_argument(f'--proxy-server=http://{ips[0]}')
b=Chrome(options=options)
b.get('https://movie.douban.com/top250')
print(b.page_source)
5.获取和保存cookie
def save_cookie():
    """Open Taobao in a visible browser, wait for a manual login, then dump
    the session cookies to files/taobao.txt.

    The cookies are written as str(list_of_dicts) so the companion crawler
    can read them back with eval().
    """
    # Open the browser and trigger the login flow by searching.
    b=Chrome()
    b.get('https://taobao.com/')
    search_input =b.find_element_by_id('q')
    search_input.send_keys('口红')
    search_input.send_keys(Keys.ENTER)
    # Searching redirects to the login page; give a human 11 s to log in.
    time.sleep(11)
    # Grab every cookie of the now-authenticated session.
    cookies=b.get_cookies()
    # NOTE(review): assumes the 'files' directory already exists — verify,
    # or the open() call raises FileNotFoundError.
    with open('files/taobao.txt', 'w', encoding='utf-8') as f:
        f.write(str(cookies))
    print(cookies)
save_cookie()
6.爬取淘宝
def get_ip():
    """Fetch proxy IPs from the Mogu API, echoing the raw payload.

    Returns a list of 'ip:port' strings, or None on an API error.
    """
    reply = requests.get('http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=775206edf3dc4329ba04568b75a66a30&count=4&expiryDate=0&format=2&newLine=3')
    payload = reply.text
    # An error payload is a JSON object and therefore starts with '{'.
    if payload[0] == '{':
        print('提取ip失败')
        return None
    # Echo the raw response, then return its non-empty lines.
    print(payload)
    return list(filter(None, payload.split('\n')))
# Retry until the proxy API returns usable IPs.
while True:
    ips=get_ip()
    if ips:
        break
    time.sleep(5)
# Configure Chrome: skip image loading and route through the second proxy IP.
options=ChromeOptions()
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
options.add_argument(f'--proxy-server=http://{ips[1]}')
b=Chrome(options=options)
b.get('https://taobao.com/')
# Restore the saved login session from the file written by save_cookie().
# NOTE(review): eval() on file contents executes arbitrary code if the file
# is tampered with — acceptable only for this local demo; prefer json.
cookies=eval(open('files/taobao.txt',encoding='utf-8').read())
for cookie in cookies:
    # Only replay cookies flagged secure — TODO confirm this filter is the
    # intended behavior rather than replaying all cookies.
    if cookie['secure']:
        b.add_cookie(cookie)
search_input =b.find_element_by_id('q')
search_input.send_keys('口红')
search_input.send_keys(Keys.ENTER)
print(b.page_source)