day3-代理和selenium
1.代理 ip 的使用
(1) 获取蘑菇代理中的代理 IP
import requests
def get_ip():
    """Fetch proxy IPs from the Mogu proxy API.

    Returns:
        list[str] | None: a list of 'host:port' strings, or None when
        extraction failed. Error responses from this API are JSON
        objects (start with '{'), success responses are plain IP lines.
    """
    response = requests.get(
        'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=775206edf3dc4329ba04568b75a66a30&count=4&expiryDate=0&format=2&newLine=3',
        timeout=10,  # never hang forever on a stalled proxy API
    )
    # Guard the empty-body case before indexing the first character.
    if not response.text or response.text[0] == '{':
        print('提取IP失败')
        return None
    # splitlines() also copes with '\r\n' endings; drop blank lines.
    return [x for x in response.text.splitlines() if x]
def get_net_data():
    """Fetch the Douban Top250 page through freshly extracted proxy IPs.

    Prints the page HTML on success; prints a hint and returns early
    when no proxy IP could be obtained.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
    }
    ips = get_ip()
    if not ips:
        print('ip获取失败,等10s以后重新运行')
        return
    proxies = {
        'http': ips[0],
        # The API is asked for 4 IPs but may return fewer; fall back to
        # the first IP instead of raising IndexError on ips[1].
        'https': ips[1] if len(ips) > 1 else ips[0],
    }
    response = requests.get('https://movie.douban.com/top250',
                            headers=headers, proxies=proxies, timeout=10)
    print(response.text)
if __name__ == '__main__':
    # Quick manual check: print the freshly extracted proxy IP list.
    print(get_ip())
2.使用代理的优化程序
import requests
import time
def get_ip():
    """Fetch proxy IPs from the Mogu proxy API.

    Returns:
        list[str] | None: a list of 'host:port' strings, or None when
        extraction failed. Error responses from this API are JSON
        objects (start with '{'), success responses are plain IP lines.
    """
    response = requests.get(
        'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=775206edf3dc4329ba04568b75a66a30&count=4&expiryDate=0&format=2&newLine=3',
        timeout=10,  # never hang forever on a stalled proxy API
    )
    # Guard the empty-body case before indexing the first character.
    if not response.text or response.text[0] == '{':
        print('提取IP失败')
        return None
    # splitlines() also copes with '\r\n' endings; drop blank lines.
    return [x for x in response.text.splitlines() if x]
def get_net_data():
    """Fetch Douban Top250 through a proxy, retrying IP extraction until it works.

    Polls the proxy API with a 5-second back-off until at least one IP
    is returned, then requests the page through it and prints the HTML.
    """
    # Keep polling until the API hands back IPs; sleep between attempts
    # so we don't hammer the endpoint.
    while True:
        ips = get_ip()
        if ips:
            break
        time.sleep(5)
    print('ip获取成功:', ips)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
    }
    proxies = {
        'http': ips[0],
        # Fewer than two IPs may come back; reuse the first instead of
        # raising IndexError on ips[1].
        'https': ips[1] if len(ips) > 1 else ips[0],
    }
    url = 'https://movie.douban.com/top250'
    response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    print(response.text)
get_net_data()
3.selenium 的基本功能
from selenium.webdriver import Chrome
# Launch Chrome, load 51job's home page and dump the rendered HTML.
b = Chrome()
try:
    b.get('https://www.51job.com/')
    print(b.page_source)
finally:
    # Always release the browser process, even if get() fails —
    # otherwise every failed run leaks a Chrome instance.
    b.quit()
4.selenium 的常规交互
import time
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
b = Chrome()
b.get('https://www.51job.com/')

# Type the query into the search box and submit with ENTER, like a real user.
search_input = b.find_element_by_css_selector('#kwdselectid')
search_input.send_keys('数据分析')
search_input.send_keys(Keys.ENTER)
time.sleep(1)  # let the result page render before reading page_source
print(b.page_source)
print('--------------------------------------------------------')

# 'next_btn' instead of 'next': don't shadow the builtin next().
next_btn = b.find_element_by_css_selector('.next')
next_btn.click()
print('+++++++++++++++++++++++++++++')
time.sleep(1)  # give page 2 time to load
print(b.page_source)
5.selenium 的常用配置
from selenium.webdriver import Chrome, ChromeOptions
import requests
import time
def get_ip():
    """Fetch proxy IPs from the Mogu proxy API.

    Returns:
        list[str] | None: a list of 'host:port' strings, or None when
        extraction failed. Error responses from this API are JSON
        objects (start with '{'), success responses are plain IP lines.
    """
    response = requests.get(
        'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=775206edf3dc4329ba04568b75a66a30&count=4&expiryDate=0&format=2&newLine=3',
        timeout=10,  # never hang forever on a stalled proxy API
    )
    # Guard the empty-body case before indexing the first character.
    if not response.text or response.text[0] == '{':
        print('提取IP失败')
        return None
    # splitlines() also copes with '\r\n' endings; drop blank lines.
    return [x for x in response.text.splitlines() if x]
# Poll the proxy API until at least one IP comes back.
ips = get_ip()
while not ips:
    time.sleep(1)
    ips = get_ip()
print(ips)

# Chrome configuration: hide the "controlled by automation" banner,
# skip image downloads, and route traffic through the first proxy IP.
options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
options.add_argument(f'--proxy-server=http://{ips[0]}')

b = Chrome(options=options)
b.get('https://movie.douban.com/top250')
print(b.page_source)
6.爬淘宝
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys
import ast  # safe literal parsing — replaces eval on file content

# Open Taobao without downloading images (faster page loads).
options = ChromeOptions()
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
b = Chrome(options=options)
b.get('https://www.taobao.com')

# Replay previously saved login cookies (written by save_cookie as a
# Python literal). ast.literal_eval parses the literal without the
# arbitrary-code-execution risk of eval; 'with' guarantees the file closes.
with open('files/taobao.txt', encoding='utf-8') as f:
    cookies = ast.literal_eval(f.read())
for cookie in cookies:
    if cookie['secure']:
        b.add_cookie(cookie)

# Reload so the page picks up the injected cookies, then search.
b.get('https://www.taobao.com')
search_input = b.find_element_by_id('q')
search_input.send_keys('鞋子')
search_input.send_keys(Keys.ENTER)
7.获取和保存cookie值
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys
import time
def save_cookie():
    """Open Taobao in Chrome, let the user log in manually, then persist cookies.

    The input() call pauses the script so the user can finish logging in
    (or solve a captcha) in the opened browser; the session cookies are
    then written to files/taobao.txt as a Python literal for later replay.
    """
    b = Chrome()
    b.get('https://www.taobao.com')
    search_input = b.find_element_by_id('q')
    search_input.send_keys('鞋子')
    search_input.send_keys(Keys.ENTER)
    input('是否继续:')  # block here until the user has logged in
    cookies = b.get_cookies()
    # 'with' guarantees the file is closed even if write() raises.
    with open('files/taobao.txt', 'w', encoding='utf-8') as f:
        f.write(str(cookies))
save_cookie()