python爬虫代理和selenium

python爬虫代理和selenium

1.代理ip的使用

1.1 获取蘑菇代理中的代理ip

def get_ip():
    """Fetch one batch of proxy IPs from the Mogu proxy API.

    Returns:
        A list of 'ip:port' strings, or None when extraction failed.
        With format=2 a successful response is newline-separated plain
        text; an error response is a JSON object, so a body starting
        with '{' signals failure.
    """
    # timeout prevents the scraper from hanging forever if the API stalls
    response = requests.get(
        'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=775206edf3dc4329ba04568b75a66a30&count=4&expiryDate=0&format=2&newLine=3',
        timeout=10
    )
    # Guard against an empty body too (original indexed text[0] unconditionally,
    # which raised IndexError on an empty response).
    if not response.text or response.text[0] == '{':
        print('提取ip失败')
        return None
    return [x for x in response.text.split('\n') if x]

1.2 使用代理IP

def get_net_data():
    """Fetch the Douban Top250 page through freshly obtained proxy IPs.

    Prints the page HTML on success; prints a message and returns early
    when no proxy IPs could be obtained.
    """
    ip_list = get_ip()
    if not ip_list:
        print('ip获取失败,等10秒后重新获取')
        return

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
    }
    # requests expects a protocol -> 'ip:port' mapping for its proxies argument.
    proxy_map = {
        'http': ip_list[0],
        'https': ip_list[1]
    }

    resp = requests.get(
        'https://movie.douban.com/top250',
        headers=request_headers,
        proxies=proxy_map
    )
    print(resp.text)

1.3 优化代理使用

在获取代理失败时加入睡眠并循环重试,直到成功拿到代理 IP 才继续执行

import requests
import time

def get_ip():
    """Fetch one batch of proxy IPs from the Mogu proxy API.

    Returns:
        A list of 'ip:port' strings, or None when extraction failed.
        With format=2 a successful response is newline-separated plain
        text; an error response is a JSON object, so a body starting
        with '{' signals failure.
    """
    # timeout prevents the scraper from hanging forever if the API stalls
    response = requests.get(
        'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=775206edf3dc4329ba04568b75a66a30&count=4&expiryDate=0&format=2&newLine=3',
        timeout=10
    )
    # Guard against an empty body too (original indexed text[0] unconditionally,
    # which raised IndexError on an empty response).
    if not response.text or response.text[0] == '{':
        print('提取ip失败')
        return None
    return [x for x in response.text.split('\n') if x]

def get_net_data():
    """Fetch the Douban Top250 page through proxy IPs, retrying until a batch arrives.

    Blocks in a polling loop (10 s between attempts) until get_ip() returns a
    non-empty list, then requests the page through the first two proxies and
    prints the resulting HTML.
    """
    # Poll the proxy API until it actually hands back IPs.
    ip_list = get_ip()
    while not ip_list:
        time.sleep(10)
        ip_list = get_ip()

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
    }

    # Protocol -> 'ip:port' mapping consumed by requests' proxies argument.
    proxy_map = {
        'http': ip_list[0],
        'https': ip_list[1]
    }

    target = 'https://movie.douban.com/top250'
    resp = requests.get(target, headers=request_headers, proxies=proxy_map)
    print(resp.text)

# Script entry point: only fetch the page when run directly, not on import.
if __name__ == '__main__':
    get_net_data()

2.selenium的基本功能

2.1 创建浏览器对象(如果是全局变量,程序结束浏览器不会关闭,局部变量会自动关闭)

from selenium.webdriver import Chrome,Firefox
# Launch a Chrome-driven browser and load Baidu's home page.
b=Chrome()
b.get('https://www.baidu.com/')
# The same page can also be driven through a Firefox instance.
c=Firefox()
c.get('https://www.baidu.com/')

2.2 获取网页源代码

# Print the page source as currently rendered by the browser.
print(b.page_source)

2.3 关闭浏览器

# Explicitly close the browser window (per the note above global browser
# objects are not closed automatically when the program ends).
b.close()

3.selenium常规交互

# 1. Create the browser.
b=Chrome()
# 2. Open the target page.
b.get('https://www.51job.com/')
# 3. Locate the search input box. Two equivalent lookups exist:
#    by id ('kwdselectid') or by CSS selector, used below.
search_input=b.find_element_by_css_selector('#kwdselectid')
print(search_input)

# 4. Type the search query into the input box.
search_input.send_keys('数据分析')
# Press Enter inside the input box to submit the search.
search_input.send_keys(Keys.ENTER)

# 5. Print the source of the result page.
print(b.page_source)

# 6. Locate the "next page" button.
#    Renamed from `next` — that name shadowed the builtin next().
next_button=b.find_element_by_css_selector('.next')

# 7. Click it to advance one result page.
next_button.click()

# Give the next page a moment to load before reading its source.
time.sleep(1)
print(b.page_source)

4.selenium常规配置

def get_ip():
    """Request one batch of proxy IPs and return them as 'ip:port' strings.

    With format=2 the API returns newline-separated plain text on success and
    a JSON object on failure, so a body starting with '{' signals an error.
    """
    resp = requests.get('http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=775206edf3dc4329ba04568b75a66a30&count=4&expiryDate=0&format=2&newLine=3')
    body = resp.text
    if body[0] == '{':
        print('提取ip失败')
        return None
    return [line for line in body.split('\n') if line]

# Keep polling the proxy API until it returns a usable list of IPs.
while True:
    ips=get_ip()
    if ips:
        break
    time.sleep(5)

# 1. Create the Chrome configuration object.
options=ChromeOptions()

# 1.1 Hide the "controlled by automated test software" banner.
options.add_experimental_option('excludeSwitches', ['enable-automation'])

# 1.2 Disable image loading (value 2 = block) to speed up page fetches.
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

# 1.3 Route browser traffic through the first proxy IP.
options.add_argument(f'--proxy-server=http://{ips[0]}')

b=Chrome(options=options)
b.get('https://movie.douban.com/top250')
print(b.page_source)

5.获取和保存cookie

def save_cookie():
    """Open Taobao in a real browser, wait for a manual login, then persist cookies.

    The cookie list is written as a Python literal to files/taobao.txt so a
    later session can restore the logged-in state.
    """
    browser = Chrome()
    browser.get('https://taobao.com/')

    # Trigger a search so the site redirects to its login flow.
    box = browser.find_element_by_id('q')
    box.send_keys('口红')
    box.send_keys(Keys.ENTER)

    # Pause long enough for a human to complete the login by hand.
    time.sleep(11)

    # Grab every cookie from the logged-in session and save it to disk.
    cookie_list = browser.get_cookies()
    with open('files/taobao.txt', 'w', encoding='utf-8') as f:
        f.write(str(cookie_list))
    print(cookie_list)

save_cookie()

6.爬取淘宝

def get_ip():
    """Fetch one batch of proxy IPs from the Mogu proxy API.

    Returns:
        A list of 'ip:port' strings, or None when extraction failed.
        With format=2 a successful response is newline-separated plain
        text; an error response is a JSON object, so a body starting
        with '{' signals failure.
    """
    # timeout prevents the scraper from hanging forever if the API stalls
    response = requests.get(
        'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=775206edf3dc4329ba04568b75a66a30&count=4&expiryDate=0&format=2&newLine=3',
        timeout=10
    )
    # Guard against an empty body too (original indexed text[0] unconditionally,
    # which raised IndexError on an empty response).
    if not response.text or response.text[0] == '{':
        print('提取ip失败')
        return None
    # Echo the raw body for debugging, as in the original.
    print(response.text)
    return [x for x in response.text.split('\n') if x]

import ast

# Keep polling the proxy API until it returns a usable list of IPs.
while True:
    ips=get_ip()
    if ips:
        break
    time.sleep(5)

options=ChromeOptions()
# Disable image loading (value 2 = block) to speed up the page.
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# Route browser traffic through the second proxy IP.
options.add_argument(f'--proxy-server=http://{ips[1]}')

b=Chrome(options=options)
b.get('https://taobao.com/')

# Restore the cookies saved by save_cookie().
# ast.literal_eval replaces eval(): it parses only Python literals, so code
# smuggled into the cookie file cannot execute; `with` closes the file handle
# (the original leaked it).
with open('files/taobao.txt', encoding='utf-8') as f:
    cookies = ast.literal_eval(f.read())
for cookie in cookies:
    # Only secure cookies are restored, matching the original behaviour.
    if cookie['secure']:
        b.add_cookie(cookie)

search_input = b.find_element_by_id('q')
search_input.send_keys('口红')
search_input.send_keys(Keys.ENTER)

print(b.page_source)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值