Day023 - requests Proxies and selenium
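
requests Proxies

requests routes traffic through a proxy via its proxies parameter; this is the request-level counterpart of the browser proxy option configured further down. A minimal sketch, with a purely hypothetical proxy address:

import requests

# Hypothetical proxy; substitute an address from your own proxy pool
proxy = 'http://127.0.0.1:7890'
response = requests.get(
    'https://movie.douban.com/top250',
    headers={'User-Agent': 'Mozilla/5.0'},
    proxies={'http': proxy, 'https': proxy},  # route both http and https through it
)
print(response.status_code)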

selenium

Basic selenium Usage

from selenium.webdriver import Chrome, Edge
  1. Create a browser object
web = Edge()
  2. Open a URL
web.get('https://www.gamersky.com')
  3. Get the page source
print(web.page_source)
  4. Close the browser
web.close()

Common Interactions

from selenium.webdriver import Edge
from selenium.webdriver.common.keys import Keys
from time import sleep

web = Edge()
web.get('https://www.51job.com/')  # the element ids below come from 51job's search page
  • Get a tag (the search input box)
in_put = web.find_element_by_id('kwdselectid')  # look up by id
# in_put = web.find_element_by_css_selector('#kwdselectid')  # equivalent lookup by CSS selector
print(in_put)
  • Type into the input box
in_put.send_keys('数据分析')  # enter the search keyword ("data analysis")
in_put.send_keys(Keys.ENTER)  # press Enter inside the input box
  • Get the page data
print(web.page_source)
  • Get the next-page tag
next_btn = web.find_element_by_css_selector('.next')  # avoid shadowing the built-in next()
  • Click the button
next_btn.click()
sleep(2)
print(web.page_source)
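
The fixed sleep(2) either wastes time or races the page load; Selenium's explicit waits block only until a condition holds. A sketch of the same next-page click using WebDriverWait (note that Selenium 4 also replaces the find_element_by_* helpers with find_element plus a By locator, which the condition below already uses):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the next-page button to become clickable
next_btn = WebDriverWait(web, 10).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, '.next'))
)
next_btn.click()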

Common selenium Configuration

Adding Options

from selenium.webdriver import Chrome, ChromeOptions
  • Create a Chrome options object
options = ChromeOptions()
  • Hide the "automated test software" infobar
options.add_experimental_option('excludeSwitches', ['enable-automation'])
  • Disable image loading
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
  • Set a proxy
options.add_argument(f'--proxy-server=http://{ips[randint(0, 3)]}')
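
ips and randint are not defined in the snippet above; a sketch of what they might look like, with purely hypothetical proxy addresses:

from random import randint

# Hypothetical proxy pool in ip:port form; randint(0, 3) picks one of the four at random
ips = ['10.0.0.1:8888', '10.0.0.2:8888', '10.0.0.3:8888', '10.0.0.4:8888']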

Creating the Browser Object

web = Chrome(options=options)
web.get('https://movie.douban.com/top250')

print(web.page_source)
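
To confirm that the proxy is actually in effect, one option is to load an IP-echo endpoint such as httpbin.org/ip and check which address the server reports:

web.get('https://httpbin.org/ip')  # the response body contains the origin IP the server saw
print(web.page_source)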

Scraping Taobao Data

import ast

from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys

options = ChromeOptions()
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

web = Chrome(options=options)
web.get('https://www.taobao.com/')

# Load cookies saved from an earlier logged-in session (a list of cookie dicts)
with open('cookies.txt', encoding='utf-8') as cookie_file:
    cookies = ast.literal_eval(cookie_file.read())  # literal_eval is safer than eval here
for cookie in cookies:
    if cookie['secure']:  # only re-add cookies flagged as secure
        web.add_cookie(cookie)

web.get('https://www.taobao.com/')
search_input = web.find_element_by_id('q')
search_input.send_keys('鞋子')
search_input.send_keys(Keys.ENTER)
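
cookies.txt is assumed to hold the literal list of cookie dicts returned by get_cookies(); one way to produce it is a one-off run with a manual login:

from selenium.webdriver import Chrome

web = Chrome()
web.get('https://www.taobao.com/')
input('Log in by hand in the browser window, then press Enter here...')
# Dump the session cookies as a Python literal that the loader above can read back
with open('cookies.txt', 'w', encoding='utf-8') as cookie_file:
    cookie_file.write(str(web.get_cookies()))
web.close()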

Scraping 51job Job Listings

import csv
import re
import time

from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup


def config():
    """
    Build the browser options.
    :return: configured ChromeOptions object
    """
    options = ChromeOptions()
    options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    return options


def web_config(web):
    """Open 51job and search for the target keyword."""
    web.get('https://www.51job.com/')
    in_put = web.find_element_by_id('kwdselectid')
    in_put.send_keys('数据分析')
    in_put.send_keys(Keys.ENTER)


if __name__ == '__main__':
    # Create the file that the scraped rows are written to
    f = open('files/51job1.csv', 'a', encoding='utf-8', newline='')
    writer = csv.writer(f)
    web = Chrome(options=config())
    web_config(web)
    # Get the maximum page count from the pager text
    page = re.findall(r'\d{2,}', web.find_element_by_class_name('tright').text)
    # for x in range(int(page[0])):  # loop over every results page
    for x in range(2):  # only two pages here, for testing
        print(x)
        info = web.page_source
        soup = BeautifulSoup(info, 'lxml')
        assign_info = soup.select('.j_joblist>.e')
        for eve_info in assign_info:
            name = eve_info.select_one('.jname').get_text()
            salary = eve_info.select_one('.info > .sal').get_text()
            company = eve_info.select_one('.er > a').get_text()
            link = eve_info.select_one('a').attrs['href']
            writer.writerow([name, salary, company, link])
        # Re-find the next-page button on every pass; the old reference goes
        # stale once the page reloads
        next_btn = web.find_element_by_class_name('next')
        next_btn.click()
        time.sleep(3)
    f.close()
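
On the last results page the next-page button may be missing, in which case find_element raises instead of returning. A hedged variant of the page-turn step that stops cleanly (the break belongs inside the loop above):

from selenium.common.exceptions import NoSuchElementException

try:
    next_btn = web.find_element_by_class_name('next')
    next_btn.click()
except NoSuchElementException:
    # no next button: assume the final page has been reached
    print('Reached the last page, stopping.')
    # break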