Python scraping: concurrency with aiohttp and the selenium module

Using a single thread + multi-task coroutines

  • Coroutines
    • If a function definition is decorated with async, calling it returns a coroutine object instead of executing the body
  • Task objects:
    • A further wrapper around a coroutine object
  • Binding a callback
    • task.add_done_callback(func); func receives the task, and task.result() returns the coroutine's return value
  • Event loop object
    • The event loop holds task objects. Once started, it processes every registered task asynchronously (suspending and resuming them as needed)
    • Keywords: async, await
    • Caveat: inside a special (async) function, do not call code from modules that lack async support, or the blocking call stalls the event loop and defeats the whole asynchronous effect
  • aiohttp: a module that supports asynchronous requests
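The task and callback concepts above can be sketched with the standard library alone, before bringing in any network code. This is a minimal illustration; the names work and on_done are made up for the example:

```python
# Stdlib-only sketch: wrap a coroutine in a task object, bind a
# done-callback, and read the return value through task.result().
import asyncio

results = []

async def work():
    # any awaitable body; here it just returns a value
    return "done"

def on_done(task):
    # the callback receives the finished task; result() gives the return value
    results.append(task.result())

loop = asyncio.new_event_loop()
task = loop.create_task(work())     # task object: a wrapper around the coroutine
task.add_done_callback(on_done)     # bind the callback
loop.run_until_complete(task)
loop.close()
print(results)  # → ['done']
```

The aiohttp example below follows exactly this shape, only with a real HTTP request inside the coroutine.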
import aiohttp
import asyncio
from lxml import etree

urls = [
    'http://127.0.0.1:8000/app_01/show/',
    'http://127.0.0.1:8000/app_01/job/',
    'http://127.0.0.1:8000/app_01/exec/',
]


# The special (async) function: sends the request and captures the response data
# Detail: put async before every with, and await before every blocking operation
async def get_request(url):
    async with aiohttp.ClientSession() as s:
        # s.get() also accepts headers=..., proxy="http://ip:port", params=...
        async with s.get(url) as response:
            page_text = await response.text()  # response.read() returns bytes instead
            return page_text


# Callback function: parses a page once its task finishes
def parse(task):
    page_text = task.result()
    tree = etree.HTML(page_text)
    tr_text = tree.xpath('/html/body/table/tbody/tr')
    print(tr_text)


tasks = []
for url in urls:
    c = get_request(url)
    task = asyncio.ensure_future(c)
    task.add_done_callback(parse)
    tasks.append(task)

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
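The caveat about non-async modules can be demonstrated with the standard library alone. In this sketch, three simulated "requests" awaiting asyncio.sleep overlap and finish in about 0.2 s total; replacing the awaited asyncio.sleep with a blocking time.sleep(0.2) would serialize them to roughly 0.6 s:

```python
# Stdlib-only sketch of the blocking-call caveat: awaitable sleeps overlap,
# so three 0.2 s "requests" take ~0.2 s total rather than ~0.6 s.
import asyncio
import time

async def fake_request(n):
    await asyncio.sleep(0.2)    # stands in for an awaitable network call
    return n

async def main():
    return await asyncio.gather(*(fake_request(i) for i in range(3)))

start = time.perf_counter()
results = asyncio.run(main())
elapsed = time.perf_counter() - start
print(results)  # → [0, 1, 2]
print(elapsed < 0.5)  # True: the tasks ran concurrently
```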

 

The selenium module in web scraping

  • Concept: a module based on browser automation
  • Relevance to scraping:
    • Conveniently captures dynamically loaded data (what you see is what you can get)
    • Enables simulated login
  • Installation: pip install selenium
  • Basic usage
    • Obtain the driver program for the chosen browser
    • Instantiate the corresponding browser object
  • Action chains
    • A series of continuous actions
    • If the tag you want to locate sits inside an iframe, you must first switch into it: bro.switch_to.frame('id')
  • Headless browsers: browsers with no visible UI
    • PhantomJS: no longer maintained
    • Headless Chrome
  • Evading detection
    • Sites check window.navigator.webdriver to detect requests sent by selenium

 

Basic selenium usage

from selenium import webdriver
from time import sleep

bro = webdriver.Chrome(executable_path="chromedriver.exe")      # path to the driver
bro.get('https://www.jd.com/')
sleep(1)
# locate the tag
search_input = bro.find_element_by_id("key")
search_input.send_keys("Mac pro")

btn = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
btn.click()     # click the search button
sleep(2)
# execute JS: scroll to the bottom of the page
bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
sleep(2)

page_text = bro.page_source
print(page_text)
sleep(2)
bro.quit()

Using selenium action chains

from selenium import webdriver
from time import sleep
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path="chromedriver.exe")
bro.get("https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable")
bro.switch_to.frame("iframeResult")
div_tag = bro.find_element_by_id("draggable")

# dragging = click-and-hold + move
action = ActionChains(bro)
action.click_and_hold(div_tag)

for i in range(5):
    # perform() executes the queued actions immediately
    action.move_by_offset(15, 10).perform()
    sleep(0.5)

action.release().perform()    # release the mouse button
sleep(2)
bro.quit()

 

Scraping a site that loads data dynamically

from selenium import webdriver
from time import sleep
from lxml import etree

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('http://125.35.6.84:81/xk/')
sleep(2)

page_text = bro.page_source
page_list = [page_text]

for i in range(3):
    bro.find_element_by_id("pageIto_next").click()
    sleep(2)
    page_list.append(bro.page_source)

bro.quit()

for page in page_list:
    tree = etree.HTML(page)
    li_list = tree.xpath('//*[@id="gzlist"]/li')
    for li in li_list:
        title = li.xpath("./dl/@title")[0]
        id = li.xpath("./ol/@title")[0]
        print(id, ":", title)

Simulated login to 12306: clicking the captcha images

from selenium import webdriver
from time import sleep
from PIL import Image
from chaojiying_Python.chaojiying import Chaojiying_Client
from selenium.webdriver import ActionChains
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions


def tranformImageData(img_path, t_type):
    # Chaojiying is a third-party captcha-recognition service; replace 'xxx' with real credentials
    chaojiying = Chaojiying_Client('xxx', 'xxx', '1004')
    im = open(img_path, 'rb').read()
    return chaojiying.PostPic(im, t_type)['pic_str']


option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])

# pass the anti-detection options to the browser instance actually used below
bro = webdriver.Chrome(executable_path='chromedriver.exe', options=option)
bro.get("https://kyfw.12306.cn/otn/login/init")
sleep(1)
bro.save_screenshot("main.png")

code_img = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
location = code_img.location    # top-left corner of the element
size = code_img.size            # width and height
# crop box: (left, upper, right, lower)
rangle = (int(location['x']), int(location['y']), int(location['x']+size['width']), int(location['y']+size['height']))

i = Image.open("main.png")
frame = i.crop(rangle)
frame.save('code.png')

image_location = tranformImageData("code.png", 9004)
print(image_location)
# sample response: image_location = "114,58|185,82|259,66"
image_list = image_location.split("|")
result = []     # e.g. [['114', '58'], ['185', '82']]
for i in image_list:
    x_y_list = i.split(",")
    result.append(x_y_list)

new_result = []    # e.g. [[114, 58], [185, 82]]
for i in result:
    j = [int(k) for k in i]
    new_result.append(j)

action = ActionChains(bro)
for a in new_result:
    x, y = a
    action.move_to_element_with_offset(code_img, x, y).click().perform()    # move to the offset within the image, then click
    sleep(1)
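The two conversion loops above can also be collapsed into a single comprehension. A stdlib-only sketch, using the sample string from the comment:

```python
# Parse the "x1,y1|x2,y2|..." string returned by the captcha service
# into a list of integer coordinate pairs in one pass.
image_location = "114,58|185,82|259,66"  # sample value from the comment above
points = [[int(n) for n in pair.split(",")] for pair in image_location.split("|")]
print(points)  # → [[114, 58], [185, 82], [259, 66]]
```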

Headless browser setup

from selenium.webdriver.chrome.options import Options
from selenium import webdriver

# configure headless Chrome
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

driver = webdriver.Chrome(executable_path='chromedriver.exe', options=chrome_options)
driver.get("https://www.baidu.com")
print(driver.page_source)

Evading selenium detection

from selenium.webdriver.chrome.options import Options
from selenium import webdriver

# compatible with both older and newer Chrome versions
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path='./chromedriver')
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
  "source": """
    Object.defineProperty(navigator, 'webdriver', {
      get: () => undefined
    })
  """
})
driver.get('http://www.baidu.com')

 
