Hands-on with pyppeteer, a browser-automation workhorse for Python crawlers

There is now a new way to handle simulated login without being detected: it cleanly bypasses sites that check window.navigator.webdriver. pyppeteer is a great tool!
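For context, the usual trick is to overwrite navigator.webdriver before any page script runs. A minimal sketch of that idea (the launch flags and the injected JS snippet are my own illustration, not from the original post):

import asyncio
from pyppeteer import launch

async def main():
    # headless=False plus --disable-infobars makes the browser look less automated
    browser = await launch(headless=False, args=['--disable-infobars'])
    page = await browser.newPage()
    # Inject this JS into every new document before the page's own scripts run,
    # so checks on window.navigator.webdriver see undefined
    await page.evaluateOnNewDocument(
        '() => { Object.defineProperty(navigator, "webdriver", {get: () => undefined}) }'
    )
    await page.goto('https://example.com')
    await browser.close()

asyncio.get_event_loop().run_until_complete(main())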
Required Python packages: asyncio and pyppeteer.
A friendly reminder: the first time you run pyppeteer it downloads Chromium, which is very slow and may fail, so be patient! Also note that pyppeteer is fairly demanding on both network speed and machine performance.

Download notice (if no progress is displayed, you can kill the process and run it again):

[W:pyppeteer.chromium_downloader] start chromium download.
Download may take a few minutes.
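If the download is too slow or keeps failing, you can skip it entirely by pointing pyppeteer at a browser you already have. A sketch, assuming Chromium is installed at /usr/bin/chromium-browser (adjust the path for your machine):

import asyncio
from pyppeteer import launch

async def main():
    # executablePath makes pyppeteer use an existing binary instead of
    # downloading its own Chromium build
    browser = await launch(executablePath='/usr/bin/chromium-browser')
    page = await browser.newPage()
    await page.goto('https://example.com')
    await browser.close()

asyncio.get_event_loop().run_until_complete(main())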

If you run it on CentOS, install the following dependencies first:

yum install pango.x86_64 libXcomposite.x86_64 libXcursor.x86_64 libXdamage.x86_64 libXext.x86_64 libXi.x86_64 libXtst.x86_64 cups-libs.x86_64 libXScrnSaver.x86_64 libXrandr.x86_64 GConf2.x86_64 alsa-lib.x86_64 atk.x86_64 gtk3.x86_64 -y
browser = await launch()  # start a browser instance
page = await browser.newPage()  # open a blank page
await page.goto('https://example.com')  # enter the URL in the address bar and wait for it to load
await page.screenshot({'path': 'example.png'})  # take a screenshot
await page.setUserAgent(
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299')
await page.goto('https://www.baidu.com')
await page.type('#kw', keyword, {'delay': 100})  # focus the input box and type text; the delay can be randomized, e.g. {'delay': input_time_random() - 50}
await page.click('#su')
await page.waitForNavigation({'waitUntil': 'load'})  # wait for the page to finish loading, equivalent to window.onload
await browser.close()  # close the browser
content = await page.content()  # the page's HTML
cookies = await page.cookies()  # the page's cookies
await page.evaluate(js1)  # run JavaScript code (here the string js1) in the page
await page.keyboard.press  # simulate pressing a key; known bug: key combinations currently do not work on macOS
page.waitFor  # wait on the page: accepts a time in ms, a selector, or a function
page.url  # the URL currently being visited
await get_cookie(page)  # get_cookie is a helper defined elsewhere
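To illustrate the three forms page.waitFor accepts (these lines assume an open page inside an async function):

await page.waitFor(2000)  # wait a fixed time, in milliseconds
await page.waitFor('#kw')  # wait until the selector appears in the DOM
await page.waitFor('() => document.readyState === "complete"')  # wait until the JS predicate is true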
page.frames  # all frames (iframes) of the current page; pick the one you want by its name
iframe.$('.srchsongst')  # get a single element inside the iframe
iframe.evaluate()  # run a function in the browser, as if from the console; returns a Promise (a coroutine in pyppeteer)
Array.from  # converts an array-like object into a real array
page.click()  # click an element
iframe.$eval()  # runs document.querySelector inside the iframe and passes the matched element as the function's first argument
iframe.$$eval()  # runs document.querySelectorAll inside the iframe and passes the matched element array as the function's first argument
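Putting the iframe notes together, a sketch of drilling into a named iframe using pyppeteer's method names (the frame name 'songFrame' is hypothetical; '.srchsongst' is the selector from above):

# page.frames is a list of Frame objects in pyppeteer; pick one by name
frame = next(f for f in page.frames if f.name == 'songFrame')
element = await frame.querySelector('.srchsongst')  # single element inside the iframe
# run JS in the iframe's context; Array.from turns the NodeList into a real array
texts = await frame.evaluate(
    '() => Array.from(document.querySelectorAll(".srchsongst"), el => el.textContent)'
)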
# await page.waitFor(10000)
url_elements = await page.querySelectorAll('.result h3 a')  # grab the result links
for url_element in url_elements:
    url = await page.evaluate('(url_element) => url_element.href', url_element)
    url_queue.put(url)  # url_queue is a queue defined elsewhere
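The same extraction can be done in one round trip with querySelectorAllEval, pyppeteer's counterpart of $$eval:

# evaluate one JS function over every matched element and return the hrefs directly
urls = await page.querySelectorAllEval('.result h3 a', '(links) => links.map(a => a.href)')
for url in urls:
    url_queue.put(url)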

Code for crawling a page

import asyncio
import pyppeteer
import os

os.environ['PYPPETEER_CHROMIUM_REVISION'] = '588429'  # pin the Chromium revision to download
pyppeteer.DEBUG = True

async def main():
    print("in main")
    print(os.environ.get('PYPPETEER_CHROMIUM_REVISION'))
    browser = await pyppeteer.launch()
    page = await browser.newPage()
    await page.goto('http://www.baidu.com')
    
    content = await page.content()
    cookies = await page.cookies()
    # await page.screenshot({'path': 'example.png'})
    await browser.close()
    return {'content':content, 'cookies':cookies}

loop = asyncio.get_event_loop()
task = asyncio.ensure_future(main())
loop.run_until_complete(task)

print(task.result())
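On Python 3.7 and later, the event-loop boilerplate can be collapsed into a single call:

# equivalent to the get_event_loop / ensure_future / run_until_complete dance above
print(asyncio.run(main()))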

Integration with Scrapy

Add a downloader middleware

from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random
import pyppeteer
import asyncio
import os
from scrapy.http import HtmlResponse

pyppeteer.DEBUG = False 

class FundscrapyDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    def __init__(self):
        print("Init downloader middleware using pyppeteer.")
        os.environ['PYPPETEER_CHROMIUM_REVISION'] = '588429'
        print(os.environ.get('PYPPETEER_CHROMIUM_REVISION'))
        # __init__ is synchronous, so block here until the browser and page exist
        loop = asyncio.get_event_loop()
        task = asyncio.ensure_future(self.getbrowser())
        loop.run_until_complete(task)

        print(self.browser)
        print(self.page)
    async def getbrowser(self):
        self.browser = await pyppeteer.launch()
        self.page = await self.browser.newPage()  # one shared page, reused for every request

    async def getnewpage(self):
        return await self.browser.newPage()

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        # Block the Twisted reactor while the coroutine fetches the page,
        # then wrap the rendered HTML in an HtmlResponse for the spider
        loop = asyncio.get_event_loop()
        task = asyncio.ensure_future(self.usePypuppeteer(request))
        loop.run_until_complete(task)
        return HtmlResponse(url=request.url, body=task.result(), encoding="utf-8", request=request)

    async def usePypuppeteer(self, request):
        print(request.url)
        await self.page.goto(request.url)
        content = await self.page.content()
        return content

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
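
To activate the middleware, register it in settings.py. A sketch, assuming the project module is named fundscrapy and the class lives in middlewares.py (adjust the dotted path to your layout):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'fundscrapy.middlewares.FundscrapyDownloaderMiddleware': 543,
}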
