python - pychrome 页面抓取测试
Max.Bai
2019.06
记录一下去年做页面抓取时写的脚本。
使用 headless Chrome 抓取页面内容,借助 Python 的 pychrome 包。
要先开启浏览器(并打开远程调试端口),
然后通过 pychrome 调用 Chrome DevTools Protocol。
#! python3
# _*_ coding:utf-8 _*_
__author__ = 'Max.Bai'
__date__ = '2018.06'
import pychrome
import threadpool
import threading
# Module-level locks, created once at import time.
# NOTE(review): the code that acquires them is not visible in this part of the
# file; presumably ticket_lock serializes task/ticket hand-out and p_lock
# serializes printing across worker threads -- confirm against the callers.
ticket_lock = threading.Lock()
p_lock = threading.Lock()
import time
# start chrome first
# "C:\Program Files (x86)\Google\Chrome\Application\chrome.exe" --headless --remote-debugging-port=9222 --disable-gpu --remote-debugging-address=0.0.0.0
# google-chrome --no-sandbox --headless --remote-debugging-port=9222 --user-data-dir=/home/tools/chrome/temp/ --remote-debugging-address=0.0.0.0 --disable-gpu
class BrowserManager(object):
_tab_pool = {}
_browsers = {}
@staticmethod
def add_browser(host, port, tab_count=5):
browser_key = "{}:{}".format(host.lower(), port)
if browser_key in BrowserManager._browsers:
br = BrowserManager._browsers[browser_key]
else:
browser_url = "http://{}:{}".format(host.lower(), port)
br = pychrome.Browser(url=browser_url)
BrowserManager._browsers[browser_key] = br
tabs = br.list_tab(5)
if tab_count > len(tabs):
for i in range(0, (tab_count - len(tabs))):
br.new_tab()
br.list_tab(5)
BrowserManager._tab_pool.update(br