browserless 对 ws 的处理实际上是一个 proxy:它对实际启动的无头浏览器的 ws 服务进行了代理,同时出于安全考虑,browserless 还增加了 token 的处理
以下对于内部实现进行一个简单说明
参考处理
- ws route 注册
browserless 模块中的start 方法
wsRoutes 是实现WebSocketRoute 或者BrowserWebsocketRoute 的子类
BrowserWebsocketRoute 子类如下
- ChromiumCDPWebSocketRoute ws route 定义
可以看到需要一个 browser 对象,此对象属于懒加载,只有在需要的时候才会创建。每个路由 Route 都会通过一个 BrowserManager 对象实现浏览器的管理;对于不同浏览器的实现,则是直接传递实际的 browser 实现类,比如 ChromiumCDPWebSocketRoute 使用的是 ChromiumCDP
/**
 * WebSocket route for clients that speak the chrome-devtools-protocol
 * (puppeteer and similar libraries).
 *
 * The route declares `ChromiumCDP` as its browser implementation; the handler
 * receives an instance of it and hands the upgraded socket over to that
 * instance's `proxyWebSocket`.
 */
export default class ChromiumCDPWebSocketRoute extends BrowserWebsocketRoute {
  name = BrowserlessRoutes.ChromiumCDPWebSocketRoute;
  // NOTE(review): presumably enables token checking for this route — confirm in the router.
  auth = true;
  // Browser implementation class used for this route.
  browser = ChromiumCDP;
  // When true, the router wraps the handler with its limiter on registration.
  concurrency = true;
  description = `Launch and connect to Chromium with a library like puppeteer or others that work over chrome-devtools-protocol.`;
  path = [WebsocketRoutes['/'], WebsocketRoutes.chromium];
  tags = [APITags.browserWS];

  /**
   * Proxies the incoming WebSocket upgrade to the supplied browser.
   *
   * @param req - The incoming upgrade request.
   * @param socket - The raw duplex socket of the upgrade.
   * @param head - The first packet of the upgraded stream.
   * @param _logger - Unused by this route.
   * @param browser - The `ChromiumCDP` instance serving this request.
   */
  handler = async (
    req: Request,
    socket: Duplex,
    head: Buffer,
    _logger: Logger,
    browser: ChromiumCDP,
  ): Promise<void> => {
    return browser.proxyWebSocket(req, socket, head);
  };
}
ChromiumCDP 实现的功能
ChromiumCDP 实现了实际通过websocket 访问浏览器的能力,同时也包含了对无头浏览器的启动管理,详细的可以查看ChromiumCDP 类
浏览器的启动
核心是 router 中对 websocket handler 的一个包装方法
/**
 * Registers a WebSocket route (excerpt).
 *
 * The route's handler is bound to the route, wrapped by
 * `wrapWebSocketHandler`, and, when the route opts into concurrency control,
 * additionally passed through the limiter.
 *
 * @param route - The WebSocket route (with or without a browser) to register.
 * @returns The route, per the declared return type; the returning code is
 *   not part of this excerpt.
 */
public registerWebSocketRoute(
  route: WebSocketRoute | BrowserWebsocketRoute,
): WebSocketRoute | BrowserWebsocketRoute {
  this.log.trace(`Registering WebSocket "${route.path}"`);
  const bound = route.handler.bind(route);
  const wrapped = this.wrapWebSocketHandler(route, bound);
  // Concurrency control: routes with `concurrency` set go through the limiter,
  // which is also given the queue-full, timeout and get-timeout callbacks.
  route.handler = route.concurrency
    ? this.limiter.limit(
        wrapped,
        this.onQueueFullWebSocket,
        this.onWebsocketTimeout,
        this.getTimeout,
      )
    : wrapped;
  // ... (the remainder of the method is omitted in this excerpt)
wrapWebSocketHandler 的处理
/**
 * Produces the function that services a WebSocket upgrade for `route`.
 *
 * For routes that declare a browser, the wrapper obtains one from the browser
 * manager, runs the handler with it, and always reports completion back to
 * the manager afterwards. Routes without a browser simply have their handler
 * invoked with a fresh logger.
 *
 * @param route - The route the handler belongs to.
 * @param handler - The route's (already bound) handler.
 * @returns An async function taking the upgrade request, socket and head.
 */
protected wrapWebSocketHandler =
  (
    route: WebSocketRoute | BrowserWebsocketRoute,
    handler: WebSocketRoute['handler'] | BrowserWebsocketRoute['handler'],
  ) =>
  async (req: Request, socket: stream.Duplex, head: Buffer) => {
    const closedEarlyMessage = `WebSocket Request has closed prior to running`;

    // The client may already be gone before any work starts.
    if (!isConnected(socket)) {
      this.log.warn(closedEarlyMessage);
      return Promise.resolve();
    }

    const requestLogger = new this.logger(route.name, req);

    if ('browser' in route && route.browser) {
      // Ask the browser manager for the browser that serves this request.
      const instance = await this.browserManager.getBrowserForRequest(
        req,
        route,
        requestLogger,
      );

      // Acquiring the browser is asynchronous, so re-check the socket and
      // hand the browser back if the client left in the meantime.
      if (!isConnected(socket)) {
        this.log.warn(closedEarlyMessage);
        this.browserManager.complete(instance);
        return Promise.resolve();
      }

      if (!instance) {
        return writeResponse(socket, 500, `Error loading the browser.`);
      }

      try {
        this.log.trace(`Running found WebSocket handler.`);
        await handler(req, socket, head, requestLogger, instance);
      } finally {
        // Runs whether the handler succeeded or threw.
        this.log.trace(`WebSocket Request handler has finished.`);
        this.browserManager.complete(instance);
      }
      return;
    }

    return (handler as WebSocketRoute['handler'])(
      req,
      socket,
      head,
      requestLogger,
    );
  };
getBrowserForRequest 处理
整个代码还是比较长的,核心是基于参数进行浏览器的获取或者创建
/**
 * Returns the browser that should serve `req`, either by locating an
 * already-tracked one or by launching a new one.
 *
 * Resolution order:
 * 1. Paths containing `/devtools/browser` are treated as re-connects and are
 *    matched against the WebSocket endpoints of tracked browsers.
 * 2. Paths containing `/devtools/page` whose id does not include
 *    `BLESS_PAGE_IDENTIFIER` are matched against each tracked browser's
 *    `/json/list` page listing.
 * 3. Otherwise a new browser is constructed from the route's browser class,
 *    tracked in `this.browsers`, launched, and returned.
 *
 * @param req - The incoming request; `req.parsed` supplies the path and
 *   query parameters (`blockAds`, `launch`).
 * @param router - The route being served; supplies the browser class, the
 *   default launch options and an optional `onNewPage` callback.
 * @param logger - Logger passed to the newly constructed browser.
 * @returns The located or newly launched browser.
 * @throws NotFound when a `/devtools/browser` or `/devtools/page` request
 *   does not match any tracked browser.
 * @throws BadRequest when the `launch` query parameter is not valid JSON.
 */
public getBrowserForRequest = async (
  req: Request,
  router: BrowserHTTPRoute | BrowserWebsocketRoute,
  logger: Logger,
): Promise<BrowserInstance> => {
  const { browser: Browser } = router;
  const blockAds = parseBooleanParam(
    req.parsed.searchParams,
    'blockAds',
    false,
  );
  const decodedLaunchOptions = convertIfBase64(
    req.parsed.searchParams.get('launch') || '{}',
  );
  let parsedLaunchOptions: BrowserServerOptions | CDPLaunchOptions;

  // Handle browser re-connects here
  if (req.parsed.pathname.includes('/devtools/browser')) {
    const sessions = Array.from(this.browsers);
    const id = req.parsed.pathname.split('/').pop() as string;
    const found = sessions.find(([b]) =>
      b.wsEndpoint()?.includes(req.parsed.pathname),
    );
    if (found) {
      const [browser, session] = found;
      ++session.numbConnected;
      this.log.debug(`Located browser with ID ${id}`);
      return browser;
    }
    throw new NotFound(
      `Couldn't locate browser "${id}" for request "${req.parsed.pathname}"`,
    );
  }

  // Handle page connections here
  if (req.parsed.pathname.includes('/devtools/page')) {
    const id = req.parsed.pathname.split('/').pop() as string;
    if (!id.includes(BLESS_PAGE_IDENTIFIER)) {
      const browsers = Array.from(this.browsers).map(([browser]) => browser);
      const allPages = await Promise.all(
        browsers
          .filter((b) => !!b.wsEndpoint())
          .map(async (browser) => {
            const { port } = new URL(
              browser.wsEndpoint() as unknown as string,
            );
            // A failed fetch is replaced by a stand-in "not ok" response so
            // one unreachable browser does not abort the whole lookup.
            const response = await fetch(
              `http://127.0.0.1:${port}/json/list`,
              {
                headers: {
                  Host: '127.0.0.1',
                },
              },
            ).catch(() => ({
              json: () => Promise.resolve([]),
              ok: false,
            }));
            if (response.ok) {
              const body = await response.json();
              // @ts-ignore
              return body.map((b) => ({ ...b, browser }));
            }
            // Contribute no pages for an unusable response. Returning `null`
            // here would survive `.flat()` and make the `.find` below throw
            // a TypeError on `b.id` instead of reaching the NotFound path.
            return [];
          }),
      );
      const found = allPages.flat().find((b) => b.id === id);
      if (found) {
        const session = this.browsers.get(found.browser)!;
        ++session.numbConnected;
        return found.browser;
      }
      throw new NotFound(
        `Couldn't locate browser "${id}" for request "${req.parsed.pathname}"`,
      );
    }
  }

  try {
    parsedLaunchOptions = JSON.parse(decodedLaunchOptions);
  } catch (err) {
    throw new BadRequest(
      `Error parsing launch-options: ${err}. Launch options must be a JSON or base64-encoded JSON object`,
    );
  }

  const routerOptions =
    typeof router.defaultLaunchOptions === 'function'
      ? router.defaultLaunchOptions(req)
      : router.defaultLaunchOptions;

  // Options supplied with the request override the route's defaults.
  const launchOptions = {
    ...routerOptions,
    ...parsedLaunchOptions,
  };

  const manualUserDataDir =
    launchOptions.args
      ?.find((arg) => arg.includes('--user-data-dir='))
      ?.split('=')[1] || (launchOptions as CDPLaunchOptions).userDataDir;

  // Always specify a user-data-dir since plugins can "inject" their own
  // unless it's playwright which takes care of its own data-dirs
  const userDataDir =
    manualUserDataDir ||
    (!this.playwrightBrowserNames.includes(Browser.name)
      ? await generateDataDir(undefined, this.config)
      : null);

  const proxyServerArg = launchOptions.args?.find((arg) =>
    arg.includes('--proxy-server='),
  );

  /**
   * If it is a playwright request, move the `--proxy-server=` argument out of
   * `args` and into the `proxy.server` launch option.
   */
  if (
    launchOptions.args &&
    proxyServerArg &&
    req.parsed.pathname.startsWith('/playwright')
  ) {
    (launchOptions as BrowserServerOptions).proxy = {
      server: proxyServerArg.split('=')[1],
    };
    const argIndex = launchOptions.args.indexOf(proxyServerArg);
    launchOptions.args.splice(argIndex, 1);
  }

  const browser = new Browser({
    blockAds,
    config: this.config,
    logger,
    userDataDir,
  });

  const session: BrowserlessSession = {
    id: null,
    initialConnectURL:
      path.join(req.parsed.pathname, req.parsed.search) || '',
    // The data dir counts as temporary when the caller did not supply one.
    isTempDataDir: !manualUserDataDir,
    launchOptions,
    numbConnected: 1,
    resolver: noop,
    routePath: router.path,
    startedOn: Date.now(),
    ttl: 0,
    userDataDir,
  };

  // The browser is tracked before it is launched.
  this.browsers.set(browser, session);

  // A version captured from the user-agent by `pwVersionRegex` is passed to
  // launch(); otherwise 'default' is used.
  const match = (req.headers['user-agent'] || '').match(pwVersionRegex);
  const pwVersion = match ? match[1] : 'default';

  // Launch the browser
  await browser.launch(launchOptions as object, pwVersion);

  // Run the browser hooks
  await this.hooks.browser({ browser, meta: req.parsed });

  browser.on('newPage', async (page) => {
    await this.onNewPage(req, page);
    (router.onNewPage || noop)(req.parsed || '', page);
  });

  return browser;
};
说明
browserless 对于 ws 的处理实际上就是 ws proxy;对于浏览器的管理则基于 BrowserManager,同时为了确保稳定,还基于 queue 队列实现了限速处理。以上是一个简单说明,通过此可以简单了解内部处理
参考资料
src/browsers/chrome.cdp.ts
src/browsers/chromium.cdp.ts
src/router.ts
src/browsers/index.ts
src/browserless.ts
https://docs.browserless.io/open-api#tag/Browser-WebSocket-APIs
https://github.com/berstend/puppeteer-extra/tree/master/packages/playwright-extra
https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra