sockjs-client 使用websocket方式传输发生消息丢失
问题背景
使用sockjs发送大批量信息时,在服务端发现大量数据出现丢失,在客户端发送60w行消息时,平均服务端只能收到8w行,消息的丢失率达到86%,以下是该问题的分析流程
本地复现
创建一个简单的client与server,制造一个内容较多的文件large.log,执行cat large.log将输出的可读流,通过client发送到服务端,并查看消息的丢失情况.
制造大批量消息
generate-large-log.js
let fs = require('fs');
let os = require('os');

let fWriteName = './large.log';
let fWrite = fs.createWriteStream(fWriteName);

// One ~160-byte test line; the text must stay stable so the transfer can be
// verified line-by-line on the server side.
const line = row =>
  `This is line ${row}. The purpose of writing this is to test whether the large log file can be transferred to the server normally. Okay, now the ${row} line is over.`

const TOTAL_LINES = 600000;
let i = 0;

// FIX: the original loop ignored fWrite.write()'s return value, buffering the
// whole ~100MB file in memory (the very backpressure problem this article is
// about), and never called end(). Pause on `false`, resume on 'drain', and
// close the stream when done.
function writeLines() {
  while (i < TOTAL_LINES) {
    const ok = fWrite.write(line(i) + os.EOL);
    i++;
    if (!ok) {
      fWrite.once('drain', writeLines);
      return;
    }
  }
  fWrite.end();
}
writeLines();
在命令行执行:
node generate-large-log.js
客户端代码
client.js
// Reproduction client: streams large.log over SockJS as fast as the read
// stream emits chunks, deliberately without any backpressure handling.
const SockJS = require('sockjs-client')
const fs = require('fs')
const path = require('path')
const fileName = path.resolve(__dirname, 'large.log');
let sock = new SockJS('http://127.0.0.1:9999/echo', null, {
transports: 'websocket'
});
// Running sequence number stamped into every sent chunk.
let no = 0;
let readStream;
sock.onopen = function () {
console.log('open');
readStream = fs.createReadStream(fileName)
readStream.on('data', (chunk) => {
console.log('send msg----------------')
// NOTE(review): sock.send()'s result is ignored and the read stream is
// never paused -- this is exactly what triggers the message loss under test.
sock.send(`no:${no++},send:${new Date()},${chunk.toString()}`)
})
// listen for the end of the read stream
readStream.on('end', () => {
sock.send('copy done');
console.log(`send done:${no}`);
})
};
sock.onmessage = function (e) {
// console.log('message', e.data);
};
sock.onclose = function () {
console.log(`close: ${no}`);
process.exit()
};
服务端代码
server.js
'use strict';
const http = require('http');
const express = require('express');
const sockjs = require('sockjs');

// Last message received (shared so the periodic logger can sample progress).
var lastMsg = "";

const sockjs_opts = {
  prefix: '/echo',
  transports: [ 'websocket', 'websocket-raw' ]
};

const sockjs_echo = sockjs.createServer(sockjs_opts);

sockjs_echo.on('connection', (conn) => {
  console.log('connected');
  var firstData = true;
  // FIX: `interval` was a module-level variable, so a second connection
  // overwrote the first connection's handle and leaked its timer. Keep the
  // handle scoped to this connection instead.
  var interval = setInterval(() => {
    // Sample head and tail of the latest message once per second.
    console.log(`last Msg:${lastMsg.substring(0, 300)}.........${lastMsg.substring(lastMsg.length - 300, lastMsg.length)}`)
  }, 1000);
  conn.on('data', (msg) => {
    if (firstData) {
      console.log('msg:', msg);
      firstData = false;
    }
    lastMsg = `received:${new Date()},len:${msg.length},${msg}`;
    conn.write(msg); // echo the message back to the client
  });
  conn.on('close', () => {
    console.log('lastMsg:', lastMsg)
    if (interval) clearInterval(interval)
    console.log('connect close')
  })
});

const app = express();
app.get('/', (req, res) => {
});
const server = http.createServer(app);
sockjs_echo.installHandlers(server);
server.listen(9999, '0.0.0.0', () => {
  console.log(' [*] Listening on 0.0.0.0:9999');
});
为了可以分析client发出的数据包是否被服务接收到,需要进行抓包分析,为了方便抓包,我们将server.js部署到容器中,对应的文件如下:
# Image for the sockjs echo server; node_modules is copied in pre-installed
# instead of running `npm install` during the build.
FROM node:latest
COPY node_modules /code/node_modules
COPY package.json /code/
COPY server.js /code/
# World-readable/executable so the unprivileged `node` user can run the app.
RUN chmod -R 0777 /code
USER node
CMD node /code/server.js
在命令行执行
docker build -f server.dockerfile -t sockjs-server:v1 .
docker-compose.yaml
# Compose definition for the sockjs echo server container.
services:
  sockjs-server:
    image: sockjs-server:v1
    container_name: sockjs-server
    ports:
      - '9999:9999'
    environment:
      # Verbose debug logging from every module (express, sockjs, ...).
      - DEBUG=*
    deploy:
      resources:
        limits:
          # Deliberately tight CPU limit so the server falls behind the
          # client and receive-window backpressure builds up.
          cpus: '0.1'
          memory: 2G
    pull_policy: never
    extra_hosts:
      - "host.docker.internal:host-gateway"
在命令行执行
docker-compose up -d
执行docker logs -f sockjs-server
查看日志
Thu, 02 Nov 2023 02:49:26 GMT express:application set "x-powered-by" to true
Thu, 02 Nov 2023 02:49:26 GMT express:application set "etag" to 'weak'
Thu, 02 Nov 2023 02:49:26 GMT express:application set "etag fn" to [Function: generateETag]
Thu, 02 Nov 2023 02:49:26 GMT express:application set "env" to 'development'
Thu, 02 Nov 2023 02:49:26 GMT express:application set "query parser" to 'extended'
Thu, 02 Nov 2023 02:49:26 GMT express:application set "query parser fn" to [Function: parseExtendedQueryString]
Thu, 02 Nov 2023 02:49:26 GMT express:application set "subdomain offset" to 2
Thu, 02 Nov 2023 02:49:26 GMT express:application set "trust proxy" to false
Thu, 02 Nov 2023 02:49:26 GMT express:application set "trust proxy fn" to [Function: trustNone]
Thu, 02 Nov 2023 02:49:26 GMT express:application booting in development mode
Thu, 02 Nov 2023 02:49:26 GMT express:application set "view" to [Function: View]
Thu, 02 Nov 2023 02:49:26 GMT express:application set "views" to '/views'
Thu, 02 Nov 2023 02:49:26 GMT express:application set "jsonp callback name" to 'callback'
Thu, 02 Nov 2023 02:49:26 GMT express:router use '/' query
Thu, 02 Nov 2023 02:49:26 GMT express:router:layer new '/'
Thu, 02 Nov 2023 02:49:26 GMT express:router use '/' expressInit
Thu, 02 Nov 2023 02:49:26 GMT express:router:layer new '/'
Thu, 02 Nov 2023 02:49:26 GMT express:router:route new '/'
Thu, 02 Nov 2023 02:49:26 GMT express:router:layer new '/'
Thu, 02 Nov 2023 02:49:26 GMT express:router:route get '/'
Thu, 02 Nov 2023 02:49:26 GMT express:router:layer new '/'
SockJS v0.3.24 bound to "/echo"
[*] Listening on 0.0.0.0:9999
复现过程
根据上面的流程服务端已经打好镜像并且启动,现在执行node client.js
查看对应日志:
open
send msg----------------
send msg----------------
send msg----------------
send msg----------------
......
......
......
send msg----------------
send msg----------------
send msg----------------
send msg----------------
send msg----------------
send msg----------------
send msg----------------
send msg----------------
send msg----------------
send done:1508
server端日志:
从server端可以发现期望接收到599999行消息,但server只接收到7w行,到此sockjs发送消息的问题已在本地成功复现,下面开始对出现消息丢失的原因进行分析.
问题分析
首先使用tcpdump对sockjs-server容器的网卡进行抓包.
获取容器网卡
# Resolve the container's init PID, then list the network interfaces inside
# its network namespace. The Go template is quoted because unquoted `{{ }}`
# is mangled by some shells (e.g. zsh brace handling); $PID is quoted to be
# safe under word splitting.
PID=$(docker inspect sockjs-server --format '{{.State.Pid}}')
sudo nsenter -n -t "$PID" ip addr
产生日志如下:
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
31: eth0@if32: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default
link/ether xx:xx:xx:xx:xx:xx brd ff:ff:ff:ff:ff:ff link-netnsid 0
inet 172.24.0.2/16 brd 172.24.255.255 scope global eth0
valid_lft forever preferred_lft forever
执行 ip addr
找到排在第32位置的网卡,粘贴对应的网卡名称,比如veth894b508
抓包分析
sudo tcpdump -i veth894b508 -w sockjs-server-log.pcap
再次执行client.js,中止tcpdump命令,将生成的pcap文件导入wireshark进行分析从右侧栏目中可以看到,sockjs-server的网卡接收到了大量的TCP ZeroWindow
,这说明服务端的接收窗口已经满了,这时候服务端就会告诉客户端"求求你不要再发了,处理不过来了",客户端接收到服务端win=0的信息后,就会暂停发送,直到服务端发送窗口更新的信息给客户端,这时客户端就会继续发送待发送的tcp包.从wireshark的Statistics/Flow Graph可以更清晰的了解双方发送的信息.如下图所示
在传输列表栏,找到最后一个协议为websocket的包,单击查看详细信息,在WebSocket的Payload中可以看到解码后最后一次发往server端的消息.最后一次发送到了第10w行的消息.在此之后就再无消息被捕捉到了.
到此为止,问题没有减少反而越来越多,websocket底层不是使用的tcp吗?既然是tcp为什么会出现只有一部分消息被发送给服务端?剩下的大部分消息去哪了?tcp到底靠不靠谱??
我们现在梳理一下,通过抓包分析我们可以知道,大部分的丢失的消息并没有经过容器的网卡,也就是丢包这个动作,只可能发生在client端的应用程序到server的传输层之间.下面我缩小一下范围,首先我们分析client端的传输层到server端的传输层,有没有出现大量丢数据的行为.关于排查的方法与过程,可以参考下面的两篇文章,这里就不详细展开.
[1] 【修正版】动图图解!代码执行send成功后,数据就发出去了吗?
通过上面的方法排查发现大部分在client端的传输层到server端的传输层没有发生大量数据丢失的事件.所以基本可以确定消息的丢失发生在应用程序到client传输层.下面我们从代码层面开始分析.
代码分析
我们回到本地复现中的client.js代码中,下面我们分析一下在sock.send之后到底发生了啥?
node_modules/sockjs-client/lib/main.js:155
// Quoted from sockjs-client main.js: validates socket state, stringifies the
// payload, then delegates to the active transport.
SockJS.prototype.send = function (data) {
// #13 - convert anything non-string to string
// TODO this currently turns objects into [object Object]
if (typeof data !== 'string') {
data = '' + data;
}
if (this.readyState === SockJS.CONNECTING) {
throw new Error('InvalidStateError: The connection has not been established yet');
}
if (this.readyState !== SockJS.OPEN) {
return;
}
// NOTE(review): the transport's return value is discarded here, so callers
// get no backpressure signal -- central to the loss analyzed in this article.
this._transport.send(escape.quote(data));
};
send函数对消息的类型,信道的状态进行了判断处理,然后调用this._transport.send(escape.quote(data))
下面我们看一下_transport
到底是啥在哪定义的.我们在文件内进行搜索在_connect下找到了对应的赋值语句.
// Quoted from sockjs-client main.js: tries the remaining transports in order
// (the list was filtered in _receiveInfo) and stores the first usable one as
// this._transport.
SockJS.prototype._connect = function () {
for (var Transport = this._transports.shift(); Transport; Transport = this._transports.shift()) {
debug('attempt', Transport.transportName);
if (Transport.needBody) {
if (!global.document.body ||
(typeof global.document.readyState !== 'undefined' &&
global.document.readyState !== 'complete' &&
global.document.readyState !== 'interactive')) {
debug('waiting for body');
// Transport requires a DOM body; re-queue it and retry after page load.
this._transports.unshift(Transport);
eventUtils.attachEvent('load', this._connect.bind(this));
return;
}
}
// calculate timeout based on RTO and round trips. Default to 5s
var timeoutMs = Math.max(this._timeout, (this._rto * Transport.roundTrips) || 5000);
this._transportTimeoutId = setTimeout(this._transportTimeout.bind(this), timeoutMs);
debug('using timeout', timeoutMs);
var transportUrl = urlUtils.addPath(this._transUrl, '/' + this._server + '/' + this._generateSessionId());
var options = this._transportOptions[Transport.transportName];
debug('transport url', transportUrl);
var transportObj = new Transport(transportUrl, this._transUrl, options);
transportObj.on('message', this._transportMessage.bind(this));
transportObj.once('close', this._transportClose.bind(this));
transportObj.transportName = Transport.transportName;
this._transport = transportObj;
return;
}
// Every candidate transport failed; give up on the connection.
this._close(2000, 'All transports failed', false);
};
this._transport 与 transportObj有关,transportObj与Transport有关,
Transport 在此被赋值
Transport = this._transports.shift()
_transports在_receiveInfo函数被赋值
var enabledTransports = transports.filterToEnabled(this._transportsWhitelist, info);
this._transports = enabledTransports.main;
transport在module.exports处被赋值
// Quoted from sockjs-client main.js: factory that binds the SockJS
// constructor to a concrete transport list.
module.exports = function (availableTransports) {
transports = transport(availableTransports);
require('./iframe-bootstrap')(SockJS, availableTransports);
return SockJS;
};
node_modules/sockjs-client/lib/utils/transport.js:8
// Quoted from sockjs-client utils/transport.js: splits availableTransports
// into main/facade lists, honouring the user whitelist and the server's
// capability info.
module.exports = function(availableTransports) {
return {
filterToEnabled: function(transportsWhitelist, info) {
var transports = {
main: []
, facade: []
};
// Normalise the whitelist: absent -> [], single string -> [string].
if (!transportsWhitelist) {
transportsWhitelist = [];
} else if (typeof transportsWhitelist === 'string') {
transportsWhitelist = [transportsWhitelist];
}
availableTransports.forEach(function(trans) {
if (!trans) {
return;
}
// Server explicitly reported that it does not support websocket.
if (trans.transportName === 'websocket' && info.websocket === false) {
debug('disabled from server', 'websocket');
return;
}
if (transportsWhitelist.length &&
transportsWhitelist.indexOf(trans.transportName) === -1) {
debug('not in whitelist', trans.transportName);
return;
}
if (trans.enabled(info)) {
debug('enabled', trans.transportName);
transports.main.push(trans);
if (trans.facadeTransport) {
transports.facade.push(trans.facadeTransport);
}
} else {
debug('disabled', trans.transportName);
}
});
return transports;
}
};
};
这里的availableTransports
是从main.js的export中传过来的,main.js中的module.exports = function (availableTransports)
是从他的调用者传过来的.
node_modules/sockjs-client/lib/entry.js
'use strict';
var transportList = require('./transport-list');
// Entry point: bind SockJS to the full default transport list.
module.exports = require('./main')(transportList);
// TODO can't get rid of this until all servers do
if ('_sockjs_onload' in global) {
setTimeout(global._sockjs_onload, 1);
}
node_modules/sockjs-client/lib/transport-list.js
'use strict';
module.exports = [
// streaming transports
require('./transport/websocket')
, require('./transport/xhr-streaming')
, require('./transport/xdr-streaming')
, require('./transport/eventsource')
, require('./transport/lib/iframe-wrap')(require('./transport/eventsource'))
// polling transports
, require('./transport/htmlfile')
, require('./transport/lib/iframe-wrap')(require('./transport/htmlfile'))
, require('./transport/xhr-polling')
, require('./transport/xdr-polling')
, require('./transport/lib/iframe-wrap')(require('./transport/xhr-polling'))
, require('./transport/jsonp-polling')
];
至此我们找到了transport.js中availableTransports的源头,在这里,我们使用的是websocket,
所以在transport.js中的transports.main.push(trans);
其中的trans就是transport-list.js中的require('./transport/websocket')
,所以main.js中的this._transport
就是transportObj
就是Transport
就是this._transports = enabledTransports.main;
就是require('./transport/websocket')
所以我们进入其中查看他的send方法实现
node_modules/sockjs-client/lib/transport/websocket.js主要代码如下:
var utils = require('../utils/event')
, urlUtils = require('../utils/url')
, inherits = require('inherits')
, EventEmitter = require('events').EventEmitter
, WebsocketDriver = require('./driver/websocket')
;
...
...
...
this.ws = new WebsocketDriver(this.url, [], options);
...
...
...
// Quoted from sockjs-client transport/websocket.js: wraps the payload in a
// JSON-array frame and forwards it to the faye-websocket driver.
// NOTE(review): the boolean returned by this.ws.send() -- faye-websocket's
// backpressure signal -- is dropped here; this is the root cause of the loss.
WebSocketTransport.prototype.send = function (data) {
var msg = '[' + data + ']';
debug('send', msg);
this.ws.send(msg);
};
node_modules/sockjs-client/lib/transport/driver/websocket.js
module.exports = require('faye-websocket').Client;
继续往下找,黎明就在眼前
node_modules/faye-websocket/lib/faye/websocket/client.js
'use strict';
var util = require('util'),
net = require('net'),
tls = require('tls'),
url = require('url'),
driver = require('websocket-driver'),
API = require('./api'),
Event = require('./api/event');
...
...
...
var Client = function(_url, protocols, options) {
options = options || {};
this.url = _url;
this._driver = driver.client(this.url, { maxLength: options.maxLength, protocols: protocols });
...
...
...
API.call(this, options);
};
util.inherits(Client, API);
...
...
module.exports = Client;
client.js没有send方法,但是他继承了./api,我们去api下去找找看,果然发现了send方法的定义语句.
node_modules/faye-websocket/lib/faye/websocket/api.js
// Quoted from faye-websocket api.js: returns false once the socket is past
// OPEN, otherwise forwards the Messages stream's write() result, where false
// means "back off".
send: function(data) {
if (this.readyState > API.OPEN) return false;
if (!(data instanceof Buffer)) data = String(data);
return this._driver.messages.write(data);
},
而this._driver.messages
定义在 node_modules/websocket-driver/lib/websocket/streams.js中
// Quoted from websocket-driver streams.js: stream facade between the
// user-facing API and the protocol driver; _paused mirrors whether the
// consumer has asked the source to back off.
var Messages = function (driver) {
this.readable = this.writable = true;
this._paused = false;
this._driver = driver;
};
util.inherits(Messages, Stream);
// The Messages pause() and resume() methods will be called when the app that's
// processing the messages gets backed up and drains. If we're emitting
// messages too fast we should tell the source to slow down. Message output [2]
// comes from IO input [1].
Messages.prototype.pause = function () {
this._driver.io._paused = true;
};
Messages.prototype.resume = function () {
this._driver.io._paused = false;
this._driver.io.emit('drain');
};
// When we receive messages from the user, send them to the formatter and tell
// the source whether to back off.
Messages.prototype.write = function (message) {
// NOTE(review): a message arriving while writable is false is silently
// dropped -- callers that ignore the return value never see the loss.
if (!this.writable) return false;
if (typeof message === 'string') this._driver.text(message);
else this._driver.binary(message);
// false here is the backpressure signal: the driver's IO side is paused.
return !this._paused;
};
呕吼,重点来了,messages继承于node的stream流包,并且定义了pause,resume方法,维护了drain事件的触发,实现了write方法并且返回值为`!this._paused`
,这摆明了是用来解决背压问题的呀!
node中的背压问题
什么是背压,我们假设这样一个场景,我们构建了一个可读流向一个可写流中不停的写数据.假设可读流数据产生的速度大于可写流处理数据的速度,这样数据就会在可写流的buffer中慢慢累积,如果不进行处理,按照node官网的说法会产生三个后果,
[1] 进程处理速度减慢
[2] gc满负荷工作
[3] 内存耗尽
那如何解决背压问题呢?自然而然的可以想到,既然背压问题的出现是因为上下游的速度不一致导致的,那我们可以通过控制上游的消息产生速度使其与下游处理速度一致,就可以了.这是没问题的,现实中也是这么解决的.我们可以通过可写流的write的返回值来向调用者也就是上游反馈下游的状态,在可写流数据缓冲区超过 highWaterMark 或写队列当前忙碌的情况下。write() 将返回 false.当write()返回false时,我们将暂停传入的 Readable 流发送任何数据,并等待消费者再次准备好。一旦数据缓冲区被清空,就会发出"drain"事件并恢复传入的数据流.
为了更好的理解背压问题,下图是node官网中一张图,描述了一个可读流被管道输入到一个可写流的生命周期:
幸运的是只要你不是自定义流,node都会自动帮你处理上述的流程.但很显然,我们处于不幸的那种.
我们创建了一个可读流,每次发生data事件都会调用sock.send方法发送消息,sockjs-client经过多层的处理与判断最终调用了faye-websocket的send方法,值得一提的是,faye-websocket支持pipe方式传输消息,内部维护了drain事件与send的返回值用于处理背压问题,但是sockjs-client的node_modules/sockjs-client/lib/transport/websocket.js的send方法并没有处理来自faye-websocket的返回值,也没有处理faye-websocket产生的drain事件,导致faye-websocket不符合写缓冲条件,正在处理缓冲区时,sockjs-client的又一个send又过来了,最终在websocket-driver进行writable判断时,由于不符合条件该消息被直接丢弃.但sockjs-client对此并不关心,继续尽心尽责的向faye-websocket传递数据,然后数据被继续抛弃,于是惨剧发生,大部分的数据都被丢失,但程序没有报任何错误.sockjs-client十分开心以为自己完美完成了任务,但其实大部分消息没有到达传输层就已被丢弃.
解决方案
// Fixed client: pauses the read stream while the websocket driver is paused
// and resumes it on the driver's 'drain' event.
const SockJS = require('sockjs-client')
const fs = require('fs')
const path = require('path')

const fileName = path.resolve(__dirname, 'large.log');

let sock = new SockJS('http://127.0.0.1:9999/echo', null, {
  transports: 'websocket'
});

let no = 0;
let readStream;

sock.onopen = function () {
  console.log('open');
  readStream = fs.createReadStream(fileName)
  readStream.on('data', (chunk) => {
    // FIX: forward the chunk BEFORE checking the pause flag. Messages.write()
    // still buffers while paused, so nothing is lost here; the previous
    // version returned early when paused and silently dropped the chunk it
    // had just received.
    sock.send(`no:${no++},send:${new Date()},${chunk.toString()}`)
    if (sock._transport.ws._driver.messages._paused) {
      console.log('stream pause')
      readStream.pause();
      return;
    }
    console.log('continue');
  })
  // listen for the end of the read stream
  readStream.on('end', () => {
    sock.send('copy done');
    console.log(`send done:${no}`);
  })
  // Resume reading once the websocket driver has drained its buffer.
  sock._transport.ws._driver.messages.on('drain', () => {
    readStream.resume();
    console.log('stream resume');
  })
};

sock.onmessage = function (e) {
  // console.log('message', e.data);
};

sock.onclose = function () {
  console.log(`close: ${no}`);
  process.exit()
};
你也可以使用pipe方法,但是前提是你需要实现一个自定义的可以反馈流状态的可写流,为了让代码更加简洁我们希望,sockjs内部可以将drain事件与发送结果尽可能的向上反馈.所以需要稍微修改一下sockjs-client的代码.
node_modules/sockjs-client/lib/transport/websocket.js
function WebSocketTransport(transUrl, ignore, options) {
...
...
...
this.ws.on('drain',()=>self.emit('drain'))
}
// Patched transport send(): now propagates faye-websocket's boolean result
// (false == back off) up to the caller instead of discarding it.
WebSocketTransport.prototype.send = function (data) {
var msg = '[' + data + ']';
debug('send', msg);
return this.ws.send(msg);
};
node_modules/sockjs-client/lib/main.js
// Patched main.js send(): forwards the transport's return value so the
// application can detect backpressure from sock.send()'s result.
SockJS.prototype.send = function (data) {
// #13 - convert anything non-string to string
// TODO this currently turns objects into [object Object]
if (typeof data !== 'string') {
data = '' + data;
}
if (this.readyState === SockJS.CONNECTING) {
throw new Error('InvalidStateError: The connection has not been established yet');
}
if (this.readyState !== SockJS.OPEN) {
return;
}
return this._transport.send(escape.quote(data));
};
下面我们开始修改client.js与我们的自定义可写流ExecaStream
execa-stream.js
var { Writable } = require('node:stream')
  , util = require('node:util')

/**
 * Writable-stream adapter that forwards piped chunks to a SockJS socket and
 * relays the transport's 'drain' events so pipe() backpressure works.
 */
function ExecaStream(sock) {
  Writable.call(this);
  this.sock = sock;
  // FIX: register the transport 'drain' relay ONCE here. The original added
  // a fresh listener on every write() call, leaking listeners and emitting
  // multiple 'drain' events per actual drain.
  this.sock._transport.on('drain', () => {
    this.emit('drain');
  });
  this.on('drain', () => {
    console.log('read stream resume')
  })
  this.sock.onclose = () => console.log('close')
}
// FIX: set up the prototype chain at module level; the original called
// util.inherits() inside the constructor, i.e. on every instantiation.
util.inherits(ExecaStream, Writable);

// FIX: Writable's internal contract is _write(chunk, encoding, callback);
// the original omitted `encoding`, so `callback` was bound to the encoding
// argument. (Unreachable in practice because write() below is overridden,
// but the signature must still be correct.)
ExecaStream.prototype._write = function (chunk, encoding, callback) {
  this.write(chunk);
  callback();
}

// Override write() so pipe() sees sock.send()'s boolean result and pauses
// the source when the underlying websocket driver reports backpressure.
ExecaStream.prototype.write = function (chunk) {
  console.log('message');
  if (Buffer.isBuffer(chunk)) chunk = chunk.toString();
  return this.sock.send(chunk);
}

module.exports = ExecaStream
execute.sh
#!/bin/sh
# Stream large.log to stdout, bracketed by timestamps so the transfer
# duration can be measured from the client log.
# FIX: the shebang was missing; execa executes this file directly, which
# fails with ENOEXEC for a script that has no interpreter line.
echo "start time is $(date "+%D %T")"
cat ./large.log
echo "end time is $(date "+%D %T")"
client.js
// Final client: spawn execute.sh via execa (third-party) and pipe its
// combined output through the backpressure-aware ExecaStream into SockJS.
import SockJS from 'sockjs-client'
import { join } from "node:path";
import { execa } from 'execa';
import ExecaStream from './execa-stream.js'
const baseDir = './'
let sock = new SockJS('http://127.0.0.1:9999/echo', null, {
transports: 'websocket'
});
const sockStream = new ExecaStream(sock);
// buffer:false keeps execa from accumulating output in memory; all:true
// merges stdout and stderr into proc.all.
const proc = execa(
join(baseDir, 'execute.sh'),
{
all: true,
buffer: false,
stripFinalNewline: false,
env: {}
})
sock.onopen = () => {
// Start piping only once the socket is open; pipe() honours
// ExecaStream.write()'s return value and pauses when it reports false.
proc.all.pipe(sockStream);
}
再次执行日志如下:
client.log
server.log
至此,我们找到并解决了在使用sockjs-client过程中产生的丢消息问题,并进行了改进,改进后client实现了流量控制,服务端也完整的收到了实验用的60w行消息.
参考博客:
[3] 网卡收发包系统结构收发包流程
[4] 【修正版】动图图解!代码执行send成功后,数据就发出去了吗?
后记:十分感谢上面几位大佬写的博客,没有这些博客凭借我这个小白的水平真的解决不了这个问题,关于sockjs-client丢数据的问题,也是困扰了我整整一周,从抓包到看源码各种找资料,也是整的我很崩溃,我在网上包括google,以及知名的开源社区,sockjs-client的issue都没有搜到类似的丢消息问题,我在使用sockjs的过程中当数据量与发送频度较低时,丢消息的情况不会发生,一旦数据量过大,丢数据的情况基本上100%会出现,所以直到现在为止,我还是相信sockjs-client应该是有方法去解决背压问题的,只是我这个小白使用的方式不太对.所以如果这篇文章有幸可以被各位大佬看到,不吝赐教.再次感谢看到最后.