产生背景: 在 k8s 集群上安装 SQLFlow 后, 每隔一段时间 sqlflow-server、sqlflow-jupyter、argo-server 三个进程就会自行挂掉。(SQLFlow 官网也提到了该问题: 这是 Kubernetes 的内置逻辑所致, 重新运行端口映射命令即可恢复。)
解决方式: 使用 shell 脚本循环检查, 每隔 60 秒确认这三个进程是否仍在运行; 若已挂掉则重新启动, 否则输出 "<进程名> process already started!"。
[root@dooo ~]# cd /sqlflow/shell/
[root@dooo shell]# cat restart.sh
#!/bin/sh
# Watchdog for three SQLFlow-related processes.
#
# In an endless loop, searches the `ps -ef` output for command lines containing
# "deployment/sqlflow-server", "deployment/sqlflow-jupyter" and
# "deployment/argo-server". When one of them is missing, the corresponding
# script located next to this file is run; otherwise a status line is printed.
# The loop then sleeps 60 seconds and repeats.
#
# Usage: ./restart.sh   (runs until interrupted)

# The scripts below are referenced by relative path, so pin the working
# directory to the directory holding this script; otherwise launching the
# watchdog from anywhere else would fail to find them on every cycle.
# CDPATH is cleared for this one command so a relative directory name cannot
# be resolved against an unrelated search path.
CDPATH='' cd -- "$(dirname -- "$0")" || {
  echo "restart.sh: cannot change to the script directory" >&2
  exit 1
}

#######################################
# Run a start script when no matching process is found.
# Arguments:
#   $1 - label used in the status messages
#   $2 - text searched for in the `ps -ef` output
#   $3 - script to run when no match is found
# Outputs:
#   Matching `ps -ef` lines and one status line on stdout; a failure
#   message on stderr when the script in $3 exits non-zero.
#######################################
ensure_running() {
  # `grep -v "grep"` drops the searching grep itself from the listing; the
  # pipeline exits non-zero when no line is left, i.e. nothing matched.
  if ps -ef | grep -- "$2" | grep -v "grep"; then
    echo "$1 process already started!"
  elif "$3"; then
    echo "$1 process has been restarted!"
  else
    # Inside this branch $? still holds the exit status of "$3".
    echo "$1 restart failed: $3 exited with status $?" >&2
  fi
}

while true; do
  ensure_running "sqlflow-server" "deployment/sqlflow-server" ./sqlflow-server.sh
  ensure_running "sqlflow-jupyter" "deployment/sqlflow-jupyter" ./sqlflow-jupyter-mysql.sh
  ensure_running "argo-server" "deployment/argo-server" ./argo-server.sh
  sleep 60
done
说明: 当 ps -ef | grep 的过滤结果中找不到匹配项时, 管道的退出码 $? 大于 0, 脚本据此判定对应进程已挂掉并执行重启。