还需要考虑不同异常情况
helloworld.py
import datetime
import time
import os
def doSth():
    """Launch the Tomcat startup script, then pause for one minute.

    The script path is relative, so it is resolved against the current
    working directory of the process.

    Returns:
        The status value reported by ``os.system`` (0 means the shell
        command succeeded).
    """
    print('hello')
    status = os.system('./desktop/apache-tomcat-8.5.28/bin/startup.sh')
    if status != 0:
        # os.system() never raises when the command fails; the status is
        # the only failure signal, so surface it instead of dropping it.
        print('startup.sh failed with status %s' % status)
    print('thats all')
    # NOTE(review): presumably this pause keeps the caller's minute-based
    # schedule check from matching again within the same minute - confirm.
    time.sleep(60)
    return status
def main(h=10, m=52):
    """Call doSth() every time the local wall clock reads ``h:m``.

    The clock is polled every 20 seconds and each reading is printed.
    This function loops forever and never returns normally.

    Args:
        h: Hour of the trigger time, 0-23.
        m: Minute of the trigger time, 0-59.

    Raises:
        ValueError: If ``h`` or ``m`` is missing or out of range. Such
            values can never match the clock, so without this check the
            loop would spin forever without doing any work.
    """
    try:
        in_range = 0 <= h <= 23 and 0 <= m <= 59
    except TypeError:
        # Non-numeric input such as None cannot be compared to an int.
        in_range = False
    if not in_range:
        raise ValueError(
            'h must be in 0..23 and m in 0..59, got h=%r, m=%r' % (h, m))

    while True:
        # Inner loop: wait until the current hour and minute match.
        while True:
            now = datetime.datetime.now()
            print(now)
            if now.hour == h and now.minute == m:
                break
            time.sleep(20)
        doSth()
# Script entry point: start the scheduler only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
运行时
nohup python helloworld.py &
&的意思是在后台运行。也就是说,用&启动helloworld.py之后,即使你在shell中按Ctrl+C,helloworld.py也照样运行(因为对SIGINT信号免疫)。但是要注意,如果你直接关掉shell,helloworld.py进程同样会消失,因为&对SIGHUP信号不免疫。
nohup的意思是忽略SIGHUP信号, 所以当运行nohup的时候, 关闭shell, 那么helloworld.py进程还是存在的(对SIGHUP信号免疫)。但是,要注意,如果你直接在shell中用Ctrl C, 那么, helloworld.py进程也是会消失的(因为对SIGINT信号不免疫)。
所以, &和nohup没有半毛钱的关系, 要让进程真正不受shell中Ctrl C和shell关闭的影响, 那该怎么办呢? 那就都用吧,两全其美。
nohup 和 &颇有点让helloworld.py成为守护进程的感觉。
但如果helloworld.py自身挂了怎么办呢,这个时候我们就考虑到monit
monit最大的特点是配置文件简单易读,同时支持进程和系统状态的监控,并灵活地提供了各种检测方式和检测周期,还能进行报警和响应(重启服务、执行命令等)。
安装monit:
yum -y install monit
启动monit服务:
systemctl start monit.service
下面主要看monit配置文件和监控进程hang住并处理的脚本:
配置文件storaged.conf:
# Options in [general] and [notice] are read by Protector.__init__ in
# fdfs_storaged_protector.py.
[general]
# Seconds the monitor sleeps after each successful upload probe.
time_interval = 5
# Port appended to the local IP address in the fdfs_upload_file command.
storage_port = 9092
# Client configuration file passed to fdfs_upload_file.
client_conf_file = /etc/fdfs/client.conf
# Notification switches and recipients. The script loads these values, but
# its alarm() hook is an empty placeholder, so nothing consumes them yet.
[notice]
isemail = 1
issms = 1
ishotchat = 1
noticeusers= niuliguo
### Logging configuration
# The remaining sections are consumed by logging.config.fileConfig().
[loggers]
keys=root
[handlers]
keys=file_handler
[formatters]
keys=formatter
[logger_root]
level=DEBUG
handlers=file_handler
[handler_file_handler]
class=FileHandler
level=DEBUG
formatter=formatter
# args are the FileHandler constructor arguments: log file path and open mode.
args=('/var/log/fdfs_storaged_protector.logs','a+')
[formatter_formatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
脚本fdfs_storaged_protector.py:
import os
import sys
import logging
import logging.handlers
import logging.config
import httplib
import json
import urllib
import socket
import time
import fcntl
from random import randint
from subprocess import PIPE,Popen
from signal import alarm,signal,SIGALRM,SIGKILL
from ConfigParser import ConfigParser
# Python 2 only: reloading sys makes setdefaultencoding() reachable again so
# the process-wide default string encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# Directory that contains this script; it is added to the import path once.
__rootdir = os.path.realpath(os.path.dirname(__file__))
if __rootdir not in sys.path:
    sys.path.append(__rootdir)
class Protector(object):
    """Watchdog that probes a FastDFS storage daemon with test uploads.

    A small random file is uploaded with ``fdfs_upload_file`` at a fixed
    interval. When an upload fails or times out, the monitoring loop ends
    so the caller can restart the daemon.
    """

    # File name of the probe file kept inside ``rootdir``.
    _SMALL_FILE_NAME = "smallfile"
    # Upper bound on attempts to create the probe file in one call.
    _GENERATE_ATTEMPTS = 3

    def __init__(self, conf_path, rootdir):
        """Load settings from ``conf_path`` and set up logging.

        Args:
            conf_path: Path of the INI file holding the ``general`` and
                ``notice`` sections plus the logging configuration.
            rootdir: Directory in which the probe file is created.

        Options that are missing, or a configuration file that cannot be
        read at all, fall back to the built-in defaults, so every attribute
        and ``self.logger`` always exist after construction.
        """
        self.conf_general = 'general'
        self.conf_notice = 'notice'
        self.rootdir = rootdir
        self.ipaddr = socket.gethostbyname(socket.gethostname())

        config_parser = ConfigParser()
        config_loaded = bool(config_parser.read(conf_path))

        general = self.conf_general
        notice = self.conf_notice
        self.time_interval = int(self._get_option(
            config_parser, general, 'time_interval', '30'))
        self.storage_port = int(self._get_option(
            config_parser, general, 'storage_port', '9092'))
        self.client_conf_file = self._get_option(
            config_parser, general, 'client_conf_file',
            '/etc/fdfs/client.conf')
        self.isemail = self._get_option(
            config_parser, notice, 'isemail', '1')
        self.issms = self._get_option(
            config_parser, notice, 'issms', '1')
        self.ishotchat = self._get_option(
            config_parser, notice, 'ishotchat', '1')
        self.noticeusers = self._get_option(
            config_parser, notice, 'noticeusers', 'niuliguo')

        if config_loaded:
            # fileConfig() needs the logging sections of the same file; it
            # is skipped when the file could not be read.
            logging.config.fileConfig(conf_path)
        self.logger = logging.getLogger('fdfs_storaged_protector')

    @staticmethod
    def _get_option(parser, section, option, default):
        """Return ``option`` from ``section``, or ``default`` if absent.

        ``ConfigParser.get`` has no positional default argument (the third
        positional parameter is ``raw``), so the fallback is done here.
        """
        if parser.has_option(section, option):
            return parser.get(section, option)
        return default

    def _small_file_path(self):
        """Return the full path of the probe file inside ``rootdir``."""
        return os.sep.join([self.rootdir, self._SMALL_FILE_NAME])

    def handler(self, signum, frame):
        """Signal-handler-shaped callback that raises IOError("Timeout!")."""
        raise IOError("Timeout!")

    def run(self, args, cwd=None, shell=False, kill_tree=True, timeout=-1,
            env=None):
        '''
        Run a command with an optional timeout; on timeout the process is
        killed.

        Args:
            args: Command passed to ``Popen`` (string or sequence).
            cwd: Working directory for the child, or None.
            shell: Whether ``Popen`` runs the command through the shell.
            kill_tree: On timeout, also kill the direct children of the
                command.
            timeout: Whole seconds before the command is killed; -1 (or any
                value that is None or not positive) disables the timeout.
            env: Environment mapping for the child, or None.

        Returns:
            ``(returncode, stdout, stderr)`` of the command, or
            ``(-9, '', '')`` when it was killed because of the timeout.

        The timeout relies on SIGALRM, so it only works in the main thread.
        '''
        class Alarm(Exception):
            pass

        def alarm_handler(signum, frame):
            raise Alarm

        use_timeout = timeout is not None and timeout > 0
        p = Popen(args, shell=shell, cwd=cwd, stdout=PIPE, stderr=PIPE,
                  env=env)
        previous_handler = None
        if use_timeout:
            previous_handler = signal(SIGALRM, alarm_handler)
            alarm(timeout)
        try:
            stdout, stderr = p.communicate()
        except Alarm:
            # Collect the children before killing the parent, otherwise
            # they can no longer be found through its pid.
            pids = [p.pid]
            if kill_tree:
                pids.extend(self.get_process_children(p.pid))
            for pid in pids:
                try:
                    os.kill(pid, SIGKILL)
                except OSError:
                    # The process already exited.
                    pass
            # Release the pipes and reap the child so no zombie is left.
            for stream in (p.stdout, p.stderr):
                if stream is not None:
                    stream.close()
            p.wait()
            return -9, '', ''
        finally:
            if use_timeout:
                # Cancel a still-pending alarm and put the previous SIGALRM
                # handler back so later code is not affected.
                alarm(0)
                if previous_handler is not None:
                    signal(SIGALRM, previous_handler)
        return p.returncode, stdout, stderr

    def get_process_children(self, pid):
        """Return the pids of the direct children of ``pid`` (via ``ps``)."""
        p = Popen('ps --no-headers -o pid --ppid %d' % pid, shell=True,
                  stdout=PIPE, stderr=PIPE)
        stdout, stderr = p.communicate()
        return [int(child) for child in stdout.split()]

    def generate_small_file(self):
        """Create the probe file with 80-180 KiB of random bytes.

        Returns:
            True when the file exists afterwards, False when writing failed
            or the file is still missing after a few attempts. Failures are
            logged and never raised.
        """
        path = self._small_file_path()
        for _ in range(self._GENERATE_ATTEMPTS):
            try:
                with open(path, "wb") as fout:
                    fout.write(os.urandom(1024 * randint(80, 180)))
            except IOError:
                self.logger.error('generate small file fail!')
                return False
            if os.path.exists(path):
                self.logger.info("generate small file success")
                return True
        self.logger.error('generate small file fail!')
        return False

    def upload(self):
        """Upload the probe file to the local storage daemon.

        Returns:
            The ``(returncode, stdout, stderr)`` tuple from ``run``, or
            ``(-1, "", "file missed")`` when the probe file does not exist.
        """
        path = self._small_file_path()
        if os.path.exists(path):
            cmd = 'fdfs_upload_file %s %s %s:%d > /dev/null' % (
                self.client_conf_file, path, self.ipaddr, self.storage_port)
            return self.run(cmd, shell=True, timeout=10)
        else:
            self.logger.error("file %s not exist" % path)
            return -1, "", "file missed"

    def monit(self):
        """Probe the daemon in a loop until an upload fails.

        A successful upload is followed by a sleep of ``time_interval``
        seconds. A missing probe file is regenerated and the probe retried.
        Any other failure ends the loop and returns to the caller.
        """
        while True:
            retcode, stdout, stderr = self.upload()
            if 0 == retcode:
                self.logger.info(
                    "upload file success and the fdfs process is healthy")
                time.sleep(self.time_interval)
            elif stderr == "file missed":
                self.logger.error(stderr)
                if not self.generate_small_file():
                    # Back off instead of spinning at full speed while the
                    # probe file cannot be created.
                    time.sleep(self.time_interval)
            else:
                self.logger.error(stderr)
                break

    def restart_process(self):
        '''
        1.kill the fdfs_storaged process
        2.try to restart fdfs_storaged process for three times

        The process exits with status 1 if an unexpected error occurs.
        '''
        try:
            # Step one: kill the running daemon process.
            cmd = 'ps -ef | grep \'/usr/bin/fdfs_storaged\'| grep -v grep | awk \'{print $2}\' | xargs kill -9'
            self.run(cmd, shell=True, timeout=10)
            # Step two: restart the service, giving up after three tries.
            cmd = "systemctl stop fdfs_storaged.service && systemctl start fdfs_storaged.service"
            for _ in range(3):
                retcode, stdout, stderr = self.run(
                    cmd, shell=True, timeout=10)
                if 0 == retcode:
                    self.logger.info("restart fdfs_storaged success!")
                    break
            else:
                self.logger.error("restart fdfs_storaged fail!")
        except Exception:
            self.logger.exception("restart fdfs_storaged fail!")
            sys.exit(1)

    def alarm(self):
        '''
        insert your own alarm code
        '''

    def restart_self(self):
        """Relaunch the protector by running ``sh run.sh``.

        NOTE(review): ``run.sh`` is resolved against the current working
        directory, not ``rootdir`` - confirm that is intended.
        """
        cmd = 'sh run.sh'
        retcode, stdout, stderr = self.run(cmd, shell=True, timeout=10)
        if 0 == retcode:
            self.logger.info(
                "restart fdfs_storaged_protector process success")
        else:
            self.logger.error(
                "restart fdfs_storaged_protector process fail")
if __name__ == '__main__':
    # Entry point: monitor the storage daemon and, once monitoring stops,
    # restart the daemon, raise the alarm and relaunch this protector.
    conf_path = os.sep.join([__rootdir, "storaged.conf"])
    protector = Protector(conf_path, __rootdir)
    try:
        protector.generate_small_file()
        # monit() only returns after an upload probe has failed.
        protector.monit()
    except (KeyboardInterrupt, IOError):
        # Must be listed before the generic handler: IOError is a subclass
        # of Exception and would otherwise never reach its own clause.
        sys.exit(1)
    except Exception:
        # Any other error is treated like a failed probe: record it and
        # fall through to the recovery steps below.
        logging.exception("fdfs_storaged monitoring stopped unexpectedly")
    protector.restart_process()
    protector.alarm()
    protector.restart_self()
出处:https://blog.csdn.net/lavorange/article/details/51721330
牛哥小tips:1、ps aux | grep xxx 中的 | 是管道(pipe)的意思,表述要专业
2、hang住是进程阻塞的意思,=block
3、写方法传参数进来时要判断参数是否为空,不然可能会全崩