# 采用csv文件保存脚本,程序启动时就读取指定的csv文件,然后根据脚本对控制台输入的文本数据进行匹配,并输出结果或者执行python语句。
# -*- coding: utf-8 -*-
import csv,sys,re,string
###################################################################################################
# Global variable definitions
###################################################################################################
# All currently loaded rules. One rule is a dict: {name, sub_rules, script, result}
# sub_rules layout: [[op, match_content], ...]
# result layout: [[op, result_content], ...]
global_rules = []
# All content lines that were read from the input
global_contents = []
# Regular expressions taken from the configuration file
# Each entry is ["code", "regular expression"]
global_config_regex = []
# One-element list holding all configured codes joined with "|"
global_config_regex_str = [""]
# Run-time option: whether duplicated results are removed, i.e. when several
# output lines are exactly identical they are emitted only once
sys_args = {"single_result":True}
# Texts that were already printed (used to drop duplicated output)
arg_allow_same_help = []
# [script, match data] pairs that were already executed (used to drop duplicates)
arg_allow_same_script_help = []
###################################################################################################
# Symbol definitions
###################################################################################################
# Terminator that marks the end of the text input
END_INPUT_FLAG = "[---END---]"
# Placeholder symbols used inside rules and results
MATCHED_LINE_FLAG = "__MATCHED_LINE"
LINE_ANY_STR = "__LINE_ANY_STR"
ANY_NUM_LINE = "__ANY_NUM_LINE-"
# NOTE(review): same value as ANY_NUM_LINE and not referenced in the visible code
ANY_NUM_LINE2 = "__ANY_NUM_LINE-"
# Keywords that select the matching method of a rule
RULE_KEY_INCLUDE = "include"
RULE_KEY_INCLUDEOR = "include-or"
RULE_KEY_EQUAL = "equal"
RULE_KEY_PATTERN = "pattern"
###################################################################################################
#函数定义
###################################################################################################
###################################################################################################
def main():
    """Program entry point.

    Usage: ``smartscript script_file [-s]``.

    Loads the regular-expression configuration and the rule script, reads the
    text to analyse from standard input and finally runs every rule against
    it.  When exactly two arguments are given and the second one is ``-s``,
    ``sys_args["single_result"]`` is switched to ``False``.
    """
    argv = sys.argv
    if len(argv) < 2:
        # No script file on the command line: show the usage line and stop.
        print("smartscript script_file")
        return

    if len(argv) == 3 and argv[2] == "-s":
        sys_args["single_result"] = False
    script_file = argv[1]

    # The regular-expression configuration must be loaded before the rules,
    # because rule texts are expanded with it while they are read.
    read_regex(global_config_regex, global_config_regex_str)
    if not read_rules(script_file):
        return
    if not read_content():
        return
    do_parse(global_rules, global_contents)
def do_parse(rules_all, contents_all):
    """Run every rule of *rules_all* against the complete list of content lines."""
    for current_rule in rules_all:
        # Each rule is matched against all of the text on its own.
        single_rule_match(current_rule, contents_all)
###################################################################################################
def single_rule_match(rule, contents_all):
    """Match one rule against all content lines and emit its results on success.

    The matching method is chosen by the operation of the rule's first
    sub-rule: ``include`` uses ``do_include_match``; ``equal`` and ``pattern``
    both use ``do_equal_match``.  ``include-or`` is recognised but does
    nothing, and unknown operations are ignored.
    """
    rule_kind = rule["sub_rules"][0][0]

    if rule_kind == RULE_KEY_INCLUDE:
        matcher = do_include_match
    elif rule_kind in (RULE_KEY_EQUAL, RULE_KEY_PATTERN):
        matcher = do_equal_match
    else:
        # Covers include-or (no matcher is called for it) and unknown kinds.
        return

    arg_cache0 = []  # argument store 0: input arguments
    arg_cache1 = []  # argument store 1: results of the match
    matcher(rule, contents_all, arg_cache0, arg_cache1)

    # Only a successful match leaves entries in the result store.
    if arg_cache1:
        do_result1(rule, arg_cache1)
###################################################################################################
def do_result1(rule, arg_cache1):
    """Emit the results of *rule* for every successful match in *arg_cache1*.

    Running scripts and printing results live in the same function so that
    the values captured during matching are visible to the executed scripts
    through the local names ``_VALUE_`` and ``_MATCHED_LINE_``.

    Args:
        rule: rule dict; ``rule["result"]`` is a list of ``[op, content]``
            pairs where ``op`` is ``"print"`` or ``"script"``.
        arg_cache1: list of ``{"matched_line": [...], "values": {...}}``
            dicts, one per successful match.

    When ``sys_args["single_result"]`` is enabled, a text that was already
    printed, or a script that was already executed for equal match data, is
    skipped.  Only that single result is skipped; the remaining results of
    the rule are still processed.
    """
    results = rule["result"]
    for args in arg_cache1:
        # NOTE: _VALUE_ and _MATCHED_LINE_ must stay plain locals of this
        # function, the scripts executed below look them up by these names.
        _VALUE_ = args["values"]
        matched_line = args["matched_line"]
        # All matched lines as one text block; empty when nothing was recorded.
        _MATCHED_LINE_ = "\r\n".join(matched_line)

        for r in results:
            if "print" == r[0]:
                # Substitute the matched-line placeholder in the output text.
                result = r[1].replace(MATCHED_LINE_FLAG, _MATCHED_LINE_)
                if sys_args["single_result"]:
                    if result in arg_allow_same_help:
                        continue  # already printed once
                    arg_allow_same_help.append(result)
                print(result)
            elif "script" == r[0]:
                if sys_args["single_result"]:
                    already_run = False
                    for t in arg_allow_same_script_help:
                        if t[0] == r[1] and is_list_same(t[1], args):
                            already_run = True
                            break
                    if already_run:
                        continue  # same script with the same match data
                    arg_allow_same_script_help.append([r[1], args])
                # SECURITY: this executes Python code taken verbatim from the
                # rule file; only load rule files from a trusted source.
                exec(r[1])
    return
def is_list_same(a, b):
    """Return True when the two mappings hold the same keys with equal values.

    Despite its name the function is used on dicts: every key of *a* has to
    be present in *b* with an equal value, and both must have the same size.
    Keys are looked up directly instead of scanning *b* for every key of *a*.
    """
    if len(a) != len(b):
        return False
    for key in a:
        if key not in b or not (a[key] == b[key]):
            return False
    return True
###################################################################################################
def do_include_match(rule, contents_all, arg_cache0, arg_cache1):
    """Match an "include" rule: collect every content line containing the text.

    Args:
        rule: rule dict; the text to look for is ``rule["sub_rules"][0][1]``.
        contents_all: list of content lines to search.
        arg_cache0: unused, kept for a uniform matcher signature.
        arg_cache1: output list.  Exactly one
            ``{"matched_line": [...], "values": {}}`` entry is appended, and
            only when at least one line matched, so that an empty list means
            "no match" to the caller.
    """
    wanted = rule["sub_rules"][0][1]
    hits = [line for line in contents_all if wanted in line]
    if hits:
        arg_cache1.append({"matched_line": hits, "values": {}})
    return
###################################################################################################
def do_equal_match(rule, contents_all, arg_cache0, arg_cache1):
    """Match a multi-line "equal" or "pattern" rule against the content lines.

    The rule text (``rule["sub_rules"][0][1]``) is split into rule lines with
    ``get_lines`` and compared line by line, through ``is_match``, with a
    private copy of *contents_all*.  A rule line containing ``ANY_NUM_LINE``
    followed by a number allows a limited number of content lines to be
    skipped before the rule line after it has to match.

    On success one ``{"matched_line": [...], "values": {...}}`` dict is
    appended to *arg_cache1*; ``matched_line`` receives every line of the
    working copy and ``values`` whatever ``is_match`` stored in it.
    *arg_cache0* is not used.

    NOTE(review): the comparison is attempted only once, starting at the
    first content line.  On a mismatch the first working line is popped and
    the loop is left, but nothing retries from the next line - a retry loop
    may have been intended; confirm.
    NOTE(review): ``string.atoi`` is deprecated and only exists on Python 2.
    """
    sub_rule = rule["sub_rules"]
    rule_tmp = get_lines(sub_rule[0][1])
    rule_type = sub_rule[0][0]
    # Work on a copy so that the caller's list is never modified.
    lines_tmp = []
    for line in contents_all:
        lines_tmp.append(line)
    # Remaining budget of the active ANY_NUM_LINE rule line (0 = not active).
    any_num_line = 0
    # Index of the rule line that follows the active ANY_NUM_LINE (0 = none).
    next_match_rule_line = 0
    if len(lines_tmp) >= len(rule_tmp):
        c = 0
        j = 0  # index of the rule line currently being compared
        matched = False
        arg_item = {"matched_line":[], "values":{}}
        # Start matching
        for i in range(0,len(lines_tmp)):
            # ANY_NUM_LINE handling first: when the current rule line is an
            # ANY_NUM_LINE line, read the maximum number of lines that may be
            # skipped and compare against the following rule line instead;
            # once that one matches, continue with the rule line after it.
            if 0 == any_num_line and -1 != rule_tmp[j].find(ANY_NUM_LINE):
                t = rule_tmp[j].split("-")
                if len(t) >= 2:
                    any_num_line = string.atoi(t[1]) + 2 # budget derived from the number after the dash
                    next_match_rule_line = j + 1 # index of the next rule line
                    if next_match_rule_line >= len(rule_tmp):
                        # ANY_NUM_LINE is the last rule line: nothing follows it.
                        next_match_rule_line = 0
            if any_num_line > 0: # one unit of the budget is used per content line
                any_num_line = any_num_line - 1
            if 0 != any_num_line and 0 != next_match_rule_line: # budget left: try the next rule line
                if True == is_match(lines_tmp[i], rule_tmp[next_match_rule_line], rule_type, arg_item["values"]):
                    # Step over both the ANY_NUM_LINE line and the matched line.
                    j = j + 2
                    # The next rule line matched, so ANY_NUM_LINE mode ends.
                    next_match_rule_line = 0
                    any_num_line = 0
                    if j == len(rule_tmp): # the last rule line was hit: complete match
                        matched = True
                        break
                # Matched or skipped, this content line is done.
                continue
            # Regular comparison: strictly line by line, every line must agree.
            if False == is_match(lines_tmp[i], rule_tmp[j], rule_type, arg_item["values"]):
                lines_tmp.pop(0)
                c = -1
                break
            else:
                if j == len(rule_tmp) - 1:
                    matched = True
                    break
                j = j + 1 # one rule line matched, move on to the next one
                c = c + 1
        if matched: # the whole rule matched
            for tmp in lines_tmp:
                arg_item["matched_line"].append(tmp)
            arg_cache1.append(arg_item)
            del lines_tmp[0:len(lines_tmp)] # empty the working copy
    return
###################################################################################################
def is_match(line, rule, type, values):
    """Tell whether one content line satisfies one rule line.

    ``LINE_ANY_STR`` as rule line accepts every content line.  Otherwise an
    "equal" rule needs the two strings to be identical and a "pattern" rule
    is delegated to ``pattern_line_match``, which may store captured values
    in *values*.  Any other rule type never matches.
    """
    if rule == LINE_ANY_STR:
        return True

    if type == RULE_KEY_EQUAL:
        hit = line == rule
    elif type == RULE_KEY_PATTERN:
        hit = pattern_line_match(rule, line, values)
    else:
        hit = False
    return True if hit else False
###################################################################################################
def line_format(line):
    """Normalise one input line by removing leading and trailing whitespace."""
    return line.strip()
###################################################################################################
def mutiline_format(txt):
    """Normalise a multi-line text.

    ``\\r\\n`` line ends become ``\\n``, every line is stripped of leading and
    trailing whitespace and lines that end up empty are dropped.

    Returns:
        The remaining lines joined with ``\\n`` (an empty string when no line
        is left).
    """
    stripped_lines = (part.strip() for part in txt.replace("\r\n", "\n").split("\n"))
    # join() builds the result in one pass instead of repeated concatenation.
    return "\n".join(part for part in stripped_lines if part)
###################################################################################################
def read_rules(rules_file):
    """Load the rule script from the CSV file *rules_file*.

    The first row is a header and is skipped, as are completely empty rows
    (blank lines in the file).  Every other row is validated with
    ``check_rule`` and stored with ``add_rule``.

    Returns:
        ``True`` when the file was read, ``False`` when the csv module
        reported a parse error (the error is printed).

    Errors raised while opening the file are not caught here.
    """
    # The csv module needs a binary file on Python 2 and a text file opened
    # with newline="" on Python 3.
    if sys.version_info[0] < 3:
        rules_fp = open(rules_file, "rb")
    else:
        rules_fp = open(rules_file, "r", newline="")

    # The file is closed on every path out of this block.
    with rules_fp:
        try:
            for index, row in enumerate(csv.reader(rules_fp)):
                if index == 0:
                    continue  # header row
                if not row:
                    continue  # blank line in the file
                if check_rule(row):
                    add_rule(row)
                else:
                    print("add rule fail: %s" % (row,))
        except csv.Error as e:
            print(e)
            return False
    return True
###################################################################################################
def read_regex(regex, regex_str, config_path="smartscript/pattern.csv"):
    """Read the regular-expression configuration file.

    Each data row of the CSV file is ``code, regular expression``; the first
    row is a comment row and is skipped, as are rows with fewer than two
    columns (for example blank lines).

    Args:
        regex: output list; one ``[code, regular expression]`` entry is
            appended per data row.
        regex_str: one-element list; element 0 accumulates all codes joined
            with ``"|"``.
        config_path: location of the configuration file.

    Errors raised while opening or parsing the file are not caught here.
    """
    # The csv module needs a binary file on Python 2 and a text file opened
    # with newline="" on Python 3.
    if sys.version_info[0] < 3:
        config_fp = open(config_path, "rb")
    else:
        config_fp = open(config_path, "r", newline="")

    # The file is closed on every path out of this block.
    with config_fp:
        for index, row in enumerate(csv.reader(config_fp)):
            if index == 0:
                continue  # the first row is a comment row
            if len(row) < 2:
                continue  # blank or incomplete row
            if "" == regex_str[0]:
                regex_str[0] = row[0]
            else:
                regex_str[0] = regex_str[0] + "|" + row[0]
            regex.append([row[0], row[1]])
    return
###################################################################################################
def add_rule(line):
    """Store one script row in ``global_rules``.

    Columns of *line*: 0 = rule name, 1 = match operation, 2 = match text,
    3 = result operation, 4 = result content.  The match text is normalised
    with ``mutiline_format`` and its predefined regular-expression codes are
    expanded with ``get_regex_str``.

    A row whose name equals an already stored rule is added to that rule as
    an additional sub-rule/result pair; otherwise a new rule is created.

    Returns:
        Always ``True``.
    """
    match_text = get_regex_str(
        mutiline_format(line[2]), global_config_regex, global_config_regex_str[0])
    new_sub_rule = [line[1], match_text]
    new_result = [line[3], line[4]]
    rule_name = line[0]

    # Look for a rule with the same name.
    for existing in global_rules:
        if existing["name"] != rule_name:
            continue
        existing["sub_rules"].append(new_sub_rule)
        existing["result"].append(new_result)
        return True

    # No rule of that name yet: insert a new one.
    global_rules.append(
        {"name": rule_name, "sub_rules": [new_sub_rule], "result": [new_result]})
    return True
###################################################################################################
def check_rule(script):
    """Check whether a script row is valid.

    Placeholder: no validation is performed, every row is accepted.
    """
    return True
###################################################################################################
def read_content():
    """Read the text to analyse from standard input into ``global_contents``.

    Every line is normalised with ``line_format``; empty lines are dropped.
    Reading stops at a line equal to ``END_INPUT_FLAG`` or, when the input
    ends without that terminator, at end of input - the lines read so far are
    kept in both cases.

    Returns:
        Always ``True``.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    while True:
        try:
            raw_line = read_line()
        except EOFError:
            # Input ended without the terminator line (e.g. a piped file).
            return True
        line = line_format(raw_line)
        if line == "":
            continue
        if END_INPUT_FLAG == line:
            return True
        global_contents.append(line)
###################################################################################################
def pattern_line_match(rule, line, rule_reg2):
    """Match *line* against the regular expression *rule*.

    The expression is anchored at the start of the line (``re.match``).  On
    success the value of every named group is stored in *rule_reg2* under
    the group name: as an int when ``is_integer`` reports an integer,
    otherwise as the matched text.  A named group that took no part in the
    match is stored as ``None``.

    The group names come from the match object itself, so text that merely
    looks like a named group (escaped or inside a character class) is not
    mistaken for one.

    Returns:
        ``True`` when the line matches, ``False`` otherwise or when *rule*
        or *line* is ``None``.

    Raises:
        re.error: when *rule* is not a valid regular expression.
    """
    if rule is None or line is None:
        return False
    matched = re.match(rule, line)
    if matched is None:
        return False
    for name, text in matched.groupdict().items():
        if text is None:
            rule_reg2[name] = None
        else:
            rule_reg2[name] = is_integer(text)[1]
    return True
###################################################################################################
def get_key(value_list, pattern_str):
    """Collect the group names of ``(?P<name>`` constructs in *pattern_str*.

    The pattern is scanned character by character; each name found is
    appended to *value_list* in order of appearance.  Nothing is returned.
    """
    prefix_char = ''  # last character recognised of the "(?P<" prefix
    prefix_pos = 0    # position of that character
    name_start = 0    # position where a group name starts (0 = not in a name)

    for pos, ch in enumerate(pattern_str):
        if ch == '(':
            # A new group may start here.
            prefix_char, prefix_pos = '(', pos
            continue

        if prefix_pos == pos - 1:
            # Directly behind the last prefix character: extend the prefix.
            if prefix_char == '(' and ch == '?':
                prefix_char, prefix_pos = '?', pos
            elif prefix_char == '?' and ch == 'P':
                prefix_char, prefix_pos = 'P', pos
            elif prefix_char == 'P' and ch == '<':
                name_start = pos + 1
        elif name_start != 0:
            # Inside a group name: the name ends at ">".
            if ch == '>':
                value_list.append(pattern_str[name_start:pos])
                name_start = 0
        else:
            # Ordinary character: forget any partially recognised prefix.
            name_start = 0
            prefix_char = ''
            prefix_pos = 0
###################################################################################################
def get_lines(txt):
    """Split *txt* into a list of lines.

    ``\\r\\n`` line ends are converted to ``\\n`` first, so the returned lines
    never carry a trailing ``\\r`` from a Windows line end.
    """
    return txt.replace("\r\n", "\n").split("\n")
###################################################################################################
def is_integer(s):
    """Check whether *s* consists of digits only and convert it if so.

    Returns:
        ``(True, value)`` with the integer value when *s* is a non-empty
        string of digits, otherwise ``(False, s)`` with *s* unchanged
        (including when *s* is ``None``).
    """
    # \Z (not $) so that a trailing newline is not accepted as part of a number.
    if s is None or re.match(r'\d+\Z', s) is None:
        return False, s
    return True, int(s)
###################################################################################################
def get_regex_str(org, regex_list, code_str):
    """Expand the predefined regular-expression codes used in a rule text.

    A token has the form ``CODE-name-`` where ``CODE`` is one of the
    configured codes and ``name`` consists of 1 to 15 letters or digits.
    Each token is replaced by ``(?P<name>(regex))`` where ``regex`` is the
    expression configured for ``CODE`` (empty when the code is unknown).

    All tokens are replaced in a single pass over *org*, so text inside an
    inserted expression is never expanded again.

    Args:
        org: rule text to expand.
        regex_list: list of ``[code, regular expression]`` entries; for a
            duplicated code the first entry is used.
        code_str: all codes joined with ``"|"``, used as a regex alternation.
            When empty, no codes are configured and *org* is returned as is.

    Returns:
        The expanded rule text.
    """
    if not code_str:
        return org

    regex_by_code = {}
    for entry in regex_list:
        regex_by_code.setdefault(entry[0], entry[1])

    token = re.compile("(?P<_code>%s)-(?P<_name>[a-zA-Z0-9]{1,15})-" % code_str)

    def expand(found):
        # The return value of a replacement function is inserted literally,
        # so backslashes of the configured expression are kept as they are.
        return "(?P<%s>(%s))" % (
            found.group("_name"), regex_by_code.get(found.group("_code"), ""))

    return token.sub(expand, org)
###################################################################################################
def get_regex_by_code(regex_list, code):
    """Look up the regular expression configured for *code*.

    *regex_list* holds ``[code, regular expression]`` entries; the expression
    of the first entry with a matching code is returned, or an empty string
    when there is none.
    """
    return next((entry[1] for entry in regex_list if entry[0] == code), "")
###################################################################################################
# Run the main function when this file is executed as a script
###################################################################################################
if __name__ == "__main__":
    main()