1.背景
之前做了服务运行日志的接入,利用ilogtail组件进行日志的解析及采集。ilogtail需要书写正则表达式对日志内容进行匹配,切分出所需要的字段,实现用户自定义的规则采集。
本来挺好的……然鹅,同事不争气,竟然反馈正则表达式不好写!(╬ ̄皿 ̄) 要你们何用!
于是只好再写一个根据日志样例自动生成正则表达式的工具。
2.代码
以下是仅供参考的Demo:正则库尚不完整,后续会继续补充。
直接贴代码:
/**
 * Generates a regular expression for a sample log line.
 *
 * <p>The sample line is split on a separator, every field is matched against
 * the candidate patterns in {@code RegexRepo}, and the per-field patterns are
 * joined back together with the separator.
 *
 * <p>Instances carry mutable state: the current separator and the running
 * "best score" used by {@link #verifyPattern(String, String)}.
 */
public class RegexFactory {
    /** Regex metacharacters that must be backslash-escaped in a custom separator. */
    private static final String SPECIAL_CHARS = ".+*\\$^?{}()[]|";
    /** Starting score; any candidate shorter than this can win a scan. */
    private static final int INITIAL_SCORE = 1000;
    /** Vertical-bar separator (escaped for regex use). */
    private static final String verticalSep = "\\|";
    /** Whitespace separator. */
    private static final String spaceSep = "\\s";
    /** Colon separator. */
    private static final String colonSep = ":";
    /** Comma separator. */
    private static final String commaSep = ",";

    /** Separator currently in effect, stored as a regex fragment. */
    private String currentSep = "\\s";
    /**
     * Length of the shortest pattern accepted so far in the current scan.
     * When several candidates match, the shortest pattern wins.
     */
    private int score = INITIAL_SCORE;

    /** Creates a factory that splits on whitespace until a separator is confirmed. */
    public RegexFactory() {
    }

    /**
     * Creates a factory with an explicit separator.
     *
     * @param split separator, used as-is as a regex fragment
     */
    public RegexFactory(String split) {
        this.currentSep = split;
    }

    /**
     * Tests whether {@code regex} fully matches {@code logPart} and is shorter
     * than the best pattern accepted so far; on success the running score is
     * lowered to the length of {@code regex}.
     *
     * @param logPart text to test
     * @param regex   candidate pattern
     * @return {@code true} if the candidate matched and became the new best
     */
    public boolean verifyPattern(String logPart, String regex) {
        boolean matched = Pattern.compile(regex).matcher(logPart).matches();
        if (matched && regex.length() < score) {
            score = regex.length();
            return true;
        }
        return false;
    }

    /**
     * Selects the separator used to split log lines.
     *
     * @param index separator kind: 0 vertical bar, 1 and 3 whitespace, 2 colon,
     *              4 comma, 5 custom; any other value leaves the separator unchanged
     * @param sep   custom separator text, only read when {@code index} is 5;
     *              a {@code null} or empty value leaves the separator unchanged
     */
    public void confirmSeparator(int index, String sep) {
        switch (index) {
            case 0:
                currentSep = verticalSep;
                break;
            case 1:
            case 3:
                currentSep = spaceSep;
                break;
            case 2:
                currentSep = colonSep;
                break;
            case 4:
                currentSep = commaSep;
                break;
            case 5:
                // An empty separator would otherwise become a lone backslash,
                // which is not a valid regex.
                if (sep != null && !sep.isEmpty()) {
                    currentSep = escapeSeparator(sep);
                }
                break;
            default:
                break;
        }
    }

    /**
     * Finds the pattern for the first field of a line.
     *
     * @param lineStart first field of the log line
     * @return the shortest matching pattern from {@code RegexRepo.lineStartList},
     *         or an empty string when none matches
     */
    public String lineStartVerity(String lineStart) {
        return shortestMatch(lineStart, RegexRepo.lineStartList);
    }

    /**
     * Finds the pattern for a single field. Candidate lists are tried in order:
     * date patterns, general patterns (phone, e-mail, IP, ...), then the full list.
     *
     * @param line field text
     * @return the shortest matching pattern of the first list that matches,
     *         or {@code RegexRepo.allRegex} when nothing matches
     */
    public String lineVerity(String line) {
        // Date patterns are always tried; gating on the current year skipped
        // samples from other years and date formats that carry no year.
        String regex = shortestMatch(line, RegexRepo.dateList);
        if (regex.isEmpty()) {
            regex = shortestMatch(line, RegexRepo.generalList);
        }
        if (regex.isEmpty()) {
            regex = shortestMatch(line, RegexRepo.allList);
        }
        return regex.isEmpty() ? RegexRepo.allRegex : regex;
    }

    /**
     * Builds the regular expression for a whole sample line.
     *
     * @param line  sample log line
     * @param index separator kind, see {@link #confirmSeparator(int, String)}
     * @param sep   custom separator text, only read when {@code index} is 5
     * @return a two-element array {@code {lineStartRegex, lineRegex}}, or
     *         {@code null} when {@code line} is {@code null} or no line-start
     *         pattern matches the first field
     */
    public String[] lineRegexFactory(String line, int index, String sep) {
        if (line == null) {
            return null;
        }
        confirmSeparator(index, sep);
        // Limit -1 keeps trailing empty fields so separators are reproduced exactly.
        String[] parts = line.split(currentSep, -1);
        String start = lineStartVerity(parts[0]);
        if (start.isEmpty()) {
            return null;
        }
        StringBuilder lineRegex = new StringBuilder(start);
        for (int i = 1; i < parts.length; i++) {
            // The separator goes between fields only, never after the last one.
            lineRegex.append(currentSep);
            if (!parts[i].isEmpty()) {
                lineRegex.append(lineVerity(parts[i]));
            }
        }
        return new String[] {start, lineRegex.toString()};
    }

    /** Resets the running score so that a new scan starts fresh. */
    public void init() {
        score = INITIAL_SCORE;
    }

    /** Resets the score, then returns the shortest candidate that fully matches {@code text}, or "". */
    private String shortestMatch(String text, String[] candidates) {
        init();
        String best = "";
        for (String candidate : candidates) {
            if (verifyPattern(text, candidate)) {
                best = candidate;
            }
        }
        return best;
    }

    /** Backslash-escapes every regex metacharacter in {@code sep}. */
    private String escapeSeparator(String sep) {
        StringBuilder escaped = new StringBuilder(sep.length() * 2);
        for (int i = 0; i < sep.length(); i++) {
            char c = sep.charAt(i);
            if (SPECIAL_CHARS.indexOf(c) >= 0) {
                escaped.append('\\');
            }
            escaped.append(c);
        }
        return escaped.toString();
    }
}
/**
 * Repository of candidate regular expressions used to classify log fields.
 *
 * <p>Most patterns capture the field value in a group; several also accept an
 * optional or mandatory leading {@code [} outside the group.
 */
public class RegexRepo {
    /** Any characters (fallback). */
    public static final String allRegex = "(.*)";
    /** Lower-case letters. */
    public static final String smallRegex = "([a-z]+)";
    /** Leading '[' followed by lower-case letters. */
    public static final String smallZoRegex = "\\[([a-z]+)";
    /** Optional leading '[', lower-case letters and commas, then trailing whitespace. */
    public static final String smallCommaRegex = "\\[?([a-z,]+)\\s+";
    /** Upper-case letters. */
    public static final String bigRegex = "([A-Z]+)";
    /** Optional leading '[' followed by upper-case letters. */
    public static final String bigSoRegex = "\\[?([A-Z]+)";
    /** Upper-case letters followed by one whitespace character. */
    public static final String bigSpaceRegex = "([A-Z]+)\\s";
    /** Digits. */
    public static final String digitalRegex = "([0-9]+)";
    /** Leading '[' followed by digits. */
    public static final String digitalSoRegex = "\\[([0-9]+)";
    /** Digits and dots (floating-point style numbers). */
    public static final String floatRegex = "([0-9\\.]+)";
    /** Letters of either case. */
    public static final String smallBigRegex = "([a-zA-Z]+)";
    /** Upper-case letters and digits. */
    public static final String bigDigitalRegex = "([A-Z0-9]+)";
    /** Letters of either case and dots. */
    public static final String smallBigSoRegex = "([A-Za-z\\.]+)";
    /** Optional leading '[' followed by letters of either case and dots. */
    public static final String smallBigSo1Regex = "\\[?([A-Za-z\\.]+)";
    /** Lower-case letters and digits. */
    public static final String smallDigitalRegex = "([a-z0-9]+)";
    /** Letters of either case and digits. */
    public static final String smallBigDigitalRegex = "([a-zA-Z0-9]+)";
    /** Letters, digits, colons and dots. */
    public static final String smallBigDigitalSoRegex = "([a-zA-Z0-9:\\.]+)";
    /** Letters, digits and hyphens. */
    public static final String sbdhRegex = "([a-zA-Z0-9-]+)";
    /** Optional whitespace and '[' followed by letters, digits and hyphens. */
    public static final String sbdhzRegex = "\\s?\\[?([a-zA-Z0-9-]+)";
    /** Letters, digits, hyphens and underscores. */
    public static final String sbdhdRegex = "([a-zA-Z0-9-_]+)";
    /** Letters, digits, hyphens, underscores, '#' and dots. */
    public static final String sbdhdSoRegex = "([a-zA-Z0-9-_#\\.]+)";
    /** Thread name: '[' + name, optionally followed by whitespace and ']'. */
    public static final String threadRegex = "\\[([a-zA-Z0-9-_#\\.]+)\\s?]?";
    /** Task plus UUID: two consecutive bracketed groups. */
    public static final String taskRegex = "\\[([a-zA-Z0-9-_\\+]+)\\]\\[([a-zA-Z0-9-_]+)\\]";

    // ---- Chinese text ----
    /** Chinese characters. */
    public static final String chinaRegex = "([\\u4e00-\\u9fa5]+)";
    /** Chinese characters and digits. */
    public static final String chinaDigitalRegex = "([0-9\\u4e00-\\u9fa5]+)";
    /** Chinese characters and lower-case letters. */
    public static final String chinaEnRegex = "([a-z\\u4e00-\\u9fa5]+)";
    /** Chinese characters, letters, digits, hyphens, underscores and colons. */
    public static final String chinaEnSoRegex = "([a-zA-Z0-9-_::\\u4e00-\\u9fa5]+)";

    // ---- General-purpose values ----
    /**
     * Mobile number with optional "+", optional "86" and an optional hyphen or
     * whitespace before the 11-digit number, e.g. +8613812345678.
     * The country code is a non-capturing group: inside a character class '|'
     * is a literal, so a class such as [6|8] would also accept '|'.
     */
    public static final String phoneRegex = "(\\+?(?:86)?[-\\s]?1[3-9]\\d{9})";
    /** Mobile number written with hyphens, e.g. 182-2222-2222, with optional "+" and "86". */
    public static final String phoneSpaceRegex = "(\\+?(?:86)?1[3-9][0-9]-\\d{4}-\\d{4})";
    /** ID card number: 18 characters (last may be X/x) or 15 digits. */
    public static final String cardRegex = "(\\d{17}[0-9Xx]|\\d{15})";
    /** E-mail address. */
    public static final String emailRegex = "([-+\\.\\w]+@[-\\.\\w+]+)";
    /** Dotted-quad IPv4 address (octet range is not validated). */
    public static final String ipRegex = "(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})";
    /** HTTP or HTTPS URL; the scheme is followed by exactly two slashes. */
    public static final String urlRegex = "(https?://[\\w-\\./?%&=]+)";
    /** Domain name with optional scheme and port. Not included in any candidate list below. */
    public static final String domainRegex = "((https?:\\/\\/)?([\\w-]+\\.)+\\w+(\\:\\d{2,6})?)";

    // ---- Date and time formats ----
    /** Format 1: 2023-08-01 */
    public static final String date1Regex = "([0-9]{4}-[0-9]{2}-[0-9]{2})";
    /** Format 2: 2023/08/01 */
    public static final String date2Regex = "([0-9]{4}/[0-9]{2}/[0-9]{2})";
    /** Format 3: 08/01/2023 */
    public static final String date3Regex = "([0-9]{2}/[0-9]{2}/[0-9]{4})";
    /** Format 4: year-month-day written with Chinese unit characters. */
    public static final String date4Regex = "([0-9]{4}年[0-9]{2}月[0-9]{2}日)";
    /** Format 5: 2023-08-01 09:58:21 */
    public static final String date5Regex = "([0-9]{4}-[0-9]{2}-[0-9]{2}\\s[0-9]{2}:[0-9]{2}:[0-9]{2})";
    /** Format 6: 2023/08/01 09:58:21 */
    public static final String date6Regex = "([0-9]{4}/[0-9]{2}/[0-9]{2}\\s[0-9]{2}:[0-9]{2}:[0-9]{2})";
    /** Format 7: 2023-08-01T09:58:21Z */
    public static final String date7Regex = "([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z)";
    /** Format 8: 2023-08-01.09:58:21 */
    public static final String date8Regex = "([0-9]{4}-[0-9]{2}-[0-9]{2}\\.[0-9]{2}:[0-9]{2}:[0-9]{2})";
    /** Format 9: 2023-08-01 09:58:21.334 */
    public static final String date9Regex = "([0-9]{4}-[0-9]{2}-[0-9]{2}\\s[0-9]{2}:[0-9]{2}:[0-9]{2}\\.[0-9]{3,6})";
    /** Format 10: 2023-08-01 09:58:21,334 */
    public static final String date10Regex = "([0-9]{4}-[0-9]{2}-[0-9]{2}\\s[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3,6})";
    /** Format 11: date and time written with Chinese unit characters. */
    public static final String date11Regex = "([0-9]{4}年[0-9]{2}月[0-9]{2}日\\s[0-9]{2}时[0-9]{2}分[0-9]{2}秒)";
    /** Format 12: Aug 01 09:58:21 */
    public static final String date12Regex = "([a-zA-Z]{3}\\s[0-9]{2}\\s[0-9]{2}:[0-9]{2}:[0-9]{2})";
    /** Format 13: 2023-08-01T09:58:21.334Z */
    public static final String date13Regex = "([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\\.[0-9]{3,6}Z)";
    /** Format 14: Thu Aug 01 09:58:21 CST 2023 */
    public static final String date14Regex = "([a-zA-Z]{3}\\s[a-zA-Z]{3}\\s[0-9]{2}\\s[0-9]{2}:[0-9]{2}:[0-9]{2}\\s[a-zA-Z]{3}\\s[0-9]{4})";
    /** Format 15: 0801:09:58:21.334 */
    public static final String date15Regex = "([0-9]{4}:[0-9]{2}:[0-9]{2}:[0-9]{2}\\.[0-9]{3,6})";
    /** Format 16: 2023-08-01T09:58:21.334+0000, optional leading '['. */
    public static final String date16Regex = "\\[?([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\\.[0-9]{3,6}\\+[0-9]{4})";
    /** Format 17: 2023-08-01T09:58:21,334, optional leading '['. */
    public static final String date17Regex = "\\[?([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3,6})";

    /** Candidates for the first field of a line. */
    public static final String[] lineStartList = new String[]{
            date1Regex, date5Regex, date7Regex, date8Regex, date9Regex, date10Regex, date12Regex,
            date16Regex, date13Regex, date17Regex, sbdhdRegex, threadRegex};

    /** Date and time candidates. */
    public static final String[] dateList = new String[]{
            date1Regex, date2Regex, date3Regex, date4Regex, date5Regex, date6Regex, date7Regex,
            date8Regex, date9Regex, date10Regex, date11Regex, date12Regex, date13Regex, date14Regex,
            date15Regex, date16Regex, date17Regex};

    /** General-purpose candidates (phone, ID card, e-mail, IP, URL). */
    public static final String[] generalList = new String[]{
            phoneRegex, phoneSpaceRegex, cardRegex, emailRegex, ipRegex, urlRegex};

    /** All candidates; date17Regex is appended so this list covers every date format in dateList. */
    public static final String[] allList = new String[]{
            smallRegex, bigRegex, digitalRegex, digitalSoRegex, floatRegex, smallBigRegex, bigDigitalRegex,
            smallDigitalRegex, smallBigDigitalRegex,
            smallBigSoRegex, sbdhdSoRegex, chinaEnSoRegex, chinaDigitalRegex, bigSpaceRegex,
            smallBigDigitalSoRegex, smallCommaRegex, bigSoRegex, smallBigSo1Regex, smallZoRegex,
            sbdhRegex, sbdhdRegex, threadRegex, taskRegex, chinaRegex, chinaEnRegex, phoneRegex,
            phoneSpaceRegex, cardRegex, emailRegex, ipRegex, urlRegex, date1Regex, date2Regex, sbdhzRegex,
            date3Regex, date4Regex, date5Regex, date6Regex, date7Regex, date8Regex, date9Regex,
            date10Regex, date11Regex, date12Regex, date13Regex, date14Regex, date15Regex, date16Regex,
            date17Regex};
}
/**
 * Kinds of field separator a user can choose, each with a numeric index and
 * a display name.
 */
public enum SeparatorEnum {
    VERTICAL(0, "竖线'|'"),
    SPACE(1, "空格"),
    COLON(2, "冒号"),
    TAB(3, "Tab键"),
    COMMA(4, "逗号"),
    OTHER(5, "其他");

    /** Numeric identifier of the separator kind. */
    private final int index;
    /** Display name of the separator kind. */
    private final String name;

    SeparatorEnum(int index, String name) {
        this.index = index;
        this.name = name;
    }

    /** @return the numeric identifier of this separator kind */
    public int getIndex() {
        return this.index;
    }

    /** @return the display name of this separator kind */
    public String getName() {
        return this.name;
    }

    /**
     * Looks up the display name for an index.
     *
     * @param index numeric identifier to look up
     * @return the display name, or an empty string when no constant has that index
     */
    public static String getNameByIndex(int index) {
        SeparatorEnum found = getByIndex(index);
        if (found != null) {
            return found.getName();
        }
        return "";
    }

    /**
     * Looks up the constant for an index.
     *
     * @param index numeric identifier to look up
     * @return the matching constant, or {@code null} when no constant has that index
     */
    public static SeparatorEnum getByIndex(int index) {
        SeparatorEnum[] all = values();
        for (int i = 0; i < all.length; i++) {
            if (all[i].index == index) {
                return all[i];
            }
        }
        return null;
    }
}
验证方式:
// Demo: generate the line-start regex and the full line regex for one sample,
// splitting on the custom separator "]" (index 5).
RegexFactory regexFactory = new RegexFactory();
String str = "[2023-08-25T10:17:35,203][DEBUG][o.e.a.s.TransportSearchAction] [node-1] All shards failed for phase: [query]\n asada \n sdasf";
String[] strings = regexFactory.lineRegexFactory(str, 5, "]");
// lineRegexFactory returns null when no line-start pattern matches the first
// field, so guard before iterating.
if (strings == null) {
    System.out.println("No line-start regex matched the sample line");
} else {
    for (String s : strings) {
        System.out.println(s);
    }
}
3.结语
以上是全部代码,欢迎指正!拜谢~