自动生成正则表达式-Demo样例

1.背景

之前做了服务运行日志的接入,利用 ilogtail 组件进行日志的解析及采集。ilogtail 需要书写正则表达式对日志内容进行匹配,切分出所需要的字段,实现用户自定义的规则采集。(原文此处为配图)
本来挺好的……然鹅,同事不争气,竟然反馈正则表达式不好写!(╬ ̄皿 ̄) 要你们何用!
于是只好再写个根据样例自动生成正则表达式的方法。

2.代码

仅供参考的Demo,正则库不全,后续继续补充。仅供参考!!!
直接贴代码:

/**
 * Builds a regular expression for a sample log line.
 *
 * <p>The sample is split on a separator, every segment is compared against the
 * candidate patterns in {@code RegexRepo}, and the chosen patterns are joined
 * back together with the separator. An instance keeps mutable scoring state
 * ({@code score}) and the currently selected separator.
 */
public class RegexFactory {
    // Starting score; any matching regex shorter than this replaces it
    private static final int INITIAL_SCORE = 1000;

    // Regex metacharacters that must be escaped when used as a literal separator
    private String expectChar = ".+*\\$^?{}()[]|";
    // Separator (as a regex fragment) currently used to split lines; whitespace by default
    private String currentSep = "\\s";
    // Length of the shortest regex accepted so far; among matching regexes the shortest one wins
    private int score = INITIAL_SCORE;

    // Formatter with pattern "yyyy", used to render the current year as text
    private SimpleDateFormat year = new SimpleDateFormat("yyyy");

    // Vertical-bar separator
    private static final String verticalSep = "\\|";
    // Whitespace separator
    private static final String spaceSep = "\\s";
    // Colon separator
    private static final String colonSep = ":";
    // Comma separator
    private static final String commaSep = ",";

    public RegexFactory(){}

    /**
     * Creates a factory with a preset separator.
     *
     * @param split separator regex fragment used until {@link #confirmSeparator} replaces it
     */
    public RegexFactory(String split){
        this.currentSep = split;
    }

    /**
     * Checks a candidate regex against a log segment and updates the running score.
     *
     * <p>The candidate is accepted only when it matches the whole segment and is
     * strictly shorter than every candidate accepted since the last {@link #init()}.
     *
     * @param logPart segment of the log line; {@code null} never matches
     * @param regex   candidate regular expression; {@code null} never matches
     * @return {@code true} if the candidate matched and became the new best candidate
     */
    public boolean verifyPattern(String logPart,String regex){
        if(logPart == null || regex == null){
            return false;
        }
        if(!Pattern.matches(regex, logPart)){
            return false;
        }
        if(regex.length() < score){
            score = regex.length();
            return true;
        }
        return false;
    }

    /**
     * Selects the separator used to split log lines.
     *
     * @param index separator choice: 0 vertical bar, 1 or 3 whitespace, 2 colon,
     *              4 comma, 5 custom; any other value leaves the separator unchanged
     * @param sep   custom separator, only read when {@code index} is 5
     * @throws IllegalArgumentException if {@code index} is 5 and {@code sep} is null or empty
     */
    public void confirmSeparator(int index,String sep){
        switch (index){
            case 0 : currentSep = verticalSep; break;
            case 1 :
            case 3 : currentSep = spaceSep; break;
            case 2 : currentSep = colonSep; break;
            case 4 : currentSep = commaSep; break;
            case 5 :
                // A null separator used to fail with an NPE, and an empty one produced
                // the invalid regex "\" that only blew up later inside String.split.
                if(sep == null || sep.isEmpty()){
                    throw new IllegalArgumentException("custom separator must not be null or empty");
                }
                // Regex metacharacters are escaped so they are matched literally
                currentSep = expectChar.contains(sep) ? "\\" + sep : sep;
                break;
            default : break;
        }
    }

    /**
     * Finds the line-start regex for the first segment of a line.
     *
     * @param lineStart first segment of the log line
     * @return the shortest matching entry of {@code RegexRepo.lineStartList}, or "" if none matches
     */
    public String lineStartVerity(String lineStart){
        // bestMatch resets the score first; without the reset a reused instance kept the
        // score of its previous segment and rejected every line-start candidate.
        return bestMatch(lineStart, RegexRepo.lineStartList);
    }

    /**
     * Finds the regex for one segment of a line.
     *
     * <p>Candidates are tried in three stages, each with a fresh score: date patterns
     * (only when the segment contains the current year), general patterns, then the
     * full list. The first stage that yields a match decides the result.
     *
     * @param line segment of the log line
     * @return the chosen regex, or {@code RegexRepo.allRegex} when nothing matches
     */
    public String lineVerity(String line){
        String thisYear = year.format(new Date());

        // Date patterns are only attempted when the segment mentions the current year
        if(line != null && line.contains(thisYear)){
            String dateRegex = bestMatch(line, RegexRepo.dateList);
            if(!dateRegex.isEmpty()){
                return dateRegex;
            }
        }

        // General-purpose patterns from RegexRepo.generalList
        String generalRegex = bestMatch(line, RegexRepo.generalList);
        if(!generalRegex.isEmpty()){
            return generalRegex;
        }

        // Every pattern in the repository
        String anyRegex = bestMatch(line, RegexRepo.allList);
        if(!anyRegex.isEmpty()){
            return anyRegex;
        }

        // Nothing matched: fall back to the wildcard group
        return RegexRepo.allRegex;
    }


    /**
     * Generates the line-start regex and the whole-line regex for a sample line.
     *
     * @param line  sample log line
     * @param index separator choice, see {@link #confirmSeparator}
     * @param sep   custom separator, only read when {@code index} is 5
     * @return a two-element array {line-start regex, line regex}, or {@code null} when
     *         {@code line} is null or no line-start regex matches the first segment
     */
    public String[] lineRegexFactory(String line,int index,String sep){
        if(line == null){
            return null;
        }
        confirmSeparator(index, sep);
        // Limit -1 keeps trailing empty segments so separators are reproduced faithfully
        String[] linePart = line.split(currentSep, -1);

        String start = lineStartVerity(linePart[0]);
        if(start.isEmpty()){
            return null;
        }

        StringBuilder sb = new StringBuilder(start);
        for(int i = 1 ; i < linePart.length ; i++ ){
            // The separator goes between segments only; a single-segment line used to
            // get a trailing separator that the sample itself does not contain.
            sb.append(currentSep);
            // Empty segments contribute nothing but their separator
            if(!linePart[i].isEmpty()){
                sb.append(lineVerity(linePart[i]));
            }
        }

        String[] result = new String[2];
        result[0] = start;
        result[1] = sb.toString();
        return result;
    }

    /**
     * Resets the score so the next candidate scan starts fresh.
     */
    public void init(){
        score = INITIAL_SCORE;
    }

    // Resets the score, then returns the shortest candidate that fully matches text, or "".
    private String bestMatch(String text, String[] candidates){
        init();
        String best = "";
        for(String regex : candidates){
            if(verifyPattern(text, regex)){
                best = regex;
            }
        }
        return best;
    }

}
/**
 * Repository of candidate regular expressions and the lists they are grouped into.
 *
 * <p>Every pattern is a Java regex string; most wrap the interesting part in a
 * capture group.
 */
public class RegexRepo {
    // Any characters (wildcard capture group)
    public static final String allRegex = "(.*)";
    // Lowercase letters
    public static final String smallRegex = "([a-z]+)";
    // Lowercase letters preceded by '['
    public static final String smallZoRegex = "\\[([a-z]+)";
    // Lowercase letters and commas, optional leading '[', followed by whitespace
    public static final String smallCommaRegex = "\\[?([a-z,]+)\\s+";
    // Uppercase letters
    public static final String bigRegex = "([A-Z]+)";
    // Uppercase letters with an optional leading '['
    public static final String bigSoRegex = "\\[?([A-Z]+)";
    // Uppercase letters followed by one whitespace character
    public static final String bigSpaceRegex = "([A-Z]+)\\s";
    // Digits
    public static final String digitalRegex = "([0-9]+)";
    // Digits preceded by '['
    public static final String digitalSoRegex = "\\[([0-9]+)";
    // Digits and dots (floating-point style numbers)
    public static final String floatRegex = "([0-9\\.]+)";
    // Letters of either case
    public static final String smallBigRegex = "([a-zA-Z]+)";
    // Uppercase letters and digits
    public static final String bigDigitalRegex = "([A-Z0-9]+)";
    // Letters of either case and dots
    public static final String smallBigSoRegex = "([A-Za-z\\.]+)";
    // Letters of either case and dots, optional leading '['
    public static final String smallBigSo1Regex = "\\[?([A-Za-z\\.]+)";
    // Lowercase letters and digits
    public static final String smallDigitalRegex = "([a-z0-9]+)";
    // Letters of either case and digits
    public static final String smallBigDigitalRegex = "([a-zA-Z0-9]+)";
    // Letters, digits, colons and dots
    public static final String smallBigDigitalSoRegex = "([a-zA-Z0-9:\\.]+)";
    // Letters, digits and hyphens
    public static final String sbdhRegex = "([a-zA-Z0-9-]+)";
    // Letters, digits and hyphens, optional leading whitespace and '['
    public static final String sbdhzRegex = "\\s?\\[?([a-zA-Z0-9-]+)";
    // Letters, digits, hyphens and underscores
    public static final String sbdhdRegex = "([a-zA-Z0-9-_]+)";
    // Letters, digits, hyphens, underscores, '#' and dots
    public static final String sbdhdSoRegex = "([a-zA-Z0-9-_#\\.]+)";
    // Thread name: leading '[', then the name, optional whitespace and optional ']'
    public static final String threadRegex = "\\[([a-zA-Z0-9-_#\\.]+)\\s?]?";
    // Task plus UUID: two consecutive bracketed groups
    public static final String taskRegex = "\\[([a-zA-Z0-9-_\\+]+)\\]\\[([a-zA-Z0-9-_]+)\\]";

    // Chinese text
    // Characters in the range U+4E00..U+9FA5
    public static final String chinaRegex = "([\\u4e00-\\u9fa5]+)";
    // Chinese characters and digits
    public static final String chinaDigitalRegex = "([0-9\\u4e00-\\u9fa5]+)";
    // Chinese characters and lowercase letters
    public static final String chinaEnRegex = "([a-z\\u4e00-\\u9fa5]+)";
    // Chinese characters, letters, digits, hyphens, underscores and colons
    public static final String chinaEnSoRegex = "([a-zA-Z0-9-_::\\u4e00-\\u9fa5]+)";

    // Mobile number such as (+86)18222222222.
    // The prefix class used to be [6|8|-], where '|' is a literal pipe rather than alternation.
    public static final String phoneRegex = "(\\+?[68\\-]{0,2}\\s?1[3-9]\\d{9})";
    // Mobile number with hyphens such as (+86)182-2222-2222 (same pipe fix as above)
    public static final String phoneSpaceRegex = "(\\+?[68]{0,2}1[3-9][0-9]-\\d{4}-\\d{4})";

    // ID card number: 18 characters (last may be X/x) or 15 digits
    public static final String cardRegex = "(\\d{17}[0-9Xx]|\\d{15})";

    // E-mail address
    public static final String emailRegex = "([-+\\.\\w]+@[-\\.\\w+]+)";

    // IPv4-style dotted address
    public static final String ipRegex = "(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})";

    // URL. The scheme separator used to be ":///" (three slashes), which no ordinary URL contains.
    public static final String urlRegex = "(https?://[\\w-\\./?%&=]+)";
    // Domain name with optional scheme and port
    public static final String domainRegex = "((https?:\\/\\/)?([\\w-]+\\.)+\\w+(\\:\\d{2,6})?)";

    // Date formats
    // Format 1: 2023-08-01
    public static final String date1Regex = "([0-9]{4}-[0-9]{2}-[0-9]{2})";
    // Format 2: 2023/08/01
    public static final String date2Regex = "([0-9]{4}/[0-9]{2}/[0-9]{2})";
    // Format 3: 08/01/2023
    public static final String date3Regex = "([0-9]{2}/[0-9]{2}/[0-9]{4})";
    // Format 4: 2023年08月01日
    public static final String date4Regex = "([0-9]{4}年[0-9]{2}月[0-9]{2}日)";
    // Format 5: 2023-08-01 09:58:21
    public static final String date5Regex = "([0-9]{4}-[0-9]{2}-[0-9]{2}\\s[0-9]{2}:[0-9]{2}:[0-9]{2})";
    // Format 6: 2023/08/01 09:58:21
    public static final String date6Regex = "([0-9]{4}/[0-9]{2}/[0-9]{2}\\s[0-9]{2}:[0-9]{2}:[0-9]{2})";
    // Format 7: 2023-08-01T09:58:21Z
    public static final String date7Regex = "([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z)";
    // Format 8: 2023-08-01.09:58:21
    public static final String date8Regex = "([0-9]{4}-[0-9]{2}-[0-9]{2}\\.[0-9]{2}:[0-9]{2}:[0-9]{2})";
    // Format 9: 2023-08-01 09:58:21.334
    public static final String date9Regex = "([0-9]{4}-[0-9]{2}-[0-9]{2}\\s[0-9]{2}:[0-9]{2}:[0-9]{2}\\.[0-9]{3,6})";
    // Format 10: 2023-08-01 09:58:21,334
    public static final String date10Regex = "([0-9]{4}-[0-9]{2}-[0-9]{2}\\s[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3,6})";
    // Format 11: 2023年08月01日 18时25分35秒
    public static final String date11Regex = "([0-9]{4}年[0-9]{2}月[0-9]{2}日\\s[0-9]{2}时[0-9]{2}分[0-9]{2}秒)";
    // Format 12: Aug 01 09:58:21
    public static final String date12Regex = "([a-zA-Z]{3}\\s[0-9]{2}\\s[0-9]{2}:[0-9]{2}:[0-9]{2})";
    // Format 13: 2023-08-01T09:58:21.334Z
    public static final String date13Regex = "([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\\.[0-9]{3,6}Z)";
    // Format 14: Thu Aug 01 09:58:21 CST 2023
    public static final String date14Regex = "([a-zA-Z]{3}\\s[a-zA-Z]{3}\\s[0-9]{2}\\s[0-9]{2}:[0-9]{2}:[0-9]{2}\\s[a-zA-Z]{3}\\s[0-9]{4})";
    // Format 15: 0801:09:58:21.334
    public static final String date15Regex = "([0-9]{4}:[0-9]{2}:[0-9]{2}:[0-9]{2}\\.[0-9]{3,6})";
    // Format 16: 2023-08-01T09:58:21.334+0000, optional leading '['
    public static final String date16Regex = "\\[?([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\\.[0-9]{3,6}\\+[0-9]{4})";
    // Format 17: 2023-08-01T09:58:21,334, optional leading '['
    public static final String date17Regex = "\\[?([0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3,6})";

    // Patterns tried against the first segment of a line
    public static final String[] lineStartList = new String[]{date1Regex,date5Regex,date7Regex,date8Regex,date9Regex,date10Regex,date12Regex,date16Regex,date13Regex,date17Regex,sbdhdRegex,threadRegex};

    // Date/time patterns
    public static final String[] dateList = new String[]{date1Regex,date2Regex,date3Regex,date4Regex,date5Regex,date6Regex,date7Regex,date8Regex,date9Regex,date10Regex,date11Regex,date12Regex,date13Regex,date14Regex,date15Regex,date16Regex,date17Regex};

    // General-purpose patterns (phone, ID card, e-mail, IP, URL)
    public static final String[] generalList = new String[]{phoneRegex,phoneSpaceRegex,cardRegex,emailRegex,ipRegex,urlRegex};

    // Full candidate list; date17Regex was missing here although dateList contains it
    public static final String[] allList = new String[]{smallRegex,bigRegex,digitalRegex,digitalSoRegex,floatRegex,smallBigRegex,bigDigitalRegex,smallDigitalRegex,smallBigDigitalRegex,
            smallBigSoRegex,sbdhdSoRegex,chinaEnSoRegex,chinaDigitalRegex,bigSpaceRegex,smallBigDigitalSoRegex,smallCommaRegex,bigSoRegex,smallBigSo1Regex,smallZoRegex,
            sbdhRegex,sbdhdRegex,threadRegex,taskRegex,chinaRegex,chinaEnRegex,phoneRegex,phoneSpaceRegex,cardRegex,emailRegex,ipRegex,urlRegex,date1Regex,date2Regex,sbdhzRegex,
            date3Regex,date4Regex,date5Regex,date6Regex,date7Regex,date8Regex,date9Regex,date10Regex,date11Regex,date12Regex,date13Regex,date14Regex,date15Regex,date16Regex,date17Regex};

}
/**
 * Separator choices, each pairing a numeric index with a display name.
 */
public enum SeparatorEnum {
    VERTICAL(0, "竖线'|'"),
    SPACE(1, "空格"),
    COLON(2, "冒号"),
    TAB(3, "Tab键"),
    COMMA(4, "逗号"),
    OTHER(5, "其他");

    // Numeric identifier of the separator choice
    private final int index;

    // Display name of the separator choice
    private final String name;

    SeparatorEnum(int index, String name) {
        this.index = index;
        this.name = name;
    }

    /**
     * @return the numeric identifier of this constant
     */
    public int getIndex() {
        return this.index;
    }

    /**
     * @return the display name of this constant
     */
    public String getName() {
        return this.name;
    }

    /**
     * Looks up the display name for an index.
     *
     * @param index numeric identifier to look up
     * @return the display name, or an empty string when no constant has that index
     */
    public static String getNameByIndex(int index) {
        SeparatorEnum found = getByIndex(index);
        if (found != null) {
            return found.getName();
        }
        return "";
    }

    /**
     * Looks up the constant for an index.
     *
     * @param index numeric identifier to look up
     * @return the first constant with that index, or {@code null} when there is none
     */
    public static SeparatorEnum getByIndex(int index) {
        SeparatorEnum[] candidates = SeparatorEnum.values();
        for (int i = 0; i < candidates.length; i++) {
            if (candidates[i].getIndex() == index) {
                return candidates[i];
            }
        }
        return null;
    }
}

验证方式:

		RegexFactory regexFactory = new RegexFactory();
        // Sample multi-line log entry used to exercise the generator
        String str = "[2023-08-25T10:17:35,203][DEBUG][o.e.a.s.TransportSearchAction] [node-1] All shards failed for phase: [query]\n asada  \n sdasf";
        // Index 5 selects a custom separator, here "]"
        String[] strings = regexFactory.lineRegexFactory(str, 5, "]");

        // lineRegexFactory returns null when no line-start regex matches the first segment,
        // so the result is checked before iterating.
        if (strings == null) {
            System.out.println("No line-start regex matched the sample line");
        } else {
            for (String s : strings) {
                System.out.println(s);
            }
        }

3.结语

以上全部代码,请指正! 拜谢~

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值