抽取文本中的日期和时间

写这种算法真的是累啊,我是没辙,只能拿正则了,各位仁兄有何高见?

测试效果:
(此处原为测试结果截图,图片未能保留。)

/**
 * Smoke test for {@code TimeWords.words}: feeds a list of sample phrases through the
 * extractor and prints each phrase followed by the map it returns. The test makes no
 * assertions; results are meant to be inspected by eye.
 */
public class KeyWordsTest {

    // Sample phrases: dates, weekdays, dashed/dotted numeric dates, Chinese-numeral dates,
    // clock times, and full sentences with an embedded date.
    String [] input = {
            "2017年9月16号",
            "2017年10月3日",
            "2015年7月",
            "9月17号",
            "4月15",
            "6号",
            "星期3",
            "下礼拜5",
            "本周2",
            "周末",
            "下周星期5",
            "星期天",
            "2015-9-7",
            "9-7",
            "2019.9.8",
            "2019.8",
            "9.7",
            "二零一六年九月十八号",
            "三月七号",
            "十月二十",
            "星期六",
            "下礼拜五,我打算回上海",
            "今天十五号,明天十六号",
            "十月一号国庆节,我们打算回家",
            "下午12:56",
            "9:30",
            "09:31",
            "上午七点",
            "下午5点",
            "十点半",
            "今天晚上九点15",
            "十点一刻",
            "二十三点50",
            "十一点",
            "以下是2017年4月10日入住,五道口附近的快捷酒店信息(与用户意图最相关的5个快捷酒店信息),请问您想预定哪一个?",
            "5" // must not match
    };

    /** Prints "phrase: extractedMap" for every sample phrase, one per line. */
    @Test
    public void testDate() {
        for (String phrase : input) {
            // The prefix is printed before the extractor runs, so it is visible even if extraction fails.
            System.out.print(phrase + ": ");
            System.out.println(TimeWords.words(phrase));
        }
    }

}

/**
 * Extracts date and time expressions, written with Arabic or Chinese numerals, from
 * free text by means of two regular expressions that are assembled and compiled once
 * when the class is loaded.
 */
public class TimeWords {
    private static final String L = "(";
    private static final String R = ")";
    private static final String O = "|";

    /** Value stored under every extracted expression. */
    private static final String TAG = "t 1024";

    // Compiled once: the expressions are large, and recompiling them per call is wasted work.
    private static final Pattern DATE_PATTERN = Pattern.compile(createRegexDate());
    private static final Pattern TIME_PATTERN = Pattern.compile(createRegexTime());

    /**
     * Puts every match of {@code pattern} in {@code input} into {@code out}.
     * Matches go straight into the map, so there is no limit on how many can be found.
     *
     * @param pattern compiled expression to search with
     * @param input   text to scan
     * @param out     map receiving each matched substring as a key
     */
    private static void collect(Pattern pattern, String input, Map<String, String> out) {
        Matcher matcher = pattern.matcher(input);
        while (matcher.find()) {
            out.put(matcher.group(), TAG);
        }
    }

    /**
     * Builds the clock-time expression: an optional day-part word, an hour in Arabic
     * or Chinese numerals, a separator (点 or ':'), and optional minutes.
     *
     * @return the regular expression source
     */
    private static String createRegexTime() {
        String arabicHour = "((1[0-9])|(2[0-4])|((0|)[0-9]))";
        String dayPart = "(上午|下午|晚上|凌晨|早上|夜晚|am|pm|)";
        // No commas inside the class: in a character class a comma is a literal member.
        String chineseHour = "(((二|)(十|)[零一二三四五六七八九十]))";
        String separator = "(点|:)";
        String minuteSuffix = "(分|)";
        String minutes = "((一刻)|(半)|([0-5][0-9]))" + minuteSuffix;
        return dayPart + orRegex(arabicHour, chineseHour) + separator + orRegex(minutes, "");
    }

    /**
     * Builds the date expression covering year/month/day forms with 年/月/日/号,
     * weekday forms (周, 星期, 礼拜), and numeric forms separated by '-' or '.'.
     *
     * @return the regular expression source
     */
    private static String createRegexDate() {
        // Character classes list their members without separators; a comma here would
        // make ',' itself count as a digit. 零 is included so years such as 二零一六 match.
        final String C09 = "([0-9零十一二三四五六七八九])";
        final String C19 = "([1-9一二三四五六七八九])";
        final String C0 = "[0十]";
        final String C1 = "[1一]";
        final String C01 = "[0十1一]";
        final String C02 = "[012十一二]";
        final String C3 = "[3三]";
        final String C12 = "[12一二]";
        final String C17 = "[1-7一二三四五六日]";
        final String Y = C02 + C09 + C09 + C09;
        final String M = orRegex(C09, C1 + C02, orRegex(C0, "") + C19);
        final String D = orRegex(C12 + C09, C3 + C01, orRegex(C0, "") + C19);
        // Wrapped in one group so that a prefix concatenated in front applies to both branches.
        final String W = L + "((周|下周|本周)(" + C17 + "|末))|((下周星期|下礼拜|礼拜|下星期|星期)(" + C17 + "|天))" + R;
        final String M_END1 = "月";
        final String D_END1 = "(日|号)";
        final String Y_END1 = "年";
        final String E = "(-|\\.)";
        String year = orRegex(Y + Y_END1, "");
        String yearMonth = year + M + M_END1;
        String day = D + D_END1;
        String dayWeek = orRegex(day, "") + W;
        String monthDay = orRegex(orRegex(M + M_END1, "") + day, M + M_END1 + D);
        String yearMonthDay = yearMonth + day;
        // Alternatives are tried left to right, so the longer forms come first.
        String worded = orRegex(yearMonthDay, monthDay, dayWeek, yearMonth);
        return orRegex(worded, Y + E + M + E + D, Y + E + M, M + E + D);
    }

    /**
     * Finds all date and time expressions in {@code input}.
     *
     * @param input text to scan; {@code null} is treated as containing no matches
     * @return a new mutable map whose keys are the matched substrings (date matches are
     *         added first, then time matches; repeated substrings collapse into one key)
     *         and whose values are all {@code "t 1024"}
     */
    public static Map<String,String> words(String input){
        Map<String, String> found = new HashMap<>();
        if (input == null) {
            return found;
        }
        collect(DATE_PATTERN, input, found);
        collect(TIME_PATTERN, input, found);
        return found;
    }

    /**
     * Joins the given sub-expressions into one grouped alternation:
     * {@code ((a)|(b)|(c))}.
     *
     * @param args sub-expressions, tried in the order given
     * @return the grouped alternation
     */
    private static String orRegex(String... args) {
        StringBuilder sb = new StringBuilder(L);
        for (int i = 0; i < args.length; i++) {
            if (i > 0) {
                sb.append(O);
            }
            sb.append(L).append(args[i]).append(R);
        }
        return sb.append(R).toString();
    }
}

最后构建的 regex_date 正则居然有 2390 多个字符,感觉自己的算法好挫啊。

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值