写这种算法真的是累啊,我是没辙,只能拿正则了,各位仁兄有何高见?
测试效果:
/**
 * Manual smoke test for {@code TimeWords}: feeds a list of sample phrases to
 * {@code TimeWords.words} and prints each phrase next to the map it returns.
 * Nothing is asserted; the output is meant to be inspected by eye.
 */
public class KeyWordsTest {
    /** Sample phrases: dates, weekdays, clock times, full sentences and a bare digit. */
    String[] input = {
        "2017年9月16号",
        "2017年10月3日",
        "2015年7月",
        "9月17号",
        "4月15",
        "6号",
        "星期3",
        "下礼拜5",
        "本周2",
        "周末",
        "下周星期5",
        "星期天",
        "2015-9-7",
        "9-7",
        "2019.9.8",
        "2019.8",
        "9.7",
        "二零一六年九月十八号",
        "三月七号",
        "十月二十",
        "星期六",
        "下礼拜五,我打算回上海",
        "今天十五号,明天十六号",
        "十月一号国庆节,我们打算回家",
        "下午12:56",
        "9:30",
        "09:31",
        "上午七点",
        "下午5点",
        "十点半",
        "今天晚上九点15",
        "十点一刻",
        "二十三点50",
        "十一点",
        "以下是2017年4月10日入住,五道口附近的快捷酒店信息(与用户意图最相关的5个快捷酒店信息),请问您想预定哪一个?",
        "5" // must not match
    };

    /** Prints one line per sample in the form {@code <phrase>: <result map>}. */
    @Test
    public void testDate() {
        for (String phrase : input) {
            // Emit the label first, then the result, so the label is visible even if matching fails.
            System.out.print(phrase + ": ");
            System.out.println(TimeWords.words(phrase));
        }
    }
}
/**
 * Finds date and time expressions, written with Arabic or Chinese numerals, in free text.
 *
 * <p>Two regular expressions are assembled from small fragments and compiled once, when the
 * class is loaded, into reusable {@link Pattern} instances.
 */
public class TimeWords {
    private static final String L = "(";
    private static final String R = ")";
    private static final String O = "|";

    /** Value stored under every matched expression in the map returned by {@link #words}. */
    private static final String VALUE = "t 1024";

    /** Compiled once; the expression is large, so recompiling it per call is wasteful. */
    private static final Pattern DATE_PATTERN = Pattern.compile(createRegexDate());
    private static final Pattern TIME_PATTERN = Pattern.compile(createRegexTime());

    /**
     * Extracts the date and time expressions contained in a string.
     *
     * @param input text to scan; {@code null} is treated as text without any match
     * @return a new mutable map whose keys are the matched substrings (duplicates collapse into
     *         one key) and whose values are all {@code "t 1024"}; empty when nothing matches
     */
    public static Map<String, String> words(String input) {
        Map<String, String> map = new HashMap<>();
        if (input == null) {
            return map;
        }
        collect(map, DATE_PATTERN, input);
        collect(map, TIME_PATTERN, input);
        return map;
    }

    /**
     * Adds every match of {@code pattern} in {@code input} to {@code map}.
     * Matches go straight into the map, so there is no cap on how many can be found.
     */
    private static void collect(Map<String, String> map, Pattern pattern, String input) {
        Matcher matcher = pattern.matcher(input);
        while (matcher.find()) {
            map.put(matcher.group(), VALUE);
        }
    }

    /** Builds the clock-time expression: optional day period, hour, separator, optional minutes. */
    private static String createRegexTime() {
        // Hour in Arabic digits: 10-19, 20-24, or 0-9 with an optional leading zero.
        String nh = "((1[0-9])|(2[0-4])|((0|)[0-9]))";
        // Optional day-period prefix (the empty alternative makes it optional).
        String duan = "(上午|下午|晚上|凌晨|早上|夜晚|am|pm|)";
        // Hour in Chinese numerals. Character classes list the characters without separators:
        // a comma inside [...] is a literal and would make "," count as a digit.
        String ch = "(((二|)(十|)[零一二三四五六七八九十]))";
        // Separator between hour and minutes.
        String ce = "(点|:)";
        // Optional minute suffix.
        String cme = "(分|)";
        // Minutes: a quarter, a half, or two digits 00-59.
        String t1 = "((一刻)|(半)|([0-5][0-9]))" + cme;
        return duan + orRegex(nh, ch) + ce + orRegex(t1, "");
    }

    /** Builds the date expression: year/month/day forms, weekday forms and '-' / '.' separated forms. */
    private static String createRegexDate() {
        // Character classes list the characters without separators: a comma inside [...] is a
        // literal and would make "," count as a digit.
        final String C09 = "([0-9十一二三四五六七八九])";
        final String C19 = "([1-9一二三四五六七八九])";
        final String C0 = "[0十]";
        final String C1 = "[1一]";
        final String C01 = "[0十1一]";
        final String C02 = "[012十一二]";
        final String C3 = "[3三]";
        final String C12 = "[12一二]";
        final String C17 = "[1-7一二三四五六日]";
        // Year digits also accept the Chinese zeros so that e.g. 二零一六年 is recognised.
        final String YD = "[0-9零〇十一二三四五六七八九]";
        // Year: four characters, the first restricted to 0/1/2 or 十/一/二.
        final String Y = C02 + YD + YD + YD;
        // Month: a single digit, 1 followed by 0-2, or 1-9 with an optional leading zero.
        final String M = orRegex(C09, C1 + C02, orRegex(C0, "") + C19);
        // Day: 1x/2x, 30/31, or 1-9 with an optional leading zero.
        final String D = orRegex(C12 + C09, C3 + C01, orRegex(C0, "") + C19);
        // Weekday forms; grouped as a whole so the '|' cannot bind to neighbouring fragments.
        final String W = "(((周|下周|本周)(" + C17 + "|末))|((下周星期|下礼拜|礼拜|下星期|星期)(" + C17 + "|天)))";
        final String M_END1 = "月";
        final String D_END1 = "(日|号)";
        final String Y_END1 = "年";
        final String E = "(-|\\.)";

        String year = orRegex(Y + Y_END1, "");
        String yearMonth = year + M + M_END1;
        String day = D + D_END1;
        String dayWeek = orRegex(day, "") + W;
        String monthDay = orRegex(orRegex(M + M_END1, "") + day, M + M_END1 + D);
        String yearMonthDay = yearMonth + day;

        // Alternation order matters: the first alternative that matches wins, so the most
        // specific forms are listed first.
        String worded = orRegex(yearMonthDay, monthDay, dayWeek, yearMonth);
        return orRegex(worded, Y + E + M + E + D, Y + E + M, M + E + D);
    }

    /** Returns {@code ((a)|(b)|...)}: each argument in its own group, joined by alternation. */
    private static String orRegex(String... args) {
        StringBuilder regex = new StringBuilder(L);
        for (int i = 0; i < args.length; i++) {
            if (i > 0) {
                regex.append(O);
            }
            regex.append(L).append(args[i]).append(R);
        }
        return regex.append(R).toString();
    }
}
最后构建的regex_date正则居然有2390多个字符,感觉自己的算法好挫啊.