背景:朋友的项目中需要对 OCR 识别出的营业执照上的日期进行格式化,并做一些容错处理。如:将 “二〇一五年五月十二日” 转换成 “2015-5-12”。
直接上代码 ↓↓↓
初始化字典
// The Chinese character for "ten". convert() compares each input character
// against it because its contribution depends on its position in the number.
private static final char TEN = '十';
// Lookup table from a single character to its digit value.
// Filled by setupMap() and consulted by convert().
Map<Character, Integer> digitMap = new HashMap<>();
// 初始化字典
/**
 * Fills {@code digitMap} with every character that is accepted as a digit.
 *
 * <p>Besides the Chinese numerals 一..九 this registers the usual spellings of
 * zero (〇 and 零), the ASCII digits 0-9, and a few Latin letters that OCR
 * commonly confuses with 0 or 1. Calling this more than once is harmless: it
 * only overwrites the same keys with the same values.
 */
private void setupMap() {
    // Chinese numerals and the matching ASCII digits, 1 through 9.
    String numerals = "一二三四五六七八九";
    for (int i = 0; i < numerals.length(); i++) {
        int value = i + 1;
        digitMap.put(numerals.charAt(i), value);
        digitMap.put((char) ('1' + i), value);
    }

    // Default value for 十; convert() decides its real contribution from
    // the position it occupies in the number.
    digitMap.put(TEN, 1);

    // Every way a zero may show up: proper characters plus OCR look-alikes.
    digitMap.put('〇', 0);
    digitMap.put('零', 0);
    digitMap.put('0', 0);
    digitMap.put('O', 0);
    digitMap.put('o', 0);

    // Latin letters that OCR tends to produce in place of the digit one.
    digitMap.put('I', 1);
    digitMap.put('l', 1);
    digitMap.put('i', 1);
}
核心方法
// source: 需要转换的年、月、日字符串,如:二O一五
/**
 * Converts one date component written with Chinese numerals (a year, a month
 * or a day, without the 年/月/日 suffix) into its decimal digit string.
 *
 * <p>Every character except 十 is translated one-to-one through
 * {@code digitMap}. The character 十 is positional and is resolved here
 * without touching the shared map:
 * <ul>
 *   <li>alone ("十") it yields "10";</li>
 *   <li>leading ("十一") it yields "1";</li>
 *   <li>trailing ("二十") it yields "0";</li>
 *   <li>in the middle ("三十一") it yields nothing, it only marks the place.</li>
 * </ul>
 *
 * @param source the component to convert, e.g. "二O一五"
 * @return the digit string, e.g. "2015"; an empty string for empty input; or
 *         {@code null} (after reporting on {@code System.err}) when
 *         {@code source} is {@code null} or contains an unknown character
 */
private String convert(String source) {
    if (source == null) {
        System.err.println(MessageFormat.format("{0} is invalid.", source));
        return null;
    }

    char[] chars = source.toCharArray();
    int last = chars.length - 1;
    StringBuilder sb = new StringBuilder(chars.length + 1);

    for (int i = 0; i <= last; i++) {
        char c = chars[i];

        if (c == TEN) {
            // Decide by the real character position, so a skipped 十 can
            // never shift the meaning of the characters that follow it.
            if (last == 0) {
                sb.append("10");
            } else if (i == 0) {
                sb.append('1');
            } else if (i == last) {
                sb.append('0');
            }
            // A 十 between two digits contributes no digit of its own.
            continue;
        }

        Integer digit = digitMap.get(c);
        if (digit == null) {
            System.err.println(MessageFormat.format("{0} is invalid.", c));
            return null;
        }
        sb.append(digit);
    }
    return sb.toString();
}
测试调用
/**
 * Demonstrates the conversion on sample license dates: each sample is split
 * at the 年/月/日 markers, the three parts are converted, and the result is
 * printed as "year-month-day".
 *
 * <p>A sample is reported on {@code System.err} and skipped when a marker is
 * missing, when the markers are out of order, or when any part cannot be
 * converted.
 */
@Test
public void test() {
    setupMap();

    // Run every sample instead of overwriting the first one before use.
    String[] samples = {"二〇一五年五月十二日", "二O一五年十一月三十一日"};

    for (String source : samples) {
        int yearIndex = source.indexOf("年");
        int monthIndex = source.indexOf("月");
        int dayIndex = source.indexOf("日");

        // All three markers must exist and appear in year, month, day order;
        // otherwise the substring calls below would throw.
        boolean markersPresent = yearIndex != -1 && monthIndex != -1 && dayIndex != -1;
        if (!markersPresent || yearIndex > monthIndex || monthIndex > dayIndex) {
            System.err.println(MessageFormat.format("{0} is invalid.", source));
            continue;
        }

        String year = convert(source.substring(0, yearIndex));
        String month = convert(source.substring(yearIndex + 1, monthIndex));
        String day = convert(source.substring(monthIndex + 1, dayIndex));

        // convert() returns null for unknown characters; do not print "null".
        if (year == null || month == null || day == null) {
            System.err.println(MessageFormat.format("{0} is invalid.", source));
            continue;
        }

        String result = MessageFormat.format("{0}-{1}-{2}", year, month, day);
        System.out.println(MessageFormat.format("result: {0}", result));
    }
}