package test;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ContentChecker {
public enum CheckRule{
$检查地区,
$检查房型,
$检查价格,
$检查户型,
$检查面积,
$检查电话;
}
private static Map<String,String> _$1_rule_map = new HashMap<String,String>();
private static Map<String,String> _$2_rule_map = new HashMap<String,String>();
private static Map<String,String> _$3_rule_map = new HashMap<String,String>();
private static Map<String,String> _$4_rule_map = new HashMap<String,String>();
private static Map<String,String> _$5_rule_map = new HashMap<String,String>();
private static Map<String,String> _$6_rule_map = new HashMap<String,String>();
static{
_$1_rule_map.put("亞凱迪亞", "Arcadia|亚市|亞凱|阿凱迪亞|亞凱迪亞|亞凱迪");
_$1_rule_map.put("阿罕布拉市", "Alhambra|阿罕布拉|阿市|阿罕布拉市");
_$1_rule_map.put("亞凱迪亞","亞凱迪亞|Arcadia|亚市|亞凱|阿凱迪亞|亞凱迪亞|亞凱迪");
_$1_rule_map.put("聖蓋博","聖蓋博|San|Gabriel|聖市");
_$1_rule_map.put("天普市","天普市|Temple|city|天普");
_$1_rule_map.put("蒙羅維亞","蒙羅維亞|Monrovia");
_$1_rule_map.put("蒙特利公園","蒙特利|Monterey|Park|蒙市");
_$1_rule_map.put("阿罕布拉市","阿罕布拉|Alhambra|阿市|阿罕布拉市");
_$1_rule_map.put("蒙地貝婁","蒙地貝婁|Montebello");
_$1_rule_map.put("聖瑪利諾","聖瑪利諾|san|marino|聖市");
_$1_rule_map.put("帕莎迪那","帕莎迪那|Pasadena|帕莎迪娜|帕市");
_$1_rule_map.put("格蘭岱","格蘭岱|Glendale");
_$1_rule_map.put("拉朋地","拉朋地|La Puente");
_$1_rule_map.put("鮑溫公園","鮑溫公園|Baldwin|Park");
_$1_rule_map.put("艾爾蒙地","艾爾蒙地|El|Monte");
_$1_rule_map.put("柔似蜜","柔似蜜|Rosemead|柔市|柔似蜜");
_$1_rule_map.put("奇諾崗","奇諾崗|chino|hills");
_$1_rule_map.put("鑽石吧","鑽石吧|diamond|bar|corona");
_$1_rule_map.put("工業市","工業市|City|of|Industry");
_$1_rule_map.put("核桃市","核桃市|walnut");
_$1_rule_map.put("哈崗","哈崗|Hacienda|Height");
_$1_rule_map.put("羅蘭崗","羅蘭崗|Rowland|Heights");
_$1_rule_map.put("西柯汶納","西柯汶納|west|covina");
_$1_rule_map.put("波莫那","波莫那|Pomona");
_$1_rule_map.put("聖迪瑪斯","聖迪瑪斯|San|Dimas");
_$1_rule_map.put("克萊蒙","克萊蒙|Claremont");
_$1_rule_map.put("厄浦蘭","厄浦蘭|upland");
_$1_rule_map.put("赌城地产","赌城地产|Las|Vages");
_$2_rule_map.put("康斗","康斗|Condo");
_$2_rule_map.put("屋","独立屋|豪宅|House");
_$2_rule_map.put("办公","办公室|Office");
_$2_rule_map.put("仓库","仓库|Warehouse");
_$2_rule_map.put("公寓","公寓|亚寓|Apartment");
_$2_rule_map.put("移动","移动房屋|Mobile|Home");
_$2_rule_map.put("旅馆","汽车旅馆|旅馆|Motel");
_$2_rule_map.put("旺铺","旺铺|商铺|店|Store");
_$2_rule_map.put("洗车","洗车行|Car|Wash");
_$2_rule_map.put("加油","加油站|Gas|Station");
_$2_rule_map.put("PUD","PUD|PUD");
_$2_rule_map.put("土地","土地|Land");
_$2_rule_map.put("其他","其他|Other");
_$3_rule_map.put("价格:%s", "((售\\s*价|售|月\\s*租\\s*金|租\\s*金|月\\s*租|租|\\$)?\\s*\\d+\\s*(每\\s*月|月|\\/月|元|块)+)|((售\\s*价|售|月\\s*租\\s*金|租\\s*金|月\\s*租|租|\\$)+\\s*\\d+\\s*(每\\s*月|月|\\/月|元|块)?)");
_$4_rule_map.put("户型:%s", "(\\d+\\s*(室|厅|厨|卫))+");
_$5_rule_map.put("面积:%s", "((地大|佔地|占地|近)?\\s*\\d+\\s*(余尺|尺|呎)+\\d*)");
_$6_rule_map.put("电话:%s", "(\\d+-\\d+-\\d+)");
}
public static String getPropValueFromContent(CheckRule proprule, String content){
String result = null;
switch (proprule) {
case $检查地区:
result = matchInThis(_$1_rule_map,content);
break;
case $检查房型:
result = findFirstInThis(_$2_rule_map,content);
break;
case $检查价格:
result = findFirstInThis(_$3_rule_map,content);
break;
case $检查户型:
result = findFirstInThis(_$4_rule_map,content);
break;
case $检查面积:
result = findFirstInThis(_$5_rule_map,content);
break;
case $检查电话:
result = findFirstInThis(_$6_rule_map,content);
break;
default:
break;
}
return result;
}
public static void main(String[] args) {
String testContent = "### Vages 阿市 sadfsadaew 收到罚单 Condo 罚单撒旦法撒发是 速度 #### \r\n"
+ "#### 租八万八#### \r\n"
+ "#### 三尺三#地大八萬#地大八万三千#近5千尺## \r\n"
+ "#### 租八百#### \r\n"
+ "#### 两室一厅一厨一卫月租两百块#### \r\n"
+ "#### 售 十亿壹 仟贰 佰伍 拾叁 万陆 仟 柒 佰 捌 拾 玖 元 整 #### \r\n"
+ "#### 售 价 三千五百万### \r\n"
+ "#### 电话 888-222-11111### \r\n"
+ "#### 壹仟零贰元整#### \r\n"
+ "#### $三十四万零二百每月#### \r\n"
+ "#### $5010万/月##### \r\n";
String newContent = ContentConvertUtil.ReplaceCNNumToInt(testContent);
System.out.println(newContent);
String find = getPropValueFromContent(CheckRule.$检查地区, newContent);
System.out.println(String.format("$检查地区:[%s]", find));
find = getPropValueFromContent(CheckRule.$检查房型, newContent);
System.out.println(String.format("$检查房型:[%s]", find));
find = getPropValueFromContent(CheckRule.$检查价格, newContent);
System.out.println(String.format("$检查价格:[%s]", find));
find = getPropValueFromContent(CheckRule.$检查户型, newContent);
System.out.println(String.format("$检查户型:[%s]", find));
find = getPropValueFromContent(CheckRule.$检查面积, newContent);
System.out.println(String.format("$检查面积:[%s]", find));
find = getPropValueFromContent(CheckRule.$检查电话, newContent);
System.out.println(String.format("$检查电话:[%s]", find));
}
private static String findFirstInThis(Map<String, String> ruleMap, String content) {
String result = null;
Set<Entry<String, String>> entrySet = ruleMap.entrySet();
for (Entry<String, String> kvp : entrySet) {
String regex = String.format("%s", kvp.getValue());
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(content);
String allStr = "";
while(matcher.find()){
allStr = allStr+"|"+matcher.group();
}
if(!"".equals(allStr)){
result = String.format(kvp.getKey(),allStr);
break;
}
// if(matcher.find()){
// result = String.format(kvp.getKey(),matcher.group());
// break;
// }
}
return result;
}
private static String matchInThis(Map<String, String> ruleMap, String content) {
String result = null;
Set<Entry<String, String>> entrySet = ruleMap.entrySet();
for (Entry<String, String> kvp : entrySet) {
String regex = String.format("(\\s|\\S)*(%s)+(\\s|\\S)*", kvp.getValue());
//System.out.println(String.format(">>>>>>>[%s]【%s】", regex,content));
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(content);
if(matcher.matches()){
result = kvp.getKey();
break;
}
}
return result;
}
}
/**
 * Converts Chinese (or mixed Chinese/ASCII-digit) numerals into plain
 * decimal integers.
 */
class ContentConvertUtil{

    /**
     * One numeral token: a single Chinese numeral character or a run of ASCII
     * digits (group 2), optional whitespace, then any mix of unit characters
     * and whitespace (group 5).
     */
    private static final String CNNumRegex = "(((零|壹|贰|叁|肆|伍|陆|柒|捌|玖|拾|〇|一|二|三|四|五|六|七|八|九|十|两)|(\\d+))\\s*((十|拾|百|佰|千|仟|万|萬|亿|\\s)*))";
    /** A single numeral token. */
    private static final Pattern TOKEN_PATTERN = Pattern.compile(CNNumRegex);
    /** One or more consecutive numeral tokens, i.e. one whole number. */
    private static final Pattern RUN_PATTERN = Pattern.compile(CNNumRegex+"+");
    /** Text made only of ASCII digits and whitespace. */
    private static final Pattern PLAIN_DIGITS_PATTERN = Pattern.compile("[\\d\\s]+");
    private static final Pattern DIGITS_PATTERN = Pattern.compile("\\d+");
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");

    /** Numeral character -> value. */
    private static final Map<String,Integer> numMap = new HashMap<String,Integer>();
    /** Unit string (whitespace removed) -> multiplier. */
    private static final Map<String,Integer> unitMap = new HashMap<String,Integer>();

    static{
        numMap.put("零", 0);
        numMap.put("壹", 1);
        numMap.put("贰", 2);
        numMap.put("叁", 3);
        numMap.put("肆", 4);
        numMap.put("伍", 5);
        numMap.put("陆", 6);
        numMap.put("柒", 7);
        numMap.put("捌", 8);
        numMap.put("玖", 9);
        numMap.put("拾",10);
        numMap.put("〇", 0);
        numMap.put("一", 1);
        numMap.put("二", 2);
        numMap.put("三", 3);
        numMap.put("四", 4);
        numMap.put("五", 5);
        numMap.put("六", 6);
        numMap.put("七", 7);
        numMap.put("八", 8);
        numMap.put("九", 9);
        numMap.put("十", 10);
        numMap.put("两", 2);
    }

    static{
        unitMap.put("十", 10);
        unitMap.put("百", 100);
        unitMap.put("千", 1000);
        unitMap.put("万", 10000);
        unitMap.put("十万", 100000);
        unitMap.put("百万", 1000000);
        unitMap.put("千万", 10000000);
        unitMap.put("拾万", 100000);
        unitMap.put("佰万", 1000000);
        unitMap.put("仟万", 10000000);
        unitMap.put("拾", 10);
        unitMap.put("佰", 100);
        unitMap.put("仟", 1000);
        unitMap.put("萬", 10000);
        unitMap.put("拾萬", 100000);
        unitMap.put("佰萬", 1000000);
        unitMap.put("仟萬", 10000000);
        unitMap.put("十萬", 100000);
        unitMap.put("百萬", 1000000);
        unitMap.put("千萬", 10000000);
        unitMap.put("亿", 100000000);
        unitMap.put("拾亿", 1000000000);
        unitMap.put("十亿", 1000000000);
    }

    /**
     * Replaces every Chinese or mixed numeral in the text with plain digits.
     * Intended for non-negative integers that fit in an {@code int}.
     * Example input: 十亿壹 仟贰 佰伍 拾叁 万陆 仟 柒 佰 捌 拾 玖 元 整
     *
     * <ul>
     * <li>A numeral that converts to zero is replaced by {@code "*"}.</li>
     * <li>Whitespace inside or directly after a converted numeral is part of
     *     the match and is therefore dropped.</li>
     * <li>Runs made only of ASCII digits and whitespace (phone numbers,
     *     years, ...) are already plain digits and are left untouched.</li>
     * <li>Numerals that do not fit in an {@code int} are left untouched.</li>
     * </ul>
     *
     * @param chinaContent text to normalise; may be {@code null}
     * @return the normalised text, or {@code null} if the input was {@code null}
     */
    public static String ReplaceCNNumToInt(String chinaContent) {
        if (chinaContent == null) {
            return null;
        }
        Matcher matcher = RUN_PATTERN.matcher(chinaContent);
        StringBuffer sb = new StringBuffer();
        while (matcher.find()) {
            String matched = matcher.group();
            String replaceStr = matched;
            // Only rewrite runs that actually contain a Chinese numeral or
            // unit; converting plain digit runs would sum space-separated
            // numbers together and strip leading zeros.
            if (!PLAIN_DIGITS_PATTERN.matcher(matched).matches()) {
                Integer value = convertOrNull(matched);
                if (value != null) {
                    replaceStr = value.intValue() == 0 ? "*" : value.toString();
                }
            }
            // quoteReplacement keeps the text literal ('$' and '\' are
            // special in replacement strings).
            matcher.appendReplacement(sb, Matcher.quoteReplacement(replaceStr));
        }
        matcher.appendTail(sb);
        return sb.toString();
    }

    /**
     * Converts one Chinese or mixed numeral to an integer. Intended for
     * non-negative integers that fit in an {@code int}.
     *
     * <p>A trailing digit without a unit takes one tenth of the preceding
     * unit (八万八 is 88000), except directly after a unit-less zero, where
     * it counts as ones (壹仟零贰 is 1002).
     *
     * @param chinaInt the numeral text; text containing no numeral yields 0
     * @return the numeric value
     * @throws NumberFormatException if the value does not fit in an {@code int}
     */
    public static Integer CNNumToInt(String chinaInt) {
        Integer value = convertOrNull(chinaInt);
        if (value == null) {
            throw new NumberFormatException("Value out of int range: \"" + chinaInt + "\"");
        }
        return value;
    }

    /**
     * Core conversion shared by the public methods.
     *
     * @return the value, or {@code null} when it does not fit in an {@code int}
     */
    private static Integer convertOrNull(String chinaInt) {
        Matcher matcher = TOKEN_PATTERN.matcher(chinaInt);
        long total = 0;     // sum of the completed 万 / 亿 sections
        long section = 0;   // value accumulated below the next 万 / 亿 unit
        int prevUnit = 1;
        while (matcher.find()) {
            String numValStr = matcher.group(2);
            String numUnitStr = WHITESPACE_PATTERN.matcher(matcher.group(5)).replaceAll("");

            long numVal;
            if (isNum(numValStr)) {
                numVal = parseDigits(numValStr);
                if (numVal < 0) {
                    return null;
                }
            } else {
                Integer mapped = numMap.get(numValStr);
                numVal = mapped == null ? 0 : mapped.intValue();
            }

            int numUnit;
            if ("".equals(numUnitStr)) {
                if (numVal == 0) {
                    // A bare zero is a placeholder: whatever follows it
                    // without a unit counts as ones.
                    numUnit = 1;
                } else {
                    // Colloquial shorthand: the implied unit is one tenth of
                    // the previous unit.
                    numUnit = prevUnit > 1 ? prevUnit / 10 : 1;
                }
            } else {
                Integer mapped = unitMap.get(numUnitStr);
                numUnit = mapped == null ? 1 : mapped.intValue();
            }
            prevUnit = numUnit;

            // numVal <= Integer.MAX_VALUE and numUnit <= 10^9, so this
            // product cannot overflow a long.
            long term = numVal * numUnit;
            if (numUnit >= 100000000) {
                long closed = section * 100000000L + term;
                if (closed > Integer.MAX_VALUE) {
                    return null;
                }
                total += closed;
                section = 0;
            } else if (numUnit >= 10000) {
                long closed = section * 10000L + term;
                if (closed > Integer.MAX_VALUE) {
                    return null;
                }
                total += closed;
                section = 0;
            } else {
                section += term;
                if (section > Integer.MAX_VALUE) {
                    return null;
                }
            }
            if (total > Integer.MAX_VALUE) {
                return null;
            }
        }
        total += section;
        if (total > Integer.MAX_VALUE) {
            return null;
        }
        return Integer.valueOf((int) total);
    }

    /**
     * Parses a run of ASCII digits.
     *
     * @return the value, or -1 when it exceeds {@link Integer#MAX_VALUE}
     */
    private static long parseDigits(String digits) {
        long value = 0;
        for (int i = 0; i < digits.length(); i++) {
            value = value * 10 + (digits.charAt(i) - '0');
            if (value > Integer.MAX_VALUE) {
                return -1;
            }
        }
        return value;
    }

    /** Returns whether {@code target}, ignoring whitespace, consists only of ASCII digits. */
    private static boolean isNum(String target){
        String compact = WHITESPACE_PATTERN.matcher(target).replaceAll("");
        return DIGITS_PATTERN.matcher(compact).matches();
    }
}