内容抓取匹配例子-中文数字转数字

package test;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ContentChecker {
	public enum CheckRule{
		$检查地区,
		$检查房型,
		$检查价格,
		$检查户型,
		$检查面积,
		$检查电话;
	}
	
	private static Map<String,String> _$1_rule_map = new HashMap<String,String>();
	private static Map<String,String> _$2_rule_map = new HashMap<String,String>();
	private static Map<String,String> _$3_rule_map = new HashMap<String,String>();
	private static Map<String,String> _$4_rule_map = new HashMap<String,String>();
	private static Map<String,String> _$5_rule_map = new HashMap<String,String>();
	private static Map<String,String> _$6_rule_map = new HashMap<String,String>();
	
	static{
		_$1_rule_map.put("亞凱迪亞", "Arcadia|亚市|亞凱|阿凱迪亞|亞凱迪亞|亞凱迪");
		_$1_rule_map.put("阿罕布拉市", "Alhambra|阿罕布拉|阿市|阿罕布拉市");
		_$1_rule_map.put("亞凱迪亞","亞凱迪亞|Arcadia|亚市|亞凱|阿凱迪亞|亞凱迪亞|亞凱迪");
		_$1_rule_map.put("聖蓋博","聖蓋博|San|Gabriel|聖市");
		_$1_rule_map.put("天普市","天普市|Temple|city|天普");
		_$1_rule_map.put("蒙羅維亞","蒙羅維亞|Monrovia");
		_$1_rule_map.put("蒙特利公園","蒙特利|Monterey|Park|蒙市");
		_$1_rule_map.put("阿罕布拉市","阿罕布拉|Alhambra|阿市|阿罕布拉市");
		_$1_rule_map.put("蒙地貝婁","蒙地貝婁|Montebello");
		_$1_rule_map.put("聖瑪利諾","聖瑪利諾|san|marino|聖市");
		_$1_rule_map.put("帕莎迪那","帕莎迪那|Pasadena|帕莎迪娜|帕市");
		_$1_rule_map.put("格蘭岱","格蘭岱|Glendale");
		_$1_rule_map.put("拉朋地","拉朋地|La Puente");
		_$1_rule_map.put("鮑溫公園","鮑溫公園|Baldwin|Park");
		_$1_rule_map.put("艾爾蒙地","艾爾蒙地|El|Monte");
		_$1_rule_map.put("柔似蜜","柔似蜜|Rosemead|柔市|柔似蜜");
		_$1_rule_map.put("奇諾崗","奇諾崗|chino|hills");
		_$1_rule_map.put("鑽石吧","鑽石吧|diamond|bar|corona");
		_$1_rule_map.put("工業市","工業市|City|of|Industry");
		_$1_rule_map.put("核桃市","核桃市|walnut");
		_$1_rule_map.put("哈崗","哈崗|Hacienda|Height");
		_$1_rule_map.put("羅蘭崗","羅蘭崗|Rowland|Heights");
		_$1_rule_map.put("西柯汶納","西柯汶納|west|covina");
		_$1_rule_map.put("波莫那","波莫那|Pomona");
		_$1_rule_map.put("聖迪瑪斯","聖迪瑪斯|San|Dimas");
		_$1_rule_map.put("克萊蒙","克萊蒙|Claremont");
		_$1_rule_map.put("厄浦蘭","厄浦蘭|upland");
		_$1_rule_map.put("赌城地产","赌城地产|Las|Vages");

		
		
		_$2_rule_map.put("康斗","康斗|Condo");
		_$2_rule_map.put("屋","独立屋|豪宅|House");
		_$2_rule_map.put("办公","办公室|Office");
		_$2_rule_map.put("仓库","仓库|Warehouse");
		_$2_rule_map.put("公寓","公寓|亚寓|Apartment");
		_$2_rule_map.put("移动","移动房屋|Mobile|Home");
		_$2_rule_map.put("旅馆","汽车旅馆|旅馆|Motel");
		_$2_rule_map.put("旺铺","旺铺|商铺|店|Store");
		_$2_rule_map.put("洗车","洗车行|Car|Wash");
		_$2_rule_map.put("加油","加油站|Gas|Station");
		_$2_rule_map.put("PUD","PUD|PUD");
		_$2_rule_map.put("土地","土地|Land");
		_$2_rule_map.put("其他","其他|Other");
		

		_$3_rule_map.put("价格:%s", "((售\\s*价|售|月\\s*租\\s*金|租\\s*金|月\\s*租|租|\\$)?\\s*\\d+\\s*(每\\s*月|月|\\/月|元|块)+)|((售\\s*价|售|月\\s*租\\s*金|租\\s*金|月\\s*租|租|\\$)+\\s*\\d+\\s*(每\\s*月|月|\\/月|元|块)?)");
		
		
		_$4_rule_map.put("户型:%s", "(\\d+\\s*(室|厅|厨|卫))+");
		
		_$5_rule_map.put("面积:%s", "((地大|佔地|占地|近)?\\s*\\d+\\s*(余尺|尺|呎)+\\d*)");
		
		_$6_rule_map.put("电话:%s", "(\\d+-\\d+-\\d+)");
	}
	
	public static String getPropValueFromContent(CheckRule proprule, String content){
		
		String result = null;
		
		switch (proprule) {
		case $检查地区:
			result = matchInThis(_$1_rule_map,content);
			break;
		case $检查房型:
			result = findFirstInThis(_$2_rule_map,content);
			break;

		case $检查价格:
			result = findFirstInThis(_$3_rule_map,content);
			break;
		case $检查户型:
			result = findFirstInThis(_$4_rule_map,content);
			break;
		case $检查面积:
			result = findFirstInThis(_$5_rule_map,content);
			break;
		case $检查电话:
			result = findFirstInThis(_$6_rule_map,content);
			break;

		default:
			break;
		}
		return result;
	}
	

	public static void main(String[] args) {

		
		String testContent = "###  Vages 阿市 sadfsadaew 收到罚单 Condo 罚单撒旦法撒发是 速度 #### \r\n"
				+ "#### 租八万八#### \r\n"
				+ "#### 三尺三#地大八萬#地大八万三千#近5千尺## \r\n"
				+ "#### 租八百#### \r\n"
				+ "#### 两室一厅一厨一卫月租两百块#### \r\n"
				+ "#### 售    十亿壹 仟贰   佰伍 拾叁 万陆 仟 柒 佰 捌 拾 玖 元 整  #### \r\n"
				+ "#### 售 价 三千五百万### \r\n"
				+ "#### 电话 888-222-11111### \r\n"
				+ "#### 壹仟零贰元整#### \r\n"
				+ "#### $三十四万零二百每月#### \r\n"
				+ "#### $5010万/月##### \r\n";
		
		String newContent = ContentConvertUtil.ReplaceCNNumToInt(testContent);
		System.out.println(newContent);
		
		

		String find = getPropValueFromContent(CheckRule.$检查地区, newContent);
		System.out.println(String.format("$检查地区:[%s]", find));
		
		find = getPropValueFromContent(CheckRule.$检查房型, newContent);
		System.out.println(String.format("$检查房型:[%s]", find));
		
		
		find = getPropValueFromContent(CheckRule.$检查价格, newContent);
		System.out.println(String.format("$检查价格:[%s]", find));
		
		find = getPropValueFromContent(CheckRule.$检查户型, newContent);
		System.out.println(String.format("$检查户型:[%s]", find));
		
		
		find = getPropValueFromContent(CheckRule.$检查面积, newContent);
		System.out.println(String.format("$检查面积:[%s]", find));
		
		find = getPropValueFromContent(CheckRule.$检查电话, newContent);
		System.out.println(String.format("$检查电话:[%s]", find));
		
		
		
		
		
		
	}

	private static String findFirstInThis(Map<String, String> ruleMap, String content) {
		
		String result = null;
		
		Set<Entry<String, String>> entrySet = ruleMap.entrySet();
		for (Entry<String, String> kvp : entrySet) {
			String regex = String.format("%s", kvp.getValue());
			Pattern pattern = Pattern.compile(regex);
			Matcher matcher = pattern.matcher(content);
			
			String allStr = "";
			
			while(matcher.find()){
				allStr = allStr+"|"+matcher.group();
			}
			
			
			if(!"".equals(allStr)){
				
				result = String.format(kvp.getKey(),allStr);
				break;
			}
			
			
//			if(matcher.find()){
//				result = String.format(kvp.getKey(),matcher.group());
//				break;
//			}
		}
		return result;
	}

	private static String matchInThis(Map<String, String> ruleMap, String content) {
		
		String result = null;
		
		Set<Entry<String, String>> entrySet = ruleMap.entrySet();
		for (Entry<String, String> kvp : entrySet) {
			String regex = String.format("(\\s|\\S)*(%s)+(\\s|\\S)*", kvp.getValue());
			//System.out.println(String.format(">>>>>>>[%s]【%s】", regex,content));
			Pattern pattern = Pattern.compile(regex);
			Matcher matcher = pattern.matcher(content);
			if(matcher.matches()){
				result = kvp.getKey();
				break;
			}
		}
		return result;
	}
	
	
	

}

/**
 * Converts numbers written with Chinese numerals (optionally mixed with ASCII digits
 * and whitespace) into plain decimal integers.
 */
class ContentConvertUtil{
	
	/**
	 * One numeral token: a single Chinese numeral character (group 3) or a run of ASCII
	 * digits (group 4), followed by optional whitespace and any mix of unit characters
	 * and whitespace (group 5).
	 */
	private static final String CN_NUM_REGEX = "(((零|壹|贰|叁|肆|伍|陆|柒|捌|玖|拾|〇|一|二|三|四|五|六|七|八|九|十|两)|(\\d+))\\s*((十|拾|百|佰|千|仟|万|萬|亿|\\s)*))";
	/** Matches a single numeral token. */
	private static final Pattern TOKEN_PATTERN = Pattern.compile(CN_NUM_REGEX);
	/** Matches a maximal run of consecutive numeral tokens, i.e. one whole number. */
	private static final Pattern NUMBER_PATTERN = Pattern.compile(CN_NUM_REGEX+"+");
	
	/** Chinese numeral character -> value. */
	private static final Map<String,Integer> numMap = new HashMap<String,Integer>();
	/** Unit string (whitespace removed) -> multiplier. */
	private static final Map<String,Integer> unitMap = new HashMap<String,Integer>();
	
	static{
		numMap.put("零", 0);
		numMap.put("壹", 1);
		numMap.put("贰", 2);
		numMap.put("叁", 3);
		numMap.put("肆", 4);
		numMap.put("伍", 5);
		numMap.put("陆", 6);
		numMap.put("柒", 7);
		numMap.put("捌", 8);
		numMap.put("玖", 9);
		numMap.put("拾",10);
		numMap.put("〇", 0);
		numMap.put("一", 1);
		numMap.put("二", 2);
		numMap.put("三", 3);
		numMap.put("四", 4);
		numMap.put("五", 5);
		numMap.put("六", 6);
		numMap.put("七", 7);
		numMap.put("八", 8);
		numMap.put("九", 9);
		numMap.put("十", 10);
		numMap.put("两", 2);
	}
	
	static{
		unitMap.put("十",   10);
		unitMap.put("百",   100);
		unitMap.put("千",   1000);
		unitMap.put("万",   10000);
		unitMap.put("十万", 100000);
		unitMap.put("百万", 1000000);
		unitMap.put("千万", 10000000);
		unitMap.put("拾万", 100000);
		unitMap.put("佰万", 1000000);
		unitMap.put("仟万", 10000000);
		unitMap.put("拾",   10);
		unitMap.put("佰",   100);
		unitMap.put("仟",   1000);
		unitMap.put("萬",   10000);
		unitMap.put("拾萬", 100000);
		unitMap.put("佰萬", 1000000);
		unitMap.put("仟萬", 10000000);
		unitMap.put("十萬", 100000);
		unitMap.put("百萬", 1000000);
		unitMap.put("千萬", 10000000);
		unitMap.put("亿",   100000000);
		unitMap.put("拾亿", 1000000000);
		unitMap.put("十亿", 1000000000);
	}
	
	/**
	 * Replaces every number in the text that is written with Chinese numerals and/or
	 * digits by its decimal value.
	 *
	 * <p>Example: {@code 十亿壹 仟贰   佰伍 拾叁 万陆 仟 柒 佰 捌 拾 玖 元 整} becomes
	 * {@code 1012536789元 整}. Whitespace inside and directly after a number is part of
	 * the match and is therefore dropped. A number whose value is zero is replaced by
	 * {@code "*"}. A number that does not fit in an {@code int} (for example a long
	 * phone number) is left exactly as it appears in the input.
	 *
	 * @param chinaContent text to convert; may be {@code null}
	 * @return the converted text, or {@code null} if the input was {@code null}
	 */
	public static String ReplaceCNNumToInt(String chinaContent) {
		if (chinaContent == null) {
			return null;
		}
		Matcher matcher = NUMBER_PATTERN.matcher(chinaContent);
		StringBuffer sb = new StringBuffer();
		while(matcher.find()){
			String matched = matcher.group();
			String replaceStr;
			try {
				Integer value = CNNumToInt(matched);
				replaceStr = value.intValue() == 0 ? "*" : value.toString();
			} catch (NumberFormatException outOfRange) {
				// Not representable as an int: keep the original text instead of
				// aborting the conversion of the whole content.
				replaceStr = matched;
			}
			matcher.appendReplacement(sb, Matcher.quoteReplacement(replaceStr));
		}
		matcher.appendTail(sb);
		return sb.toString();
	}
	
	/**
	 * Converts one number written with Chinese numerals and/or digits to an integer.
	 *
	 * <p>Supports non-negative integers up to {@link Integer#MAX_VALUE}. A trailing
	 * numeral without a unit takes one tenth of the preceding unit (八万八 = 88000),
	 * except after a zero placeholder, where it counts as ones (壹仟零贰 = 1002).
	 * Unit strings that are not in the unit table are treated as a multiplier of 1.
	 *
	 * @param chinaInt the number text; characters that are not part of a numeral
	 *                 token are skipped
	 * @return the value; 0 when the text is {@code null} or contains no numeral
	 * @throws NumberFormatException if a digit run or the total value exceeds
	 *                               {@link Integer#MAX_VALUE}
	 */
	public static Integer CNNumToInt(String chinaInt) {
		if (chinaInt == null) {
			return 0;
		}
		Matcher matcher = TOKEN_PATTERN.matcher(chinaInt);
		
		long total = 0;     // sum of the sections already closed by a 万/亿-level unit
		long section = 0;   // value collected since the last 万/亿-level unit
		int prevUnit = 1;
		while(matcher.find()){
			String digits = matcher.group(4);
			long numVal;
			if(digits != null){
				numVal = Integer.parseInt(digits);
			}else{
				Integer mapped = numMap.get(matcher.group(3));
				numVal = mapped==null?0:mapped.intValue();
			}
			
			String numUnitStr = matcher.group(5).replaceAll("\\s", "");
			int numUnit;
			if(numUnitStr.length()==0){
				if(numVal==0){
					// A bare zero is a placeholder: whatever follows it counts as ones.
					numUnit = 1;
				}else{
					numUnit = prevUnit>1?prevUnit/10:1;
				}
			}else{
				Integer mapped = unitMap.get(numUnitStr);
				numUnit = mapped==null?1:mapped.intValue();
			}
			prevUnit = numUnit;
			
			long term = numVal*numUnit;
			if(numUnit>=100000000){
				total += section*100000000L + term;
				section = 0;
			}else if(numUnit>=10000){
				total += section*10000L + term;
				section = 0;
			}else{
				section += term;
			}
			// Checking after every token keeps both accumulators small enough that
			// the long arithmetic above can never overflow.
			if(total>Integer.MAX_VALUE || section>Integer.MAX_VALUE){
				throw new NumberFormatException("Value out of int range: \""+chinaInt+"\"");
			}
		}
		long result = total + section;
		if(result>Integer.MAX_VALUE){
			throw new NumberFormatException("Value out of int range: \""+chinaInt+"\"");
		}
		return (int) result;
	}
}

 

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值