之前乱码问题总是反复出现,老大建议根据ASCII码表及扩展ASCII码表来进行过滤。
ASCII码表及其扩展表:
ASCII码表中的可显示字符(编码32–126)都是需要保留的,在这个范围之外的字符就可以过滤掉。但是法语和西班牙语的某些字符位于扩展表中(编码192–255),所以分为两种情况:英语只保留32–126;法语和西班牙语则同时保留32–126和192–255。
/**
 * Filters special symbols out of a description or road name.
 *
 * <p>The letter case of {@code content} is never changed; convert it to the desired case
 * before calling this method. Whatever path is taken, runs of whitespace in the result are
 * collapsed to a single space and leading/trailing whitespace is removed.
 *
 * @param content   description or road name; {@code null} or empty yields {@code ""}
 * @param isDesc    {@code true} for a description, {@code false} for a road name. A road name
 *                  made up only of digits, ASCII letters, whitespace, {@code -}, {@code '} and
 *                  {@code .} skips the per-character filter.
 * @param isEnglish {@code true} for English, {@code false} for French or Spanish
 * @return the content without special symbols, whitespace-normalized
 */
public static String filterSpecialSymbols(String content, boolean isDesc, boolean isEnglish) {
    if (content == null || content.isEmpty()) {
        return "";
    }
    // Fast path: a plain road name has nothing for the character filter to remove.
    boolean plainRoadName = !isDesc && content.matches("(?:\\d|[a-zA-Z]|\\s|-|'|\\.)+");
    String filtered = plainRoadName ? content : mainCodeOfFilterSymbols(content, isEnglish);
    // Normalize on every path: the fast-path pattern accepts tabs, line breaks and repeated
    // spaces via \s, so returning the raw input would let them through.
    return filtered.replaceAll("\\s+", " ").trim();
}
/**
 * Core logic behind {@code filterSpecialSymbols(String content, boolean isDesc, boolean isEnglish)}.
 * Prefer calling that method rather than this one directly.
 *
 * <p>Every UTF-16 code unit is checked against the allowed ranges: printable ASCII (32-126)
 * always, plus the extended range 192-255 when {@code isEnglish} is {@code false}. Any other
 * unit, including control characters and both halves of a surrogate pair, is replaced by a
 * space. If at least one unit was replaced, runs of whitespace are collapsed to a single space
 * and the result is trimmed; if nothing was replaced, {@code content} is returned unchanged.
 *
 * @param content   description or road name; {@code null} yields {@code ""}
 * @param isEnglish {@code true} for English, {@code false} for French or Spanish
 * @return the content without special symbols
 */
public static String mainCodeOfFilterSymbols(String content, boolean isEnglish) {
    if (content == null) {
        return "";
    }
    StringBuilder kept = new StringBuilder(content.length());
    boolean replacedAny = false;
    for (int i = 0; i < content.length(); i++) {
        char c = content.charAt(i);
        boolean printableAscii = c >= 32 && c <= 126;
        boolean extendedLetter = !isEnglish && c >= 192 && c <= 255;
        if (printableAscii || extendedLetter) {
            kept.append(c);
        } else {
            // Replace per code unit instead of via a regex: a regex built from a lone
            // surrogate never matches inside a valid surrogate pair, which would let
            // supplementary characters (e.g. emoji) slip through.
            kept.append(' ');
            replacedAny = true;
        }
    }
    if (!replacedAny) {
        // Nothing was filtered: hand back the input untouched, spacing included.
        return content;
    }
    return kept.toString().replaceAll("\\s+", " ").trim();
}
/**
 * Renders every UTF-16 code unit of a string as a {@code \}{@code uXXXX} escape with four
 * lowercase hexadecimal digits.
 *
 * @param str the text to convert; {@code null} is treated as the empty string
 * @return the concatenated escapes, or {@code ""} when there is nothing to convert
 */
public static String convertStrToUnicode(String str) {
    if (str == null) {
        return "";
    }
    final String hexDigits = "0123456789abcdef";
    StringBuilder escaped = new StringBuilder();
    for (char unit : str.toCharArray()) {
        escaped.append("\\u");
        // High byte first, then low byte, each as two hex digits.
        escaped.append(hexDigits.charAt((unit >>> 12) & 0xF));
        escaped.append(hexDigits.charAt((unit >>> 8) & 0xF));
        escaped.append(hexDigits.charAt((unit >>> 4) & 0xF));
        escaped.append(hexDigits.charAt(unit & 0xF));
    }
    return escaped.toString();
}
记录一下。。。