过滤 URL 网址、邮箱地址、HTML 标签、JS 代码以及各种转义字符：
public static String killTags(String news) {
if(news==null){
return "";
}
String s = news.replaceAll("amp;", "");
if(s==null){
return "";
}
s =s.replaceAll("
if(s==null){
return "";
}
s =s.replaceAll(">", ">");
if(s==null){
return "";
}
/*
* 过滤CSS样式
*/
Pattern pattern = Pattern.compile(
"|",
Pattern.DOTALL);
Matcher matcher = pattern.matcher(s);
String str = matcher.replaceAll("");
/*
* 过滤HTML标签
*/
Pattern pattern2 = Pattern.compile("(]+>)", Pattern.DOTALL);
Matcher matcher2 = pattern2.matcher(str);
String strhttp = matcher2.replaceAll(" ");
/*
* 过滤URL网址
*/
String regEx = "(((http|https|ftp)(\\s)*((\\:)|:))(\\s)*(//|//)(\\s)*)?"
+ "([\\sa-zA-Z0-9(\\.|.)(\\s)*\\-]+((\\:)|(:)[\\sa-zA-Z0-9(\\.|.)&%\\$\\-]+)*@(\\s)*)?"
+ "("
+ "(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])"
+ "(\\.|.)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)"
+ "(\\.|.)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)"
+ "(\\.|.)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])"
+ "|([\\sa-zA-Z0-9\\-]+(\\.|.)(\\s)*)*[\\sa-zA-Z0-9\\-]+(\\.|.)(\\s)*[\\sa-zA-Z]*"
+ ")"
+ "((\\s)*(\\:)|(:)(\\s)*[0-9]+)?"
+ "(/(\\s)*[^/][\\sa-zA-Z0-9\\.\\,\\?\\'\\\\/\\+&%\\$\\=~_\\-@]*)*";
Pattern p1 = Pattern.compile(regEx, Pattern.DOTALL);
String[] subs = strhttp.split(" ");
StringBuffer buf = new StringBuffer();
for(String strElement:subs){
Matcher matchhttp = p1.matcher(strElement);
String temp = matchhttp.replaceAll("");
buf.append(temp);
buf.append(" ");
}
String strnew = buf.toString().replaceAll("(if[\\s]*\\(|else|elseif[\\s]*\\().*?;", " ");
/*
* 过滤标点符号
*/
Pattern patterncomma = Pattern.compile("(&[^;]+;)", Pattern.DOTALL);
Matcher matchercomma = patterncomma.matcher(strnew);
String strout = matchercomma.replaceAll(" ").replaceAll("\\pP", " ");
return strout;
}
Java 去除新闻来源和记者信息（需要用到 ANSJ 进行词性判断）：
/** A news-source credit: an optional run of CJK characters ending in 网/社/报, then text ending in 报道/电/讯. */
private static final Pattern KILL_FROM_SOURCE =
        Pattern.compile("(^| )[\u4E00-\u9FA5]*(网|社|报)[\\S]*(报道|电|讯)");

/** A whitespace-free run containing 通讯员 (correspondent) or 记者 (reporter). */
private static final Pattern KILL_FROM_BYLINE =
        Pattern.compile("(^| )[\\S]*(通讯员|记者)[\\S]*");

/**
 * Removes the news-source credit and the reporter byline from an article.
 *
 * <p>Every source credit is removed unconditionally. Byline candidates are then
 * scanned in order, and each one is segmented with ANSJ ({@code ToAnalysis.parse}).
 * The first candidate containing at least one term tagged {@code nr} or {@code nrf}
 * decides the outcome: it is replaced by a single space if its length is at most
 * {@code 5 * names + 5} characters, otherwise the text is left as it is. Later
 * candidates are not examined. The length limit keeps ordinary sentences that merely
 * mention a reporter from being deleted.
 *
 * <p>NOTE(review): this assumes the ANSJ version in use returns a {@code List<Term>}
 * from {@code ToAnalysis.parse} and that {@code nr}/{@code nrf} are its person-name
 * tags; confirm against the library version on the classpath.
 *
 * @param content article text; may be {@code null}
 * @return the text with source and byline removed, or an empty string for {@code null}
 */
public static String killFrom(String content) {
    if (content == null) {
        return "";
    }

    String answer = KILL_FROM_SOURCE.matcher(content).replaceAll("");

    Matcher byline = KILL_FROM_BYLINE.matcher(answer);
    while (byline.find()) {
        int names = 0;
        // The element type has to be declared; iterating a raw List as Term does not compile.
        List<Term> terms = ToAnalysis.parse(byline.group());
        for (Term term : terms) {
            String nature = term.getNatrue().natureStr;
            if ("nr".equals(nature) || "nrf".equals(nature)) {
                names++;
            }
        }
        if (names == 0) {
            continue;
        }

        // The first candidate with a name decides the result, whether or not it is cut.
        int start = byline.start();
        int end = byline.end();
        if (end - start <= 5 * names + 5) {
            return answer.substring(0, start) + " " + answer.substring(end);
        }
        return answer;
    }
    return answer;
}