java html 过滤,JAVA过滤html标签

过滤URL网址,邮箱地址,html标签,JS代码,各种转义字符:

public static String killTags(String news) {

if(news==null){

return "";

}

String s = news.replaceAll("amp;", "");

if(s==null){

return "";

}

s =s.replaceAll("

if(s==null){

return "";

}

s =s.replaceAll(">", ">");

if(s==null){

return "";

}

/*

* 过滤CSS样式

*/

Pattern pattern = Pattern.compile(

"|",

Pattern.DOTALL);

Matcher matcher = pattern.matcher(s);

String str = matcher.replaceAll("");

/*

* 过滤HTML标签

*/

Pattern pattern2 = Pattern.compile("(]+>)", Pattern.DOTALL);

Matcher matcher2 = pattern2.matcher(str);

String strhttp = matcher2.replaceAll(" ");

/*

* 过滤URL网址

*/

String regEx = "(((http|https|ftp)(\\s)*((\\:)|:))(\\s)*(//|//)(\\s)*)?"

+ "([\\sa-zA-Z0-9(\\.|.)(\\s)*\\-]+((\\:)|(:)[\\sa-zA-Z0-9(\\.|.)&%\\$\\-]+)*@(\\s)*)?"

+ "("

+ "(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])"

+ "(\\.|.)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)"

+ "(\\.|.)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)"

+ "(\\.|.)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])"

+ "|([\\sa-zA-Z0-9\\-]+(\\.|.)(\\s)*)*[\\sa-zA-Z0-9\\-]+(\\.|.)(\\s)*[\\sa-zA-Z]*"

+ ")"

+ "((\\s)*(\\:)|(:)(\\s)*[0-9]+)?"

+ "(/(\\s)*[^/][\\sa-zA-Z0-9\\.\\,\\?\\'\\\\/\\+&%\\$\\=~_\\-@]*)*";

Pattern p1 = Pattern.compile(regEx, Pattern.DOTALL);

String[] subs = strhttp.split(" ");

StringBuffer buf = new StringBuffer();

for(String strElement:subs){

Matcher matchhttp = p1.matcher(strElement);

String temp = matchhttp.replaceAll("");

buf.append(temp);

buf.append(" ");

}

String strnew = buf.toString().replaceAll("(if[\\s]*\\(|else|elseif[\\s]*\\().*?;", " ");

/*

* 过滤标点符号

*/

Pattern patterncomma = Pattern.compile("(&[^;]+;)", Pattern.DOTALL);

Matcher matchercomma = patterncomma.matcher(strnew);

String strout = matchercomma.replaceAll(" ").replaceAll("\\pP", " ");

return strout;

}

JAVA去除新闻来源和记者信息(需要用到ANSJ进行词性判断):

// delete where news come from

/**
 * Removes the news-source prefix and, when one is found, a short reporter credit.
 *
 * First, every run that starts at the beginning of the text or after a space, contains
 * one of the source markers, and ends with one of the dispatch markers is deleted.
 * Then each whitespace-free run containing a reporter marker is segmented with
 * ToAnalysis.parse, and terms whose nature string is "nr" or "nrf" are counted
 * (presumably the person-name tags of the ANSJ tagger; verify against the ANSJ
 * version in use). The first run with at least one such term is replaced by a single
 * space, but only when its length is at most 5 * count + 5 characters.
 *
 * NOTE(review): this assumes ToAnalysis.parse returns a List of Term, as the loop in
 * the published listing implies; the type argument had been lost there, which left a
 * raw List that cannot be iterated as Term.
 *
 * @param content news text; may be null
 * @return the text without source and reporter information, or an empty string when
 *         content is null
 */
public static String killFrom(String content) {
    if (content == null) {
        return "";
    }

    String reg = "(^| )[\u4E00-\u9FA5]*(网|社|报)[\\S]*(报道|电|讯)";
    String answer = content.replaceAll(reg, "");

    reg = "(^| )[\\S]*(通讯员|记者)[\\S]*";
    Matcher matcher = Pattern.compile(reg).matcher(answer);

    int flag = 0;
    int start = 0;
    int end = 0;
    while (matcher.find()) {
        start = matcher.start();
        end = matcher.end();

        // Count the terms tagged "nr" or "nrf" inside the candidate credit.
        List<Term> parse = ToAnalysis.parse(matcher.group());
        for (Term t : parse) {
            String nature = t.getNatrue().natureStr;
            if ("nr".equals(nature) || "nrf".equals(nature)) {
                flag++;
            }
        }
        if (flag > 0) {
            // Only the first candidate with a counted term is considered.
            break;
        }
    }

    // The length bound keeps a long sentence that merely mentions a reporter intact.
    if (flag > 0 && (end - start <= 5 * flag + 5)) {
        answer = answer.substring(0, start) + " " + answer.substring(end);
    }
    return answer;
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值