例子
正则
re="<(\\s)*script[^>]*>([\\s\\S](?!";
复制代码
代码
/**
 * Utility that reduces an HTML fragment to plain text by deleting script
 * elements, style elements and every remaining tag.
 */
public class FilterHTMLTags {

    /**
     * Matches a complete {@code <script ...>...</script>} element, case-insensitively,
     * tolerating whitespace after {@code <}, around {@code /} and before the closing {@code >}.
     */
    private static final Pattern SCRIPT_ELEMENT = Pattern.compile(
            "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>",
            Pattern.CASE_INSENSITIVE);

    /** Same shape as {@link #SCRIPT_ELEMENT}, for {@code <style ...>...</style>} elements. */
    private static final Pattern STYLE_ELEMENT = Pattern.compile(
            "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>",
            Pattern.CASE_INSENSITIVE);

    /** Matches any single tag: {@code <}, one or more non-{@code >} characters, {@code >}. */
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

    /**
     * Strips HTML markup from a string.
     *
     * <p>Script and style elements are removed together with their content first,
     * so that their bodies do not leak into the text; then every remaining tag is
     * removed and each {@code &nbsp;} entity is replaced by a plain space.
     *
     * @param inputString the string that may contain HTML markup; may be {@code null}
     * @return the text with markup removed, or an empty string when
     *         {@code inputString} is {@code null}
     */
    public static String HtmlText(String inputString) {
        // A null input yields an empty result rather than an exception.
        if (inputString == null) {
            return "";
        }

        String text = SCRIPT_ELEMENT.matcher(inputString).replaceAll("");
        text = STYLE_ELEMENT.matcher(text).replaceAll("");
        text = ANY_TAG.matcher(text).replaceAll("");

        // Turn non-breaking-space entities into ordinary spaces. A literal
        // replace is used because the entity contains no regex syntax.
        return text.replace("&nbsp;", " ");
    }
}
复制代码
过滤 URL 网址、邮箱地址、HTML 标签、JS 代码以及各种转义字符:
// Regular expression used by killTags as its final step: every match is removed
// from the result.
// NOTE(review): as written this is a single space, so all spaces are stripped
// from the output; the literal may have been altered when the snippet was
// extracted from its web page -- confirm the intended value.
public static final String Upset = " ";
/**
 * Matches inline style markup: either {@code <}, an optional {@code span}, whitespace,
 * {@code style}, then lazily up to the next {@code style>}; or an optional {@code span},
 * whitespace, {@code style=}, then lazily up to the next {@code >}.
 */
private static final Pattern STYLE_ATTR_PATTERN = Pattern.compile(
        "<(span)?\\sstyle.*?style>|(span)?\\sstyle=.*?>", Pattern.DOTALL);

/** Matches any single tag: {@code <}, one or more non-{@code >} characters, {@code >}. */
private static final Pattern TAG_PATTERN = Pattern.compile("(<[^>]+>)", Pattern.DOTALL);

/**
 * URL / e-mail / host matcher: optional scheme, optional user-info ending in {@code @},
 * a dotted-quad or host name, optional port and path segments.
 *
 * NOTE(review): the {@code (\\.|.)} groups contain an unescaped {@code .} alternative,
 * which under DOTALL matches ANY character, so this expression matches far more than
 * URLs (e.g. any letter/digit/space run followed by one arbitrary character). The
 * duplicated alternatives {@code ((\\:)|:)} and {@code (//|//)} suggest that escaped or
 * alternative separator characters were lost from the literal -- confirm against the
 * original source before relying on it. The expression is kept unchanged here.
 */
private static final Pattern URL_EMAIL_PATTERN = Pattern.compile(
        "(((http|https|ftp)(\\s)*((\\:)|:))(\\s)*(//|//)(\\s)*)?"
        + "([\\sa-zA-Z0-9(\\.|.)(\\s)*\\-]+((\\:)|(:)[\\sa-zA-Z0-9(\\.|.)&%\\$\\-]+)*@(\\s)*)?"
        + "("
        + "(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])"
        + "(\\.|.)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)"
        + "(\\.|.)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)"
        + "(\\.|.)(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])"
        + "|([\\sa-zA-Z0-9\\-]+(\\.|.)(\\s)*)*[\\sa-zA-Z0-9\\-]+(\\.|.)(\\s)*[\\sa-zA-Z]*"
        + ")"
        + "((\\s)*(\\:)|(:)(\\s)*[0-9]+)?"
        + "(/(\\s)*[^/][\\sa-zA-Z0-9\\.\\,\\?\\'\\\\/\\+&%\\$\\=~_\\-@]*)*",
        Pattern.DOTALL);

/**
 * Matches a JavaScript branch fragment: {@code if (}, {@code else} or {@code elseif (}
 * followed lazily by everything up to the next {@code ;} on the same line
 * (no DOTALL, so {@code .} stops at line terminators).
 */
private static final Pattern JS_BRANCH_PATTERN =
        Pattern.compile("(if[\\s]*\\(|else|elseif[\\s]*\\().*?;");

/** Matches a character entity such as {@code &name;}: {@code &}, non-{@code ;} characters, {@code ;}. */
private static final Pattern ENTITY_PATTERN = Pattern.compile("(&[^;]+;)", Pattern.DOTALL);

/** Matches any Unicode punctuation character plus the four curly quote characters. */
private static final Pattern PUNCTUATION_PATTERN = Pattern.compile("[\\pP‘’“”]");

/** Matches a single whitespace character (this includes CR and LF). */
private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");

/**
 * Filters markup and noise out of a piece of text: inline style markup, HTML tags,
 * URL/e-mail-like runs, JavaScript branch fragments, character entities and
 * punctuation are removed or blanked, in that order.
 *
 * @param news the raw text; may be {@code null}
 * @return the filtered text, or an empty string when {@code news} is {@code null}
 */
public static String killTags(String news) {
    if (news == null) {
        return "";
    }

    // Undo (possibly double) escaping of angle brackets: dropping "amp;" turns
    // "&amp;lt;" into "&lt;", which is then decoded to a real "<" so the tag
    // patterns below can see it.
    String decoded = news.replace("amp;", "")
            .replace("&lt;", "<")
            .replace("&gt;", ">");

    String withoutStyles = STYLE_ATTR_PATTERN.matcher(decoded).replaceAll("");
    // Tags become a space so that words separated only by markup do not fuse.
    String withoutTags = TAG_PATTERN.matcher(withoutStyles).replaceAll(" ");
    String withoutUrls = URL_EMAIL_PATTERN.matcher(withoutTags).replaceAll("");
    String withoutJs = JS_BRANCH_PATTERN.matcher(withoutUrls).replaceAll(" ");
    String withoutEntities = ENTITY_PATTERN.matcher(withoutJs).replaceAll(" ");
    String withoutPunctuation = PUNCTUATION_PATTERN.matcher(withoutEntities).replaceAll(" ");

    // "\s" already covers "\r" and "\n", so one pass normalises all whitespace.
    String normalised = WHITESPACE_PATTERN.matcher(withoutPunctuation).replaceAll(" ");

    // Upset is applied as a regular expression; every match is dropped.
    return normalised.replaceAll(Upset, "");
}
复制代码