转自:https://blog.csdn.net/chenyidong521/article/details/73742301
package cn.pconline.best.util;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class HtmlUtil {
private static final String regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>"; /* 定义script的正则表达式 */
private static final String regEx_style = "<style[^>]*?>[\\s\\S]*?<\\/style>"; /* 定义style的正则表达式 */
// private static final String regEx_html = "<[^>]+>"; /* 定义HTML标签的正则表达式<[^>]*> */
private static final String regEx_html = "(?!<(img).*?>)<.*?>"; /* 定义HTML标签的正则表达式<[^>]*> 保留img标签 */
private static final String regEx_space = "<a>\\s*|\t|\r|\n</a>"; /* 定义空格回车换行符 */
private static final String regEx_span = "/<span[^>]*>([^<]*)<\\/span>/ig"; /* span*/
private static final String regEx_strong = "<strong.*?>(((?!<\\/strong>).)*)<\\/strong>"; /* strong*/
public static String deleteHTMLTag( String htmlStr )
{
/* 去掉script标签 */
Pattern p_script = Pattern.compile( regEx_script,
Pattern.CASE_INSENSITIVE );
Matcher m_script = p_script.matcher( htmlStr );
htmlStr = m_script.replaceAll( "" ); /* 过滤script标签 */
/* 去掉style标签 */
Pattern p_style = Pattern
.compile( regEx_style, Pattern.CASE_INSENSITIVE );
Matcher m_style = p_style.matcher( htmlStr );
htmlStr = m_style.replaceAll( "" ); /* 过滤style标签 */
/* 去掉html标签 */
Pattern p_html = Pattern.compile( regEx_html, Pattern.CASE_INSENSITIVE );
Matcher m_html = p_html.matcher( htmlStr );
htmlStr = m_html.replaceAll( "" ); /* 过滤html标签 */
/* 去掉span标签 */
Pattern span_html = Pattern.compile( regEx_span, Pattern.CASE_INSENSITIVE );
Matcher span = span_html.matcher( htmlStr );
htmlStr = span.replaceAll( "" ); /* 过滤html标签 */
/* 去掉regEx_strong标签 */
Pattern strong_html = Pattern.compile( regEx_strong, Pattern.CASE_INSENSITIVE );
Matcher strong = strong_html.matcher( htmlStr );
htmlStr = strong.replaceAll( "" ); /* 过滤html标签 */
/* 去掉空格 */
Pattern p_space = Pattern
.compile( regEx_space, Pattern.CASE_INSENSITIVE );
Matcher m_space = p_space.matcher( htmlStr );
htmlStr = m_space.replaceAll( "" ); /* 过滤空格回车标签 */
/* 去掉<p>标签<br></br>标签和<>之间内容 */
htmlStr.replaceAll( "<p .*?>", "\r\n" );
htmlStr.replaceAll( "<br\\s*/?>", "\r\n" );
htmlStr.replaceAll( "\\<.*?>", "" );
return(htmlStr.trim() ); /* 返回文本字符串 */
}
}