项目中的大牛写的html文本过滤函数
/**
* 把html内容转为文本
* @param html 需要处理的html文本
* @param filterTags 需要保留的html标签样式
* @return
*/
public static String trimHtml2Txt(String html, String[] filterTags){
html = html.replaceAll("\\
[\\s\\S]*?(?i)", "");//去掉headhtml = html.replaceAll("\\", "");//去掉注释
html = html.replaceAll("\\", "");
html = html.replaceAll("\\(?i)", "");//去掉样式
html = html.replaceAll("\\(?i)", "");//去掉js
html = html.replaceAll("\\]+>[\\s\\S]*?]+>(?i)", "");//去掉word标签
html = html.replaceAll("\\[\\s\\S]*?(?i)", "");
html = html.replaceAll("\\]*>|
]*>||(?i)", "");html = html.replaceAll("\\\r\n|\n|\r", " ");//去掉换行
html = html.replaceAll("\\
]*>(?i)", "\n");
List tags = new ArrayList();
List s_tags = new ArrayList();
List halfTag = Arrays.asList(new String[]{"img","table","thead","th","tr","td"});//
if(filterTags != null && filterTags.length > 0){
for (String tag : filterTags) {
tags.add(""));//开始标签
if(!"img".equals(tag)) tags.add(""+tag+">");//结束标签
s_tags.add("#REPLACETAG"+tag+(halfTag.contains(tag)?"":"REPLACETAG#"));//尽量替换为复杂一点的标记,以免与显示文本混合,如:文本中包含#td、#table等
if(!"img".equals(tag)) s_tags.add("#REPLACETAG/"+tag+"REPLACETAG#");
}
}
html = ExStringUtils.replaceEach(html, tags.toArray(new String[tags.size()]), s_tags.toArray(new String[s_tags.size()]));
html = html.replaceAll("\\
(?i)", "\n");html = html.replaceAll("\\]+>", "");
html = ExStringUtils.replaceEach(html,s_tags.toArray(new String[s_tags.size()]),tags.toArray(new String[tags.size()]));
html = html.replaceAll("\\ ", " ");
return html.trim();
}