public class HtmlCleaner {
// Pattern/Matcher slots, one pair each for script, style, generic-tag and
// anchor filtering.
// NOTE(review): these are shared mutable statics, and a Matcher is not safe for
// concurrent use; prefer method-local matchers wherever they are used.
private static Pattern p_script;
private static java.util.regex.Matcher m_script;
private static Pattern p_style;
private static java.util.regex.Matcher m_style;
private static Pattern p_html;
private static java.util.regex.Matcher m_html;
private static Pattern p_a;
private static java.util.regex.Matcher m_a;
// Regex source for a whole <script> element: the opening tag (whitespace is
// tolerated after '<'), the shortest possible body, and the closing tag
// (whitespace is tolerated around '/' and before '>').
// A stricter alternative would be: <script[^>]*?>[\\s\\S]*?<\\/script>
private static String regEx_script = "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>";
// Regex source for a whole <style> element, built the same way as the script regex.
// A stricter alternative would be: <style[^>]*?>[\\s\\S]*?<\\/style>
private static String regEx_style = "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>";
// Regex source for an opening <a ...> tag. It requires at least one whitespace
// character after the 'a', so a bare <a> and the closing </a> do not match.
private static String regEx_a = "<\\s*a\\s+([^>]*)\\s*>";
// Regex source for any tag: '<', one or more characters other than '>', then '>'.
private static String regEx_html = "<[^>]+>";
/**
 * Opening-anchor-tag pattern, compiled once from {@code regEx_a}.
 * A compiled {@link Pattern} is immutable, so it can be shared by all callers;
 * it must be declared after {@code regEx_a} so that the source string is
 * already initialized when this field is.
 */
private static final Pattern A_TAG_PATTERN = Pattern.compile(regEx_a, Pattern.CASE_INSENSITIVE);

/**
 * Removes opening anchor tags ({@code <a ...>}) from a string and leaves
 * everything else in place.
 *
 * <p>Only tags matched by {@code regEx_a} are removed, case-insensitively:
 * an opening {@code a} tag with at least one whitespace character after the
 * tag name. Closing {@code </a>} tags, a bare {@code <a>}, the link text and
 * all other markup are returned unchanged.
 *
 * <p>A fresh matcher is created on every call, so no state is shared between
 * concurrent callers.
 *
 * @param inputString
 *            the string to clean; may be {@code null}
 * @return the input with opening anchor tags removed, or the empty string
 *         when {@code inputString} is {@code null}
 */
public static String filterHtmlToA(String inputString) {
    // Explicit null guard: yields the same "" result that the previous
    // catch-all exception handler produced for null input.
    if (inputString == null) {
        return "";
    }
    return A_TAG_PATTERN.matcher(inputString).replaceAll("");
}
/**
 * Patterns compiled once from the regex source strings declared earlier in
 * the class. A compiled {@link Pattern} is immutable, so these can be shared
 * by all callers; they must be declared after the source strings so that
 * those are already initialized.
 */
private static final Pattern SCRIPT_PATTERN = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
private static final Pattern STYLE_PATTERN = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
private static final Pattern HTML_TAG_PATTERN = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);

/**
 * Strips HTML markup from a string and returns the remaining text.
 *
 * <p>Filtering runs in three passes, all case-insensitive:
 * <ol>
 * <li>whole {@code <script>} elements, including their content;</li>
 * <li>whole {@code <style>} elements, including their content;</li>
 * <li>every remaining tag of the form {@code <...>}.</li>
 * </ol>
 * Script and style elements are removed first because the generic tag pass
 * would only drop their tags and leave the script or stylesheet body behind
 * as text. Character entities such as {@code &amp;} are not decoded.
 *
 * <p>A fresh matcher is created for every pass, so no state is shared
 * between concurrent callers.
 *
 * @param inputString
 *            the string containing HTML markup; may be {@code null}
 * @return the input with markup removed, or the empty string when
 *         {@code inputString} is {@code null}
 */
public static String filterHtmlTag(String inputString) {
    // Explicit null guard: yields the same "" result that the previous
    // catch-all exception handler produced for null input.
    if (inputString == null) {
        return "";
    }
    String withoutScripts = SCRIPT_PATTERN.matcher(inputString).replaceAll("");
    String withoutStyles = STYLE_PATTERN.matcher(withoutScripts).replaceAll("");
    return HTML_TAG_PATTERN.matcher(withoutStyles).replaceAll("");
}
文本标签过滤
最新推荐文章于 2023-02-01 09:34:10 发布