import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Static helpers for regex-based text clean-up: stripping HTML tags, extracting or
 * testing character classes (CJK ideographs, ASCII letters, digits) and wrapping
 * regex matches with literal text.
 *
 * <p>All methods are stateless. Unless stated otherwise, passing {@code null} for a
 * text argument results in a {@link NullPointerException}.
 */
public class RegExpUtil {

    /** One CJK unified ideograph in the range U+4E00..U+9FA5. */
    private static final String CHINESE_REGEX = "([\u4E00-\u9FA5])";
    /** One ASCII letter. */
    private static final String ENGLISH_REGEX = "([a-zA-Z])";
    /** One ASCII digit. */
    private static final String DIGIT_REGEX = "([0-9])";

    private static final Pattern CHINESE = Pattern.compile(CHINESE_REGEX, Pattern.CASE_INSENSITIVE);
    private static final Pattern ENGLISH = Pattern.compile("[a-zA-Z]", Pattern.CASE_INSENSITIVE);
    /** ASCII letters, whitespace, hyphen and ampersand. */
    private static final Pattern ENGLISH_PDF =
            Pattern.compile("[a-zA-Z\\s\\-&]", Pattern.CASE_INSENSITIVE);
    private static final Pattern DIGIT = Pattern.compile("[0-9]", Pattern.CASE_INSENSITIVE);

    /** An HTML comment (which may contain '>') or any non-empty {@code <...>} tag. */
    private static final Pattern ANY_TAG =
            Pattern.compile("<!--[\\s\\S]*?-->|<[^>]+>", Pattern.CASE_INSENSITIVE);
    /** An HTML comment, or any other {@code <!...>} declaration such as a doctype. */
    private static final Pattern ANNOTATION =
            Pattern.compile("<!--[\\s\\S]*?-->|<![^>]*>", Pattern.CASE_INSENSITIVE);

    private static final Pattern PUNCT_SEP_SYMBOL = Pattern.compile("[\\pP\\pZ\\pS]");
    private static final Pattern PUNCT_SYMBOL_NUMBER_SEP = Pattern.compile("[\\pP\\pS\\pN\\pZ]");
    private static final Pattern LETTER = Pattern.compile("\\pL", Pattern.CASE_INSENSITIVE);

    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");
    private static final Pattern LINE_BREAKS = Pattern.compile("[\\r\\n]+");
    private static final Pattern GAP_BETWEEN_TAGS = Pattern.compile(">\\s*<");
    /** ASCII space plus the code units 128..160 (U+0080..U+00A0). */
    private static final Pattern SPACE_LIKE = Pattern.compile("[ \\u0080-\\u00A0]");

    /**
     * Removes occurrences of one HTML tag from a string.
     *
     * <p>The tag name is matched case-insensitively and must be followed by whitespace,
     * {@code /} or {@code >}, so that {@code "p"} does not also match {@code <pre>} or
     * {@code <param>}.
     *
     * @param s       HTML text
     * @param tag     tag name such as {@code a}, {@code p}, {@code img}, {@code div}; it is
     *                inserted into the regex unquoted, so a regex fragment is accepted
     * @param contain {@code true} to remove the whole element (opening tag, content and
     *                closing tag); {@code false} to remove only the opening and closing
     *                tags and keep the content. With {@code true}, an element without a
     *                closing tag is not matched.
     * @return the text with the matching parts removed
     */
    public static String filterTag(String s, String tag, boolean contain) {
        String regexp = contain ? elementRegex(tag) : tagOnlyRegex(tag);
        Pattern pattern = Pattern.compile(regexp, Pattern.CASE_INSENSITIVE);
        return pattern.matcher(s).replaceAll("");
    }

    /**
     * Applies {@link #filterTag(String, String, boolean)} once per tag, in array order.
     *
     * @param s       HTML text
     * @param tags    tag names (case-insensitive)
     * @param contain whether the content enclosed by each tag is removed as well
     * @return the text after all tags have been filtered
     */
    public static String filterTags(String s, String[] tags, boolean contain) {
        String result = s;
        for (String tag : tags) {
            result = filterTag(result, tag, contain);
        }
        return result;
    }

    /**
     * Removes HTML comments and every {@code <...>} tag, keeping the text between tags.
     *
     * <p>An empty {@code <>} pair is left untouched.
     *
     * @param html HTML text
     * @return the text without comments and tags
     */
    public static String filterHtmlTag(String html) {
        return ANY_TAG.matcher(html).replaceAll("");
    }

    /**
     * Reduces an HTML document to its bare text: first drops {@code head},
     * {@code script} and {@code style} elements including their content, then strips all
     * remaining tags, and finally removes every whitespace character.
     *
     * @param text HTML text
     * @return the remaining text with no whitespace at all
     */
    public static String filterNoise(String text) {
        String[] tags = {"head", "script", "style"};
        String result = filterTags(text, tags, true);
        result = filterHtmlTag(result);
        return WHITESPACE_RUN.matcher(result).replaceAll("");
    }

    /**
     * Removes a fixed set of HTML tags: the tags {@code a}, {@code img}, {@code p},
     * {@code div} and {@code center} are stripped but their content is kept, while
     * {@code object}, {@code param}, {@code script} and {@code iframe} elements are removed
     * together with their content. Afterwards all CR/LF characters are removed and
     * whitespace between adjacent tags is collapsed.
     *
     * @param text HTML text
     * @return the cleaned text
     */
    public static String removeHTML(String text) {
        String[] unwrapTags = {"a", "img", "p", "div", "center"};
        String result = filterTags(text, unwrapTags, false);
        String[] dropTags = {"object", "param", "script", "iframe"};
        result = filterTags(result, dropTags, true);
        result = LINE_BREAKS.matcher(result).replaceAll("");
        return GAP_BETWEEN_TAGS.matcher(result).replaceAll("><");
    }

    /**
     * Tells whether a text consists solely of matches of {@code regex}.
     *
     * @param text  text to test; may be {@code null}
     * @param regex regular expression whose matches are removed from the text
     * @return {@code true} if the text is neither {@code null} nor blank and nothing is
     *         left after removing all matches; {@code false} otherwise
     */
    public static boolean match(String text, String regex) {
        if (text == null || text.trim().length() == 0) {
            return false;
        }
        return text.replaceAll(regex, "").length() == 0;
    }

    /**
     * @param text text to test; may be {@code null}
     * @return {@code true} if the text is non-blank and every character lies in
     *         U+4E00..U+9FA5
     */
    public static boolean isChinese(String text) {
        return match(text, CHINESE_REGEX);
    }

    /**
     * @param text text to test; may be {@code null}
     * @return {@code true} if the text is non-blank and every character is an ASCII letter
     */
    public static boolean isEnglish(String text) {
        return match(text, ENGLISH_REGEX);
    }

    /**
     * @param text text to test; may be {@code null}
     * @return {@code true} if the text is non-blank and every character is an ASCII digit
     */
    public static boolean isDigit(String text) {
        return match(text, DIGIT_REGEX);
    }

    /**
     * @param s input text
     * @return all characters of {@code s} in U+4E00..U+9FA5, concatenated in order
     */
    public static String getChinese(String s) {
        return collectMatches(CHINESE, s);
    }

    /**
     * @param s input text
     * @return all ASCII letters of {@code s}, concatenated in order
     */
    public static String getEnglish(String s) {
        return collectMatches(ENGLISH, s);
    }

    /**
     * @param s input text
     * @return all ASCII letters, whitespace characters, hyphens and ampersands of
     *         {@code s}, concatenated in order
     */
    public static String getEnglishPDF(String s) {
        return collectMatches(ENGLISH_PDF, s);
    }

    /**
     * @param s input text
     * @return all ASCII digits of {@code s}, concatenated in order
     */
    public static String getDigit(String s) {
        return collectMatches(DIGIT, s);
    }

    /**
     * Removes every punctuation, separator and symbol character (Unicode categories P, Z
     * and S), which leaves letters, digits and ideographs.
     *
     * @param text input text
     * @return the text without punctuation, separators and symbols
     */
    public static String filterNotAlphaDigitChinese(String text) {
        return PUNCT_SEP_SYMBOL.matcher(text).replaceAll("");
    }

    /**
     * Strips the leading run of characters that each match {@code regex}.
     *
     * @param text  input text; may be {@code null}
     * @param regex regular expression tested against each single leading character
     * @return {@code null} if the text is {@code null} or blank; otherwise the text from
     *         the first non-matching character on, which is the empty string when every
     *         character matches
     */
    public static String filterPrefix(String text, String regex) {
        if (text == null || text.trim().length() == 0) {
            return null;
        }
        // Compile once instead of calling String.matches for each character.
        Pattern pattern = Pattern.compile(regex);
        // Defaults to the full length so that an all-matching text yields "".
        int firstKept = text.length();
        for (int i = 0; i < text.length(); i++) {
            if (!pattern.matcher(String.valueOf(text.charAt(i))).matches()) {
                firstKept = i;
                break;
            }
        }
        return text.substring(firstKept);
    }

    /**
     * Wraps every case-insensitive match of {@code regex} with a prefix and a suffix.
     *
     * <p>Prefix, suffix and the matched text are inserted literally; {@code $} and
     * {@code \} have no special meaning in them.
     *
     * @param text   input text
     * @param regex  regular expression locating the spans to wrap
     * @param prefix text inserted before each match
     * @param suffix text inserted after each match
     * @return the text with every match surrounded by prefix and suffix
     */
    public static String add(String text, String regex, String prefix, String suffix) {
        Matcher matcher = Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(text);
        StringBuilder out = new StringBuilder();
        int copiedUpTo = 0;
        while (matcher.find()) {
            // Assemble by hand rather than via appendReplacement, which would treat
            // '$' and '\' in the inserted text as group references.
            out.append(text, copiedUpTo, matcher.start())
               .append(prefix)
               .append(matcher.group())
               .append(suffix);
            copiedUpTo = matcher.end();
        }
        out.append(text, copiedUpTo, text.length());
        return out.toString();
    }

    /**
     * Tells whether a text is made up only of numbers and symbols.
     *
     * @param text input text
     * @return {@code true} if nothing remains after removing all punctuation, symbol,
     *         number and separator characters (Unicode categories P, S, N, Z); this is
     *         also the case for the empty string
     */
    public static boolean isDigitSymbol(String text) {
        return PUNCT_SYMBOL_NUMBER_SEP.matcher(text).replaceAll("").length() == 0;
    }

    /**
     * Tells whether a text contains at least one letter. Any character of Unicode
     * category L counts, which includes Chinese ideographs and English letters.
     *
     * @param text input text
     * @return {@code true} if at least one letter is present
     */
    public static boolean isChineseEnglish(String text) {
        return LETTER.matcher(text).find();
    }

    /**
     * Collects every complete element of the given tag, from its opening tag through the
     * nearest following closing tag (lazy match, case-insensitive).
     *
     * @param s   HTML text
     * @param tag tag name; inserted into the regex unquoted
     * @return the matched element strings in document order; empty if there are none
     */
    public static List<String> getTagHtml(String s, String tag) {
        Pattern pattern = Pattern.compile(elementRegex(tag), Pattern.CASE_INSENSITIVE);
        Matcher matcher = pattern.matcher(s);
        List<String> result = new ArrayList<String>();
        while (matcher.find()) {
            result.add(matcher.group());
        }
        return result;
    }

    /**
     * Removes ASCII spaces (U+0020) and all characters in U+0080..U+00A0, a range that
     * ends with the no-break space. Tabs, line breaks and other whitespace are kept.
     *
     * @param text input text
     * @return the text without those characters
     */
    public static String filterSpace(String text) {
        return SPACE_LIKE.matcher(text).replaceAll("");
    }

    /**
     * Removes markup that starts with {@code <!}: HTML comments (even when they contain
     * {@code >}) and declarations such as a doctype.
     *
     * @param s HTML text
     * @return the text without comments and declarations
     */
    public static String filterAnnotation(String s) {
        return ANNOTATION.matcher(s).replaceAll("");
    }

    public static void main(String[] args) {
    }

    /**
     * Builds the regex for a whole element: opening tag, lazily matched content and the
     * closing tag. The lookahead after the name stops a short name from matching a
     * longer tag that merely starts with it.
     */
    private static String elementRegex(String tag) {
        String name = "(?:" + tag + ")";
        return "<\\s*" + name + "(?=[\\s/>])[^>]*>[\\s\\S]*?<\\s*/\\s*" + name + "\\s*>";
    }

    /** Builds the regex for either the opening or the closing tag, without content. */
    private static String tagOnlyRegex(String tag) {
        String name = "(?:" + tag + ")";
        return "<\\s*" + name + "(?=[\\s/>])[^>]*>|<\\s*/\\s*" + name + "\\s*>";
    }

    /** Concatenates all matches of {@code pattern} found in {@code s}, in order. */
    private static String collectMatches(Pattern pattern, String s) {
        Matcher matcher = pattern.matcher(s);
        StringBuilder found = new StringBuilder();
        while (matcher.find()) {
            found.append(matcher.group());
        }
        return found.toString();
    }
}