在开发中,有时需要在一大段字符串中替换或去除某些特定的字符串。以下例子是过滤 HTML 页面字符串的实例,用来说明正则表达式在其中的作用:
package com.project.admin.common.util;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
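/**
 * HTML string utilities: strips tags, removes inline style/size attributes,
 * and swaps Youku embed tags for a JavaScript player snippet.
 * @author xwp
 */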
public class HtmlUtil {
// Youku video handling: the existing <embed> reference is located and replaced with a new player snippet.
// Literal URL prefix that precedes the video id in a Youku player link.
private static final String pc_embed_first="http://player.youku.com/player.php/sid/";
// Literal text that follows the video id in a Youku player link.
private static final String pc_embed_end="==/isShowRelatedVideo";
// Regex for an <embed ...> tag; it only matches tags that end with " />" (a space before the slash).
private static final String embed_tag="<embed[^>]*?[\\s\\S]*? \\/>";
// Replacement player markup; the ###videoAddress### placeholder is substituted with the video id.
private static final String smallVideo="<div id='youkuplayer' style='width:6.4rem;height:3.2rem'></div><script type='text/javascript' src='http://player.youku.com/jsapi'> </script><script type='text/javascript'>player = new YKU.Player('youkuplayer',{styleid: '0',client_id: 'ec5fe2a0dce21ad3',vid: '###videoAddress###',newPlayer: true}); </script>";
private static final String regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>"; // Regex for a <script>...</script> block, including its content
private static final String regEx_style = "<style[^>]*?>[\\s\\S]*?<\\/style>"; // Regex for a <style>...</style> block, including its content
private static final String regEx_html = "<[^>]+>"; // Regex for any single HTML tag
private static final String regEx_space = "\\s*|\t|\r|\n";// Regex for whitespace (spaces, tabs, carriage returns, newlines)
// Patterns used to strip inline presentation attributes.
private static final String remove_style ="style=\"[^>]*?;\""; // Regex for a double-quoted style attribute; as written, the value must end with ';' to match
private static final String remove_width ="width=\"[^>]*?\""; // Regex for a double-quoted width attribute
private static final String remove_height ="height=\"[^>]*?\""; // Regex for a double-quoted height attribute
// Compiled once and shared: Pattern instances are immutable and safe for concurrent use,
// so there is no need to recompile them on every call.
private static final Pattern SCRIPT_BLOCK_PATTERN = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
private static final Pattern STYLE_BLOCK_PATTERN = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
private static final Pattern HTML_TAG_PATTERN = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
private static final Pattern WHITESPACE_PATTERN = Pattern.compile(regEx_space, Pattern.CASE_INSENSITIVE);

/**
 * Strips HTML markup from a string and returns the remaining text.
 *
 * <p>Processing order: {@code <script>} blocks (with their content), {@code <style>} blocks
 * (with their content), every remaining tag, and finally all whitespace characters. Because
 * whitespace is removed entirely, words that were separated by spaces end up joined.
 *
 * @param htmlStr the HTML text to clean; may be {@code null}
 * @return the text with tags and whitespace removed; an empty string when {@code htmlStr}
 *         is {@code null} or empty
 */
public static String delHTMLTag(String htmlStr) {
    if (htmlStr == null || htmlStr.isEmpty()) {
        return "";
    }
    // Script and style blocks must go first so their bodies are not left behind as text.
    String text = SCRIPT_BLOCK_PATTERN.matcher(htmlStr).replaceAll("");
    text = STYLE_BLOCK_PATTERN.matcher(text).replaceAll("");
    text = HTML_TAG_PATTERN.matcher(text).replaceAll("");
    text = WHITESPACE_PATTERN.matcher(text).replaceAll("");
    return text.trim();
}
/**
 * Returns the plain text of an HTML fragment.
 *
 * <p>Delegates to {@link #delHTMLTag(String)} and then removes what tag stripping leaves
 * behind: the {@code &nbsp;} entity, the raw non-breaking space character (U+00A0), and any
 * plain space. These are literal replacements, so no regex is involved.
 *
 * @param htmlStr the HTML text to convert
 * @return the text content without tags, whitespace, or non-breaking spaces
 */
public static String getTextFromHtml(String htmlStr){
    String text = delHTMLTag(htmlStr);
    // Neither the entity nor the U+00A0 character is matched by the regex \s class,
    // so both survive delHTMLTag and have to be removed explicitly.
    return text.replace("&nbsp;", "")
            .replace("\u00A0", "")
            .replace(" ", "");
}
// Compiled once and shared instead of being rebuilt on every call.
private static final Pattern INLINE_STYLE_ATTR_PATTERN = Pattern.compile(remove_style, Pattern.CASE_INSENSITIVE);
private static final Pattern WIDTH_ATTR_PATTERN = Pattern.compile(remove_width, Pattern.CASE_INSENSITIVE);
private static final Pattern HEIGHT_ATTR_PATTERN = Pattern.compile(remove_height, Pattern.CASE_INSENSITIVE);

/**
 * Removes inline presentation attributes from HTML while keeping the tags themselves.
 *
 * <p>Three attribute forms are deleted, in this order: {@code style="..."},
 * {@code width="..."} and {@code height="..."}. Matching is case-insensitive.
 *
 * @param htmlStr the HTML text to process; may be {@code null}
 * @return the HTML without those attributes; {@code htmlStr} itself when it is
 *         {@code null} or empty
 */
public static String removeStyleHtml(String htmlStr){
    if (htmlStr == null || htmlStr.isEmpty()) {
        return htmlStr;
    }
    String cleaned = INLINE_STYLE_ATTR_PATTERN.matcher(htmlStr).replaceAll("");
    cleaned = WIDTH_ATTR_PATTERN.matcher(cleaned).replaceAll("");
    cleaned = HEIGHT_ATTR_PATTERN.matcher(cleaned).replaceAll("");
    return cleaned;
}
// The prefix and suffix are literal text, so they are quoted: unquoted, every '.' in the
// player URL would match any character. Compiled once and reused.
private static final Pattern VIDEO_ID_PATTERN =
        Pattern.compile(Pattern.quote(pc_embed_first) + "(.*?)" + Pattern.quote(pc_embed_end));

/**
 * Extracts the video ids from every Youku player link found in the given HTML.
 *
 * <p>A video id is the text between the player URL prefix and the suffix that follows it.
 *
 * @param htmlStr the HTML text to scan; may be {@code null}
 * @return the ids in order of appearance; an empty, modifiable list when nothing matches
 *         or {@code htmlStr} is {@code null}
 */
public static List<String> getVideoId(String htmlStr){
    List<String> results = new ArrayList<String>();
    if (htmlStr == null) {
        return results;
    }
    Matcher matcher = VIDEO_ID_PATTERN.matcher(htmlStr);
    // find() alone is the loop condition; hitEnd() describes the previous match attempt and
    // is not an end-of-iteration signal.
    while (matcher.find()) {
        results.add(matcher.group(1));
    }
    return results;
}
// Compiled once and reused instead of being rebuilt on every call.
private static final Pattern EMBED_TAG_PATTERN = Pattern.compile(embed_tag);

/**
 * Collects every {@code <embed ... />} tag found in the given HTML.
 *
 * <p>Matching is case-sensitive and only finds tags that are closed with {@code " />"}.
 *
 * @param htmlStr the HTML text to scan; may be {@code null}
 * @return the full text of each matched tag in order of appearance; an empty, modifiable
 *         list when nothing matches or {@code htmlStr} is {@code null}
 */
public static List<String> getEmbedTag(String htmlStr){
    List<String> results = new ArrayList<String>();
    if (htmlStr == null) {
        return results;
    }
    Matcher matcher = EMBED_TAG_PATTERN.matcher(htmlStr);
    // find() alone is the loop condition; hitEnd() describes the previous match attempt and
    // is not an end-of-iteration signal.
    while (matcher.find()) {
        results.add(matcher.group(0));
    }
    return results;
}
/**
 * Rewrites article HTML that contains Youku videos by replacing each {@code <embed>} tag
 * with the JavaScript player snippet (the original comment says this is for the micro site).
 *
 * <p>Video ids and embed tags are paired by position. The HTML is returned unchanged when
 * it holds no Youku player link, or when the number of ids differs from the number of
 * embed tags, because the pairing would then be unreliable.
 *
 * @param htmlStr the article HTML; may be {@code null}
 * @return the HTML with embed tags replaced, or {@code htmlStr} itself when no replacement
 *         applies
 */
public static String doSmallVideo(String htmlStr){
    if (htmlStr == null || !htmlStr.contains(pc_embed_first)) {
        return htmlStr;
    }
    List<String> videoIds = getVideoId(htmlStr);
    List<String> embedTags = getEmbedTag(htmlStr);
    if (embedTags.isEmpty() || videoIds.size() != embedTags.size()) {
        return htmlStr;
    }
    String result = htmlStr;
    for (int i = 0; i < embedTags.size(); i++) {
        String player = smallVideo.replace("###videoAddress###", videoIds.get(i));
        // Literal replacement: the matched tag is arbitrary markup, so feeding it to
        // replaceAll() as a regex would misread '.', '?', '(' etc. and could miss or throw.
        result = result.replace(embedTags.get(i), player);
    }
    // NOTE(review): every inserted snippet uses the same element id 'youkuplayer'; with more
    // than one video on a page the ids collide. Confirm whether that case needs to be supported.
    return result;
}