使用正则表达式处理html标签方案分享

首先,阅读本文之前,建议大家先读一下这篇文章(点击打开链接),它是关于 Java 正则工具类 Matcher 的一些探讨和建议。

下面是其中一个关于替换匹配内容的范例:

appendReplacement() + appendTail()组合


import java.util.regex.Pattern;
import java.util.regex.Matcher;

/**
 * Minimal demonstration of the {@code appendReplacement()} +
 * {@code appendTail()} idiom of {@link Matcher}.
 */
public class MatcherReplaceExample {

    /**
     * Replaces every match of the sample pattern with {@code "Joe Blocks "},
     * printing the partially built result after each replacement and the
     * complete result at the end.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        final String input = "John writes about this, and John Doe writes about that,"
                + " and John Wayne writes about everything.";
        final String regex = "((John) (.+?)) ";

        final Matcher m = Pattern.compile(regex).matcher(input);
        final StringBuffer out = new StringBuffer();

        // Each iteration copies the text since the previous match, then the replacement.
        for (boolean found = m.find(); found; found = m.find()) {
            m.appendReplacement(out, "Joe Blocks ");
            System.out.println(out);
        }

        // Copy whatever follows the last match.
        m.appendTail(out);
        System.out.println(out);
    }
}



在这里和大家分享本人写的一个工具类:它使用正则表达式处理一些常用的 HTML 标签。要处理的标签其实可以自行定制,只需找出与其匹配的正则表达式,然后按照上面推荐的处理方案去处理即可。

package com.jieve.util;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based helpers for rewriting or stripping a handful of HTML tags.
 *
 * <p>All replacement text is inserted literally (it is passed through
 * {@link Matcher#quoteReplacement(String)}), so {@code $} and {@code \} in the
 * processed content, prefix or suffix never act as group references.
 *
 * <p>The static helpers return {@code null} or the empty string unchanged.
 *
 * @author YYYong
 * @date 2016-05-18 09:26:28
 */
public class HtmlRegexpUtil
{

	private final static String regxpForHtml = "<([^>]*)>"; // any tag that starts with < and ends with >

	private final static String rexImgTag = "<(img.*?)(class.*?)(src.*?)>";   // <img> tag with class followed by src

	private final static String regExpTag = "<(img.*?)(src.*?)(alt.*?)(title.*?)(height.*?)>"; // emoticon <img> tag

	private final static String regATag = "(<(a.*?href=.*?)>(.*?)</a>)"; // <a href=...>...</a>

	/** Matches {@code <span ... style...>...</span>}; the inner text is group 3. */
	public final static String regSpanTag = "<(span.*?)(style.*?)>(.*?)</span>";

	/**
	 * Matches {@code <strong...>...</strong>}; the inner text is group 2.
	 * (Previously this was an accidental copy of the {@code <a>} pattern.)
	 */
	public final static String regStrongTag = "<(strong.*?)>(.*?)</strong>";

	private final static String spanTagPre = "<span style=\"text-decoration:underline;\">"; // opening underline span

	private final static String spanTagSuf = "</span>"; // closing span

	private final static String strongTagPre = "<strong>"; // opening strong

	private final static String strongTagSuf = "</strong>"; // closing strong

	// Patterns are immutable and thread-safe, so compile the fixed ones once.
	private final static Pattern HTML_TAG_PATTERN = Pattern.compile(regxpForHtml);

	private final static Pattern IMG_TAG_PATTERN = Pattern.compile(rexImgTag);

	private final static Pattern EXP_TAG_PATTERN = Pattern.compile(regExpTag);

	private final static Pattern A_TAG_PATTERN = Pattern.compile(regATag);

	/**
	 * Replaces each {@code <img ... class... src...>} tag with
	 * {@code prefix + value + suffix}.
	 *
	 * <p>{@code value} is the captured {@code src...} text without its first six
	 * characters and without its last character. For {@code src="//host/a.jpg"}
	 * that yields {@code /host/a.jpg}: the offset of six also discards the first
	 * character of the attribute value, which is kept as-is to preserve the
	 * existing output. A tag whose captured {@code src} text is too short for
	 * that slicing is left untouched.
	 *
	 * @param str    text to process; {@code null} or empty is returned unchanged
	 * @param prefix literal text placed before the extracted value
	 * @param suffix literal text placed after the extracted value
	 * @return the text with matching tags replaced
	 */
	public static String handleImgTag(String str, String prefix, String suffix)
	{
		if (str == null || str.isEmpty())
		{
			return str;
		}
		Matcher matcher = IMG_TAG_PATTERN.matcher(str);
		StringBuffer sb = new StringBuffer();
		while (matcher.find())
		{
			String src = matcher.group(3);
			String replacement;
			if (src.length() > 6)
			{
				replacement = prefix + src.substring(6, src.length() - 1) + suffix;
			}
			else
			{
				// Too short to slice: keep the tag as it was.
				replacement = matcher.group();
			}
			matcher.appendReplacement(sb, Matcher.quoteReplacement(replacement));
		}
		matcher.appendTail(sb);
		return sb.toString();
	}

	/**
	 * Replaces each emoticon tag
	 * ({@code <img ... src... alt... title... height...>}) with
	 * {@code prefix + title + suffix}.
	 *
	 * <p>{@code title} is the captured {@code title...} text without its first
	 * seven characters ({@code title="}) and without its last two characters
	 * (the closing quote and the separator before {@code height}). A tag whose
	 * captured title text is too short for that slicing is left untouched.
	 *
	 * @param str    text to process; {@code null} or empty is returned unchanged
	 * @param prefix literal text placed before the extracted title
	 * @param suffix literal text placed after the extracted title
	 * @return the text with matching tags replaced
	 */
	public static String handleExpTag(String str, String prefix, String suffix)
	{
		if (str == null || str.isEmpty())
		{
			return str;
		}
		Matcher matcher = EXP_TAG_PATTERN.matcher(str);
		StringBuffer sb = new StringBuffer();
		while (matcher.find())
		{
			String title = matcher.group(4);
			String replacement;
			if (title.length() >= 9)
			{
				replacement = prefix + title.substring(7, title.length() - 2) + suffix;
			}
			else
			{
				// Too short to slice: keep the tag as it was.
				replacement = matcher.group();
			}
			matcher.appendReplacement(sb, Matcher.quoteReplacement(replacement));
		}
		matcher.appendTail(sb);
		return sb.toString();
	}

	/**
	 * Replaces each {@code <a ... href=...>text</a>} element with
	 * {@code prefix + text + suffix}.
	 *
	 * <p>The link text is inserted literally, so text such as
	 * {@code $NAME(CODE)$} needs no special handling.
	 *
	 * @param str    text to process; {@code null} or empty is returned unchanged
	 * @param prefix literal text placed before the link text
	 * @param suffix literal text placed after the link text
	 * @return the text with matching links replaced
	 */
	public static String handleATag(String str, String prefix, String suffix)
	{
		if (str == null || str.isEmpty())
		{
			return str;
		}
		Matcher matcher = A_TAG_PATTERN.matcher(str);
		StringBuffer sb = new StringBuffer();
		while (matcher.find())
		{
			String linkText = matcher.group(3);
			matcher.appendReplacement(sb, Matcher.quoteReplacement(prefix + linkText + suffix));
		}
		matcher.appendTail(sb);
		return sb.toString();
	}

	/**
	 * Unwraps the elements matched by {@code tag}, keeping only their inner text.
	 *
	 * <p>{@code tag} is a regular expression. For {@link #regSpanTag} the inner
	 * text is taken from capture group 3; for any other pattern (including
	 * {@link #regStrongTag}) it is taken from capture group 2. Afterwards any
	 * remaining literal underline-span markers (for {@link #regSpanTag}) or
	 * {@code <strong>} markers (otherwise) are removed, which also covers
	 * elements the pattern could not match, e.g. ones spanning several lines.
	 *
	 * @param str text to process; {@code null} or empty is returned unchanged
	 * @param tag regular expression describing the element to unwrap
	 * @return the text with the matched elements unwrapped
	 * @throws IndexOutOfBoundsException if {@code tag} matches but declares fewer
	 *         capture groups than the group that is read
	 */
	public static String removeTag(String str,String tag)
	{
		if (str == null || str.isEmpty())
		{
			return str;
		}
		boolean isSpan = regSpanTag.equals(tag);
		int contentGroup = isSpan ? 3 : 2;
		Matcher matcher = Pattern.compile(tag).matcher(str);
		StringBuffer sb = new StringBuffer();
		while (matcher.find())
		{
			matcher.appendReplacement(sb, Matcher.quoteReplacement(matcher.group(contentGroup)));
		}
		matcher.appendTail(sb);
		if (isSpan)
		{
			return sb.toString().replace(spanTagPre, "").replace(spanTagSuf, "");
		}
		return sb.toString().replace(strongTagPre, "").replace(strongTagSuf, "");
	}

	/**
	 * Escapes {@code <}, {@code >}, {@code "} and {@code &} as HTML entities so
	 * that markup is displayed as text.
	 *
	 * @param input text to escape; may be {@code null}
	 * @return the escaped text, or {@code input} itself when it is {@code null}
	 *         or contains none of the four characters
	 */
	public String replaceTag(String input)
	{
		if(!hasSpecialChars(input))
		{
			return input;
		}
		StringBuilder filtered = new StringBuilder(input.length() + 16);
		for (int i = 0; i < input.length(); i++)
		{
			char c = input.charAt(i);
			switch (c)
			{
				case '<':
					filtered.append("&lt;");
					break;
				case '>':
					filtered.append("&gt;");
					break;
				case '"':
					filtered.append("&quot;");
					break;
				case '&':
					filtered.append("&amp;");
					break;
				default:
					filtered.append(c);
			}
		}
		return filtered.toString();
	}

	/**
	 * Tells whether the text contains any of {@code <}, {@code >}, {@code "}
	 * or {@code &}.
	 *
	 * @param input text to inspect; may be {@code null}
	 * @return {@code true} if at least one of the four characters is present,
	 *         {@code false} otherwise (including for {@code null} or empty input)
	 */
	public boolean hasSpecialChars(String input)
	{
		if (input == null)
		{
			return false;
		}
		for (int i = 0; i < input.length(); i++)
		{
			switch (input.charAt(i))
			{
				case '>':
				case '<':
				case '"':
				case '&':
					return true;
				default:
					break;
			}
		}
		return false;
	}

	/**
	 * Removes everything that starts with {@code <} and ends with {@code >}.
	 *
	 * @param str text to process; {@code null} or empty is returned unchanged
	 * @return the text without tags
	 */
	public static String filterHtml(String str)
	{
		if (str == null || str.isEmpty())
		{
			return str;
		}
		return HTML_TAG_PATTERN.matcher(str).replaceAll("");
	}

	/**
	 * Removes opening tags of the given name.
	 *
	 * <p>{@code tag} is inserted into the regular expression as-is, and the
	 * expression requires at least one whitespace character after the tag name,
	 * so attribute-less tags such as {@code <b>} and closing tags are not
	 * removed.
	 *
	 * @param str text to process; {@code null} or empty is returned unchanged
	 * @param tag tag name (regular-expression fragment) to remove
	 * @return the text without the matching opening tags
	 */
	public static String fiterHtmlTag(String str, String tag)
	{
		if (str == null || str.isEmpty())
		{
			return str;
		}
		String regxp = "<\\s*" + tag + "\\s+([^>]*)\\s*>";
		return Pattern.compile(regxp).matcher(str).replaceAll("");
	}

}

测试代码:

// Demo fragment: runs the sample text through the HtmlRegexpUtil helpers one
// after another, printing the intermediate result of every step. Each call
// receives the output of the previous one, so the order of the calls matters.
String content = "测试测试测试测试啦啦啦啦啦啦啦啊\n测试啦啦啦啦啦啦啦啦啦阿里啦\n<a href=\"/u/9\">@梦小胖</a> \n@萌二胖 \n<a href=\"/p/2317\">$贵州茅台(SH600519)$</a> \n$的上次你说的承诺(12580)$ \n<img class=\"img\" src=\"//192.168.1.20//static/img/picture/835/83592167b2d54d9496be896b94d1998b_originThu.jpg\">\n<span style=\"text-decoration:underline;\">xiaoming 下划线鲜花县</span>\n<span style=\"text-decoration:underline;\">\n</span>\n<span style=\"text-decoration:underline;\"><strong>下划线下划线加粗</strong>\n</span>";
		System.out.println("The original content : " + content);
		// Step 1: rewrite <img> tags, wrapping the extracted src value in {{ }}.
		content = HtmlRegexpUtil.handleImgTag(content, "{{", "}}");
		System.out.println("The 1st   content : " + content);
		// Step 2: rewrite emoticon <img> tags using an empty prefix and suffix.
		content = HtmlRegexpUtil.handleExpTag(content, "", "");
		System.out.println("The 2ed   content : " + content);
		// Step 3: rewrite <a> elements, wrapping the link text in < >.
		content = HtmlRegexpUtil.handleATag(content, "<", ">");
		System.out.println("The 3rd   content : " + content);
		// Step 4: unwrap elements matched by the span pattern.
		content = HtmlRegexpUtil.removeTag(content, HtmlRegexpUtil.regSpanTag);
		System.out.println("The 4th   content : " + content);
		// Step 5: unwrap elements matched by the strong pattern.
		content = HtmlRegexpUtil.removeTag(content, HtmlRegexpUtil.regStrongTag);
		System.out.println("The final content : " + content);



结果如下(注意:下面各步处理后的输出中,空格和换行已被去掉;而上面的测试代码本身并不会去除空白字符,实际运行时输出会保留原文中的空格和换行):

The original content : 测试测试测试测试啦啦啦啦啦啦啦啊
测试啦啦啦啦啦啦啦啦啦阿里啦
<a href="/u/9">@梦小胖</a> 
@萌二胖 
<a href="/p/2317">$贵州茅台(SH600519)$</a> 
$的上次你说的承诺(12580)$ 
<img class="img" src="//192.168.1.20//static/img/picture/835/83592167b2d54d9496be896b94d1998b_originThu.jpg">
<span style="text-decoration:underline;">xiaoming 下划线鲜花县</span>
<span style="text-decoration:underline;">
</span>
<span style="text-decoration:underline;"><strong>下划线下划线加粗</strong>
</span>
The 1st   content : 测试测试测试测试啦啦啦啦啦啦啦啊测试啦啦啦啦啦啦啦啦啦阿里啦<ahref="/u/9">@梦小胖</a>@萌二胖<ahref="/p/2317">$贵州茅台(SH600519)$</a>$的上次你说的承诺(12580)${{/192.168.1.20//static/img/picture/835/83592167b2d54d9496be896b94d1998b_originThu.jpg}}<spanstyle="text-decoration:underline;">xiaoming下划线鲜花县</span><spanstyle="text-decoration:underline;"></span><spanstyle="text-decoration:underline;"><strong>下划线下划线加粗</strong></span>
The 2ed   content : 测试测试测试测试啦啦啦啦啦啦啦啊测试啦啦啦啦啦啦啦啦啦阿里啦<ahref="/u/9">@梦小胖</a>@萌二胖<ahref="/p/2317">$贵州茅台(SH600519)$</a>$的上次你说的承诺(12580)${{/192.168.1.20//static/img/picture/835/83592167b2d54d9496be896b94d1998b_originThu.jpg}}<spanstyle="text-decoration:underline;">xiaoming下划线鲜花县</span><spanstyle="text-decoration:underline;"></span><spanstyle="text-decoration:underline;"><strong>下划线下划线加粗</strong></span>
The 3rd   content : 测试测试测试测试啦啦啦啦啦啦啦啊测试啦啦啦啦啦啦啦啦啦阿里啦<@梦小胖>@萌二胖<$贵州茅台(SH600519)$>$的上次你说的承诺(12580)${{/192.168.1.20//static/img/picture/835/83592167b2d54d9496be896b94d1998b_originThu.jpg}}<spanstyle="text-decoration:underline;">xiaoming下划线鲜花县</span><spanstyle="text-decoration:underline;"></span><spanstyle="text-decoration:underline;"><strong>下划线下划线加粗</strong></span>
The 4th   content : 测试测试测试测试啦啦啦啦啦啦啦啊测试啦啦啦啦啦啦啦啦啦阿里啦<@梦小胖>@萌二胖<$贵州茅台(SH600519)$>$的上次你说的承诺(12580)${{/192.168.1.20//static/img/picture/835/83592167b2d54d9496be896b94d1998b_originThu.jpg}}xiaoming下划线鲜花县<strong>下划线下划线加粗</strong>
The final content : 测试测试测试测试啦啦啦啦啦啦啦啊测试啦啦啦啦啦啦啦啦啦阿里啦<@梦小胖>@萌二胖<$贵州茅台(SH600519)$>$的上次你说的承诺(12580)${{/192.168.1.20//static/img/picture/835/83592167b2d54d9496be896b94d1998b_originThu.jpg}}xiaoming下划线鲜花县下划线下划线加粗


另外开源库Jsoup处理标签也很强大,有兴趣的同学可以学习学习

作者声明:有问题请联系794465942@qq.com


©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页