处理文章截取时对html的处理

最新推荐文章于 2019-10-17 10:55:37 发布

Robin-LV

最新推荐文章于 2019-10-17 10:55:37 发布

阅读量118

点赞数

文章标签： HTML 正则表达式 JSP Servlet log4j

本文链接：https://blog.csdn.net/robin_lvxiongbin/article/details/83735098

版权

package people.util;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.servlet.jsp.JspException;
import javax.servlet.jsp.JspWriter;
import javax.servlet.jsp.tagext.TagSupport;

import org.apache.log4j.Logger;

public class CutHtml extends TagSupport {

private static final long serialVersionUID = 1L;
protected final Logger log = Logger.getLogger(getClass());

private static String htmlMatch = "";
private String param;
private int length;
private String endWith;

public int doStartTag() throws JspException {
JspWriter out = pageContext.getOut();
String str = null;
try {
str = subStringHTML(param, length, endWith);
str=str.replaceAll("<img", "<img οnlοad=\"javascript:DrawImage(this,200,150)\"");
str=str.replaceAll("<IMG", "<IMG οnlοad=\"javascript:DrawImage(this,200,150)\"");
out.print(str);
} catch (Exception e) {
log.error("", e);
}
return super.doStartTag();
}

// 通过递归删除html文件中的配对的html标签

public static String removeMatchHtmlTag() {

Pattern p = Pattern.compile("<([a-zA-Z]+)[^<>]*>(.*?)</\\1>");
Matcher m = p.matcher(htmlMatch);

if (m.find()) {

htmlMatch = htmlMatch.replaceAll("<([a-zA-Z]+)[^<>]*>(.*?)</\\1>", "$2");

removeMatchHtmlTag();
}

return htmlMatch;
}

public static String subStringHTML(String par, int len, String end) {

if (len < 1) {

return null;
}

if (par.length() < len) {
return par;
}

StringBuffer result = new StringBuffer();
StringBuffer str = new StringBuffer();
int n = 0;

char temp;

boolean isCode = false; // 是不是HTML代码
boolean isHTML = false; // 是不是HTML特殊字符,如
for (int i = 0; i < par.length(); i++) {
temp = par.charAt(i);
if (temp == '<') {
isCode = true;
} else if (temp == '&') {
isHTML = true;
} else if (temp == '>' && isCode) {
n = n - 1;
isCode = false;
} else if (temp == ';' && isHTML) {
isHTML = false;
}
if (!isCode && !isHTML) {
n = n + 1;
if ((temp + "").getBytes().length > 1) {
n = n + 1;
}
str.append(temp);
}
result.append(temp);
if (n >= len) {
break;
}
}

result.append(end);
// 取出截取字符串中的HTML标记
String temp_result = result.toString().replaceAll("(>)[^<>]*(<?)", "$1$2");

// 去掉不需要结束标记的HTML标记

temp_result = temp_result
.replaceAll(
"<(AREA|BASE|BASEFONT|BODY|BR|COL|COLGROUP|DD|DT|FRAME|HEAD|HR|HTML|IMG|INPUT|ISINDEX|LI|LINK|META|OPTION|P|PARAM|TBODY|TD|TFOOT|TH|THEAD|TR|area|base|basefont|body|br|col|colgroup|dd|dt|frame|head|hr|html|img|input|isindex|li|link|meta|option|p|param|tbody|td|tfoot|th|thead|tr)[^<>]*/>",
"");

// 去掉成对的HTML标记
htmlMatch = temp_result;
temp_result = removeMatchHtmlTag();

// 用正则表达式取出标记

Pattern p = Pattern.compile("<([a-zA-Z]+)[^<>]*>");
Matcher m = p.matcher(temp_result);
List<String> endHTML = new ArrayList<String>();

while (m.find()) {
endHTML.add(m.group(1));
}

// 补全不成对的HTML标记
for (int i = endHTML.size() - 1; i >= 0; i--) {
result.append("</");
result.append(endHTML.get(i));
result.append(">");
}
return result.toString();

}

public String getParam() {
return param;
}

public void setParam(String param) {
this.param = param;
}

public int getLength() {
return length;
}

public void setLength(int length) {
this.length = length;
}

public String getEndWith() {
return endWith;
}

public void setEndWith(String endWith) {
this.endWith = endWith;
}

}

Robin-LV

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
处理文章截取时对html的处理

package people.util;import java.util.ArrayList;import java.util.List;import java.util.regex.Matcher;import java.util.regex.Pattern;import javax.servlet.jsp.JspException;import javax.se...
复制链接

扫一扫