Java的正则引擎(java.util.regex)做不到未知层次的递归匹配,而且用正则实现效率也不会高,所以考虑用其他方法实现。对于.Net平台,可尝试一下这个正则:^[^<>]*(?:<(?!/)(?<o>[^\s<>]+)(?:\s[^<>]*)?(?:(?<-o>/>)|>)[^<>]*|(?<-o></\k<o>>[^<>]*))*(?(o)(?!))$ (若闭合返回true,否则返回false。不会忽略单双引号之间的内容,标签也不可以交叉)
如果要求严格可用xml方式验证,如<div id="test"name="test2"></div>(属性间没空格)等是不能通过的。如果不需要这么严格则可试一下下面的方法(忽略单双引号之间的内容,标签可以交叉):
若用ArrayList存放提取出来的标签效率不佳,就自己写一个TagsList,如下:
/**
* 存放提取出来的标签
* @author Liw
* @time 2007-6
*/
import java.util.Arrays;
/**
 * Minimal growable list of tag names backed by a {@code String[]}.
 *
 * <p>Removal is deliberately cheap: a removed entry is only overwritten with
 * {@code null}; later entries are not shifted and {@link #size()} does not
 * shrink. Callers that iterate must therefore skip {@code null} slots.
 */
class TagsList
{
    private String[] data;
    private int size = 0;

    /**
     * Creates a list with the given initial capacity.
     *
     * @param size initial capacity of the backing array
     */
    public TagsList(int size)
    {
        data = new String[size];
    }

    /** Creates a list with an initial capacity of 10. */
    public TagsList()
    {
        this(10);
    }

    /**
     * Appends a tag name, growing the backing array when it is full.
     *
     * @param str the tag name to store
     */
    public void add(String str)
    {
        ensureCapacity(size + 1);
        data[size++] = str;
    }

    /**
     * Returns the entry at {@code index}.
     *
     * @param index position to read
     * @return the stored name, or {@code null} when the slot was removed or
     *         {@code index} lies outside {@code [0, size())}
     */
    public String get(int index)
    {
        // A negative index used to fall through to the array access and throw.
        if (index >= 0 && index < size) {
            return data[index];
        }
        return null;
    }

    /**
     * Removes the most recently added entry equal to {@code str} by setting
     * its slot to {@code null}.
     *
     * @param str the tag name to remove; {@code null} never matches
     * @return {@code true} if a matching entry was found and cleared
     */
    public boolean remove(String str)
    {
        if (str == null) {
            return false;
        }
        // Search backwards so the innermost (latest) open tag is matched first.
        for (int index = size - 1; index >= 0; index--) {
            if (str.equals(data[index])) {
                data[index] = null;
                return true;
            }
        }
        return false;
    }

    /**
     * Clears the slot at {@code index}.
     *
     * @param index position to clear
     * @return {@code true} if {@code index} lies inside {@code [0, size())},
     *         {@code false} otherwise
     */
    public boolean remove(int index)
    {
        // Bound by size, not capacity: slots past size were never filled.
        if (index >= 0 && index < size) {
            data[index] = null;
            return true;
        }
        return false;
    }

    /**
     * Returns the number of slots that have been filled by {@link #add},
     * including slots that were later cleared by a remove call.
     *
     * @return the number of used slots
     */
    public int size()
    {
        return this.size;
    }

    /**
     * Grows the backing array so that it can hold at least {@code minSize}
     * entries. The array grows by roughly half of its current capacity.
     *
     * @param minSize the required minimum capacity
     */
    public void ensureCapacity(int minSize)
    {
        int oldCapacity = data.length;
        if (minSize > oldCapacity) {
            // Same value as oldCapacity * 3 / 2 + 1, but without the
            // intermediate multiplication that overflows for large arrays.
            int grown = oldCapacity + (oldCapacity >> 1) + 1;
            data = Arrays.copyOf(data, Math.max(grown, minSize));
        }
    }
}
下面的类提供了两个静态方法,用于检查和修复HTML标签:
/**
* 检查文本中的HTML标签是否闭合,并提供简单的修复功能
*
* @author Liw
* @time 2007-6
*/
public class TagsChecker
{
public static boolean check(String str)
{
TagsList[] unclosedTags = getUnclosedTags(str);
if (unclosedTags[0].size() != 0) {
return false;
}
for (int i = 0; i < unclosedTags[1].size(); i++) {
if (unclosedTags[1].get(i) != null)
return false;
}
return true;
}
public static String fix(String str)
{
StringBuilder fixed = new StringBuilder(); // 存放修复后的字符串
TagsList[] unclosedTags = getUnclosedTags(str);
// 生成新字符串
for (int i = unclosedTags[0].size() - 1; i > -1; i--) {
fixed.append("<" + unclosedTags[0].get(i) + ">");
}
fixed.append(str);
for (int i = unclosedTags[1].size() - 1; i > -1; i--) {
String s = null;
if ((s = unclosedTags[1].get(i)) != null) {
fixed.append("</" + s + ">");
}
}
return fixed.toString();
}
private static TagsList[] getUnclosedTags(String str)
{
StringBuilder temp = new StringBuilder(); // 存放标签
TagsList[] unclosedTags = new TagsList[2];
unclosedTags[0] = new TagsList(); // 前不闭合,如有</div>而前面没有<div>
unclosedTags[1] = new TagsList(); // 后不闭合,如有<div>而后面没有</div>
boolean flag = false; // 记录双引号"或单引号'
char currentJump = ' '; // 记录需要跳过'...'还是"..."
char current = ' ', last = ' '; // 当前 & 上一个
// 开始判断
for (int i = 0; i < str.length();) {
current = str.charAt(i++); // 读取一个字符
if (current == '"' || current == ''') {
flag = flag ? false : true; // 若为引号,flag翻转
currentJump = current;
if (flag) {
while (i < str.length() && str.charAt(i++) != currentJump)
; // 跳过引号之间的部分
flag = false;
}
}
else if (current == '<') { // 开始提取标签
current = str.charAt(i++);
if (current == '/') { // 标签的闭合部分,如</div>
current = str.charAt(i++);
// 读取标签
while (i < str.length() && current != '>') {
temp.append(current);
current = str.charAt(i++);
}
// 从tags_bottom移除一个闭合的标签
if (!unclosedTags[1].remove(temp.toString())) { // 若移除失败,说明前面没有需要闭合的标签
unclosedTags[0].add(temp.toString()); // 此标签需要前闭合
}
temp.delete(0, temp.length()); // 清空temp
}
else { // 标签的前部分,如<div>
last = current;
while (i < str.length() && current != ' '
&& current != ' ' && current != '>') {
temp.append(current);
last = current;
current = str.charAt(i++);
}
// 已经读取到标签,跳过其他内容,如<div id=test>跳过id=test
while (i < str.length() && current != '>') {
last = current;
current = str.charAt(i++);
if (current == '"' || current == ''') { // 判断双引号
flag = flag ? false : true;
currentJump = current;
if (flag) { // 若引号不闭合,跳过到下一个引号之间的内容
while (i < str.length()
&& str.charAt(i++) != currentJump)
;
current = str.charAt(i++);
flag = false;
}
}
}
if (last != '/' && current == '>') // 判断这种类型:<TagName />
unclosedTags[1].add(temp.toString());
temp.delete(0, temp.length());
}
}
}
return unclosedTags;
}
}
进行了一些测试:
/**
 * Console driver for {@code TagsChecker}: runs a functional check on two
 * sample strings, then times {@code check} and {@code fix} on a long input.
 */
public class Test
{
    /**
     * Prints the functional and timing results to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        System.out.println("--功能测试--");
        // str1 has two unopened closing tags and two unclosed opening tags;
        // the quoted attribute values contain '<' and '>' on purpose.
        String str1 = "tt</u>ss</a>aa<div name=\"<test>\" id='3' other='<test>'><b>sff";
        // str2 is fully balanced.
        String str2 = "tt<u>ss</u><div id=test name=\"<test>\"><a>fds</a></div>";
        System.out.println("检查文本 " + str1);
        System.out.println("结果:" + TagsChecker.check(str1));
        System.out.println("检查文本 " + str2);
        System.out.println("结果:" + TagsChecker.check(str2));
        System.out.println("修复文本 " + str1);
        System.out.println("结果:" + TagsChecker.fix(str1));

        // Double the sample ten times (x1024) to get a long input for timing.
        for (int i = 0; i < 10; i++) {
            str1 += str1;
        }
        System.out.println();
        System.out.println("--效率测试--");
        System.out.println("文本长度:" + str1.length());
        long t1 = System.currentTimeMillis();
        boolean closed = TagsChecker.check(str1);
        long t2 = System.currentTimeMillis();
        // Only the elapsed time matters here, so the repaired text is dropped.
        TagsChecker.fix(str1);
        long t3 = System.currentTimeMillis();
        System.out.println("检查用时:" + (t2 - t1) + " 毫秒 结果:" + closed);
        System.out.println("修复用时:" + (t3 - t2) + " 毫秒");
    }
}
---------------------
作者:crazygou
来源:CSDN
原文:https://blog.csdn.net/CrazyGou/article/details/1643094