Java解析HTML标签

# 基于状态机思想实现的 HTML 标签(TAG)解析器;后续有时间的话,会尝试再写一个 JSON 解析工具。

import java.util.HashMap;
import java.util.Map;

/**
 * Small state-machine parser for a single HTML start tag such as
 * {@code <script type="text/javascript" src='a.js' />}.
 *
 * <p>After {@link #parser()} succeeds, the tag name is available through
 * {@link #getTagName()} and attribute values through {@link #getAttr(String)}.
 * Attribute values must be quoted with {@code "} or {@code '}; attribute
 * lookups are case-sensitive. Any malformed or truncated input is reported
 * as a {@link SymbolError}.
 *
 * <p>Instances hold parsing state and are meant to parse one tag each.
 */
public class TagParser {
	/* Number of attributes completed so far; also the key of the attribute being built. */
	private int index = 0;
	/* Characters of the tag being parsed. */
	private char[] tagChar;
	/* Offset of the next unread character in tagChar. */
	private int position = 0;
	/* Quote character that opened the attribute value currently being read. */
	private char symbol = '"';
	public static final String START_SCRIPT = "<";
	public static final String END_SCRIPT = ">";
	public static final String END_SCRIPT_1 = "/>";
	public static final String EQ = "=";
	public static final char SPACE = ' ';
	public static final String MUST_SPACE = " ";
	public static final String TAB = "	";
	public static final String SYMBOL = "'";
	private DFAStatus status;
	/* Attributes in parse order, keyed by their ordinal. */
	private Map<Integer, Entity> map = new HashMap<Integer, Entity>();
	/* Attribute name -> value, filled once the end of the tag is reached. */
	private Map<String, String> result = new HashMap<String, String>();
	private String tagName;

	/**
	 * Creates a parser over the given tag text. Nothing is parsed until
	 * {@link #parser()} is called.
	 *
	 * @param str the tag text, e.g. {@code <a href="x">}
	 */
	public TagParser(String str) {
		this.tagChar = str.toCharArray();
	}

	/**
	 * Convenience factory: creates a parser for {@code htmlTag} and runs it.
	 *
	 * @param htmlTag the tag text
	 * @return the parser, already run
	 * @throws SymbolError if the tag is malformed or truncated
	 */
	public static TagParser parser(String htmlTag) throws SymbolError {
		TagParser tag = new TagParser(htmlTag);
		tag.parser();
		return tag;
	}

	/** Demo entry point: parses a sample tag twice and prints one attribute, the tag name and the elapsed time. */
	public static void main(String[] args) throws SymbolError {
		String tag = "<script  filter=\"a\'\" type=\"text/javascript\" id='node' src=\"http://www.test.com/abc.js\" async=\"true\" />";
		long start = System.currentTimeMillis();
		for (int i = 0; i < 2; i++) {
			TagParser token = new TagParser(tag);
			token.parser();
			System.out.println(token.getAttr("src") + "|" + token.getTagName());
		}
		System.out.println("use time:" + (System.currentTimeMillis() - start));
	}

	/**
	 * Returns the tag name found by {@link #parser()}, or {@code null} if it has not been parsed.
	 */
	public String getTagName() {
		return this.tagName;
	}

	/**
	 * Returns the value of the named attribute, or {@code null} if the tag has no such
	 * attribute (or parsing has not completed). The lookup is case-sensitive.
	 */
	public String getAttr(String name) {
		return result.get(name);
	}

	/**
	 * Runs the state machine until the end of the tag ({@code >} or {@code />}) is reached.
	 * Calling it again after a successful parse does nothing.
	 *
	 * @throws SymbolError if the input is not a well-formed start tag, including when
	 *                     the input ends before the closing {@code >}
	 */
	public void parser() throws SymbolError {
		if (status == null) {
			status = DFAStatus.UNSTART;
			skipSpace();
		}
		// Iterative on purpose: one loop pass per state transition instead of one
		// recursive call, so long attribute lists cannot exhaust the stack.
		while (status != DFAStatus.DONE) {
			switch (status) {
			case UNSTART:
				if (!startsWith(START_SCRIPT)) {
					throw syntaxError();
				}
				position += START_SCRIPT.length();
				status = DFAStatus.TAG_NAME;
				break;
			case TAG_NAME:
				parserTagName();
				status = DFAStatus.START;
				break;
			case START:
				nextSpace();
				break;
			case NULL:
				skipSpace();
				if (startsWith(END_SCRIPT_1) || startsWith(END_SCRIPT)) {
					status = DFAStatus.DONE;
					done();
				} else {
					parserName();
				}
				break;
			case EQ:
				parserVal();
				break;
			case SYMBOL_END:
				status = DFAStatus.NULL;
				break;
			default:
				// SYMBOL_START is only ever a transient state inside parserVal().
				throw syntaxError();
			}
		}
	}

	@Override
	public String toString() {
		return result.toString();
	}

	/**
	 * Tests whether the unread input begins with {@code str}. A lowercase ASCII letter
	 * in {@code str} also matches its uppercase form in the input.
	 */
	private boolean startsWith(String str) {
		int length = str.length();
		if (position + length > tagChar.length) {
			return false;
		}
		for (int i = 0; i < length; i++) {
			char expected = str.charAt(i);
			char actual = tagChar[position + i];
			if (actual == expected) {
				continue;
			}
			// 32 is the distance between 'a' and 'A' in ASCII.
			if (is_az(expected) && actual == expected - 32) {
				continue;
			}
			return false;
		}
		return true;
	}

	/** Reads the tag name at the current position and stores it in {@code tagName}. */
	private void parserTagName() throws SymbolError {
		this.tagName = readName();
	}

	/** Reads an attribute name followed by {@code =} and registers a pending attribute. */
	private void parserName() throws SymbolError {
		String name = readName();
		skipSpace();
		nextEQ();
		map.put(index, new Entity(name));
	}

	/**
	 * Reads a quoted attribute value and attaches it to the pending attribute.
	 * The value ends at the first occurrence of the quote character that opened it.
	 */
	private void parserVal() throws SymbolError {
		skipSpace();
		if (position >= tagChar.length || (tagChar[position] != '\'' && tagChar[position] != '"')) {
			throw syntaxError();
		}
		symbol = tagChar[position];
		status = DFAStatus.SYMBOL_START;
		position++;
		int valueStart = position;
		while (position < tagChar.length && tagChar[position] != symbol) {
			position++;
		}
		if (position >= tagChar.length) {
			// Input ended before the closing quote.
			throw syntaxError();
		}
		map.get(index).value = new String(tagChar, valueStart, position - valueStart);
		position++; // consume the closing quote
		status = DFAStatus.SYMBOL_END;
		index++;
	}

	/** Consumes the {@code =} that must follow an attribute name. */
	private void nextEQ() throws SymbolError {
		if (!startsWith(EQ)) {
			throw syntaxError();
		}
		position += EQ.length();
		status = DFAStatus.EQ;
	}

	/**
	 * Checks what follows the tag name: either whitespace (consumed) or the end of
	 * the tag (left in place for the NULL state to handle). Anything else is an error.
	 */
	private void nextSpace() throws SymbolError {
		if (position < tagChar.length && isSpace(tagChar[position])) {
			position++;
		} else if (!startsWith(END_SCRIPT_1) && !startsWith(END_SCRIPT)) {
			throw syntaxError();
		}
		status = DFAStatus.NULL;
	}

	/** Advances past any run of whitespace at the current position. */
	private void skipSpace() {
		while (position < tagChar.length && isSpace(tagChar[position])) {
			position++;
		}
	}

	/** Publishes the parsed attributes; when a name repeats, the later occurrence wins. */
	private void done() {
		for (int i = 0; i < index; i++) {
			Entity entity = map.get(i);
			if (entity != null) {
				result.put(entity.name, entity.value);
			}
		}
	}

	/**
	 * Reads a tag or attribute name: an ASCII letter followed by any number of ASCII
	 * letters, digits, {@code -}, {@code _} or {@code :}.
	 *
	 * @throws SymbolError if no name starts at the current position
	 */
	private String readName() throws SymbolError {
		int nameStart = position;
		while (position < tagChar.length && isNameChar(tagChar[position], position == nameStart)) {
			position++;
		}
		if (position == nameStart) {
			throw syntaxError();
		}
		return new String(tagChar, nameStart, position - nameStart);
	}

	private boolean isNameChar(char chat, boolean first) {
		if (is_az(chat) || is_AZ(chat)) {
			return true;
		}
		if (first) {
			return false;
		}
		return (chat >= '0' && chat <= '9') || chat == '-' || chat == '_' || chat == ':';
	}

	private boolean isSpace(char chat) {
		return chat == SPACE || chat == '\t' || chat == '\n' || chat == '\r';
	}

	private boolean is_AZ(char chat) {
		return chat >= 'A' && chat <= 'Z';
	}

	private boolean is_az(char chat) {
		return chat >= 'a' && chat <= 'z';
	}

	/**
	 * Builds the error for the current position. Guards against running off the end of
	 * the input so truncated tags are reported as a SymbolError, not an index error.
	 */
	private SymbolError syntaxError() {
		if (position < tagChar.length) {
			return new SymbolError("语法错误:" + tagChar[position]);
		}
		return new SymbolError("语法错误:unexpected end of input");
	}

	/** Thrown when the input is not a well-formed start tag. */
	public static class SymbolError extends Exception {
		private static final long serialVersionUID = 2441411373778495898L;

		public SymbolError(String msg) {
			super(msg);
		}
	}

	/** One attribute: its name and, once parsed, its value. */
	public static class Entity {
		public Entity(String name) {
			this.name = name;
		}

		public String name;
		public String value;

		@Override
		public String toString() {
			return "[" + name + ":" + value + "]";
		}
	}

	/** States of the tag-parsing state machine. */
	public enum DFAStatus {
		UNSTART, START, TAG_NAME, SYMBOL_START, SYMBOL_END, DONE, NULL, EQ
	}
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值