使用状态机解析HTML语法

采用逐字符匹配并映射到状态的方式(即状态机),解析HTML文档语法,识别标签、文本、属性及属性值等。

下面列出主要思路和核心代码。

if (DocStatus.Read_StartTag.equals(docStatus)) {
	if (TagStatus.Read_Tag_Start.equals(tagStatus)) {
		tagStatus = TagStatus.Read_Tag_Type;
		tagName = "";
		attrs.clear();
	}
	else if (TagStatus.Read_Tag_Type.equals(tagStatus)) {
		if (ch == '!') {
			tagType = "declare";
			tagStatus = TagStatus.Read_Tag_Name;
		}
		else if (ch == '/') {
			tagType = "close";
			tagStatus = TagStatus.Read_Tag_Name;
		}
		else if (ch == '>') {
			tagStatus = TagStatus.Read_Tag_End;
		}
		else if (Character.isLetter(ch)) {
			tagType = "open";
			tagStatus = TagStatus.Read_Tag_Name;
		}
			
	}
	else if (TagStatus.Read_Continue.equals(tagStatus)) {
		if (Character.isWhitespace(ch))
			tagStatus = TagStatus.Read_Continue;
		else if (ch == '/')
			tagStatus = TagStatus.Read_Tag_WillEnd;
		else {
			tagStatus = TagStatus.Read_Tag_AttrName;
			attrName = "";
			attrValue = "";
		}
	}
	else if (TagStatus.Read_Tag_AttrName_White.equals(tagStatus)) {
		if (Character.isWhitespace(ch))
			tagStatus = TagStatus.Read_Tag_AttrName_White;
		else if (ch == '=')
			tagStatus = TagStatus.Read_Tag_WillAttrValue;
		else if (Character.isLetter(ch))
			tagStatus = TagStatus.Read_Tag_Name;
	}
	else if (TagStatus.Read_Tag_AttrValue_White.equals(tagStatus)) {
		if (Character.isWhitespace(ch))
			tagStatus = TagStatus.Read_Tag_AttrValue_White;
		else if (Character.isLetter(ch))
			tagStatus = TagStatus.Read_Tag_AttrValue;
	}
	else if (TagStatus.Read_Tag_WillAttrValue.equals(tagStatus)) {
		if (Character.isWhitespace(ch)) 
			tagStatus = TagStatus.Read_Tag_WillAttrValue;
		else
			tagStatus = TagStatus.Read_Tag_AttrValue;
	}
	
	if (TagStatus.Read_Tag_Name.equals(tagStatus)) {
		if (Character.isWhitespace(ch))
			tagStatus = TagStatus.Read_Continue;
		else if (ch == '>')
			tagStatus = TagStatus.Read_Tag_End;
		else 
			tagName += ch;
	}
	else if (TagStatus.Read_Tag_AttrName.equals(tagStatus)) {
		if (Character.isWhitespace(ch)) {
			tagStatus = TagStatus.Read_Continue;
			attrs.add(new AbstractMap.SimpleEntry<>(attrName, null));
			attrName = "";
			attrValue = "";
		}
		else if (ch == '=') {
			tagStatus = TagStatus.Read_Tag_WillAttrValue;
			attrs.add(new AbstractMap.SimpleEntry<>(attrName, null));
		}
		else if (ch == '>') {
			attrs.add(new AbstractMap.SimpleEntry<>(attrName, null));
			tagStatus = TagStatus.Read_Tag_End;
		}
		else 
			attrName += ch;
	}
	else if (TagStatus.Read_Tag_AttrValue.equals(tagStatus)) {
		if (Character.isWhitespace(ch)) {
			tagStatus = TagStatus.Read_Continue;
			AbstractMap.SimpleEntry<String, String> e = attrs.getLast();
			e.setValue(attrValue);
		}
		else if (ch == '>') {
			tagStatus = TagStatus.Read_Tag_End;
			AbstractMap.SimpleEntry<String, String> e = attrs.getLast();
			e.setValue(attrValue);
		}
		else if (ch == '\'')
			tagStatus = TagStatus.Read_Tag_AttrValue_Quote;
		else if (ch == '"') 
			tagStatus = TagStatus.Read_Tag_AttrValue_DoubleQuote;
		else 
			attrValue += ch;
	}
	else if (TagStatus.Read_Tag_AttrValue_Quote.equals(tagStatus)) {
		if (ch == '\'')
			tagStatus = TagStatus.Read_Tag_AttrValue;
		else
			attrValue += ch;
	}
	else if (TagStatus.Read_Tag_AttrValue_DoubleQuote.equals(tagStatus)) {
		if (ch == '"')
			tagStatus = TagStatus.Read_Tag_AttrValue;
		else 
			attrValue += ch;
	}
	else if (TagStatus.Read_Tag_WillEnd.equals(tagStatus)) {
		if (ch == '>') {
			tagType = "standard";
			tagStatus = TagStatus.Read_Tag_End;
		}
		else
			tagStatus = TagStatus.Read_Continue;
	}
	
	if (TagStatus.Read_Tag_End.equals(tagStatus)) {
		AbstractNode node = null;
		if (tagType.equals("declare")) {
			node = new DeclareNode(tagName);
			node.addAttrs(attrs);
			
			root.peek().addNode(node);
			stack.pop();
			docStatus = DocStatus.Read_Any;
			stack.push(docStatus);

		}
		else if (tagType.equals("open")) {
			node = new ElementNode(tagName);
			node.addAttrs(attrs);
			root.peek().addNode(node);
			root.push(node);
			
			if (tagName.toUpperCase().equals("META")) {
				root.pop();
				stack.pop();
				docStatus = DocStatus.Read_Any;
				stack.push(docStatus);
			}
			else if (tagName.toUpperCase().equals("SCRIPT")) {
				docStatus = DocStatus.Read_Script;
				stack.push(docStatus);
				tagStatus = TagStatus.Read_Script_Start;
				sb.setLength(0);
			}
			else {
				docStatus = DocStatus.Read_Any;
				stack.push(docStatus);
			}
		}
		else if (tagType.equals("standard")) {
			node = new ElementNode(tagName);
			node.addAttrs(attrs);
			root.peek().addNode(node);
			stack.pop();
			docStatus = DocStatus.Read_Any;
			stack.push(docStatus);
		}
		else if (tagType.equals("close")) {
			stack.pop();
			AbstractNode p = root.peek();
			if (tagName.equals("/"+p.getName())) {
				root.pop();
				stack.pop();
			}
			docStatus = DocStatus.Read_Any;
			stack.push(docStatus);
				
		}
	}
}
sb.append(ch);
if (DocStatus.Read_Any.equals(docStatus)) {
	//System.out.print(sb.toString());
	sb.setLength(0);
}

对一段简单的HTML进行测试,解析结果如下:

状态转换方式的代码对HTML语法解析结果

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值