解析 XML 文档构造 HTML,解析格式错误的 XML 文档(如 HTML 文件)

public class HtmlSanitizer {

private HtmlSanitizer() {

}

private static final Set VALID_ELEMENTS = Sets.newHashSet(DIV, BR,

P, B, I, OL, UL, LI, A, STRONG, SPAN, EM, TT, IMG);

private static final Set VALID_ATTRIBUTES = Sets.newHashSet("id",

"class", "href", "target", "title", "src");

private static final Object VALID_MARKER = new Object();

public static void sanitize(Reader r, Writer w) {

try {

sanitize(new Source(r)).writeTo(w);

w.flush();

r.close();

} catch (IOException ioe) {

throw new RuntimeException("error during sanitize", ioe);

}

}

public static OutputDocument sanitize(Source source) {

source.fullSequentialParse();

OutputDocument doc = new OutputDocument(source);

List tags = source.getAllTags();

int pos = 0;

for (Tag tag : tags) {

if (processTag(tag, doc))

tag.setUserData(VALID_MARKER);

else

doc.remove(tag);

reencodeTextSegment(source, doc, pos, tag.getBegin());

pos = tag.getEnd();

}

reencodeTextSegment(source, doc, pos, source.getEnd());

return doc;

}

private static boolean processTag(Tag tag, OutputDocument doc) {

String elementName = tag.getName();

if (!VALID_ELEMENTS.contains(elementName))

return false;

if (tag.getTagType() == StartTagType.NORMAL) {

Element element = tag.getElement();

if (HTMLElements.getEndTagRequiredElementNames().contains(

elementName)) {

if (element.getEndTag() == null)

return false;

} else if (HTMLElements.getEndTagOptionalElementNames().contains(

elementName)) {

if (elementName == HTMLElementName.LI && !isValidLITag(tag))

return false;

if (element.getEndTag() == null)

doc.insert(element.getEnd(), getEndTagHTML(elementName));

}

doc.replace(tag, getStartTagHTML(element.getStartTag()));

} else if (tag.getTagType() == EndTagType.NORMAL) {

if (tag.getElement() == null)

return false;

if (elementName == HTMLElementName.LI && !isValidLITag(tag))

return false;

doc.replace(tag, getEndTagHTML(elementName));

} else {

return false;

}

return true;

}

private static boolean isValidLITag(Tag tag) {

Element parentElement = tag.getElement().getParentElement();

if (parentElement == null

|| parentElement.getStartTag().getUserData() != VALID_MARKER)

return false;

return parentElement.getName() == HTMLElementName.UL

|| parentElement.getName() == HTMLElementName.OL;

}

private static void reencodeTextSegment(Source source, OutputDocument doc,

int begin, int end) {

if (begin >= end)

return;

Segment textSegment = new Segment(source, begin, end);

String encodedText = encode(decode(textSegment));

doc.replace(textSegment, encodedText);

}

private static CharSequence getStartTagHTML(StartTag startTag) {

StringBuilder sb = new StringBuilder();

sb.append('

for (Attribute attribute : startTag.getAttributes()) {

if (VALID_ATTRIBUTES.contains(attribute.getKey())) {

sb.append(' ').append(attribute.getName());

if (attribute.getValue() != null) {

sb.append("="");

sb.append(CharacterReference.encode(attribute.getValue()));

sb.append('"');

}

}

}

if (startTag.getElement().getEndTag() == null

&& !HTMLElements.getEndTagOptionalElementNames().contains(

startTag.getName()))

sb.append('/');

sb.append('>');

return sb;

}

private static String getEndTagHTML(String tagName) {

return "" + tagName + '>';

}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值