org.jsoup.safety.Whitelist是个白名单, 定义了什么html元素或者属性可以通过, 而其他的所有内容都将被删除。
org.jsoup.safety.Cleaner是清理器,在创建Cleaner对象时告诉它白名单是什么,然后就可以用于清理危险元素和脚本了。
如下所示,清理document里的危险元素和脚本。
// Build a cleaner around jsoup's built-in "relaxed" whitelist, then replace
// the document with its sanitized copy.
Whitelist relaxedRules = Whitelist.relaxed();
Cleaner cleaner = new Cleaner(relaxedRules);
document = cleaner.clean(document);
上面的代码使用了Whitelist内置的一份白名单(relaxed),当然也可以继承Whitelist来自定义白名单,例如:
/**
 * Custom jsoup whitelist: declares which tags, attributes and URL protocols
 * survive cleaning. Anything not registered here is subject to the base
 * {@code Whitelist} rules, with one extra allowance for {@code <img src>}
 * (see {@link #isSafeAttribute}).
 */
public class MyWhitelist extends Whitelist {

    /** Registers the permitted tags, per-tag attributes and URL protocols. */
    public MyWhitelist() {
        this.addTags(
                "a", "abbr", "address", "area", "article", "aside", "audio",
                "b", "bdi", "bdo", "big", "blockquote", "br",
                "caption", "center", "cite", "code", "col", "colgroup",
                "dd", "del", "details", "div", "dl", "dt",
                "em", "font", "footer",
                "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr",
                "i", "img", "ins", "li", "mark", "nav", "ol",
                "p", "pre", "q", "s", "small", "span", "strike", "strong", "sub", "sup",
                "table", "tbody", "td", "tfoot", "th", "thead", "tr", "tt",
                "u", "ul", "video");

        // Attributes permitted on specific tags.
        this.addAttributes("a", "target", "href", "title");
        this.addAttributes("abbr", "title");
        this.addAttributes("area", "shape", "coords", "href", "alt");
        this.addAttributes("audio", "autoplay", "controls", "loop", "preload", "src");
        this.addAttributes("bdi", "dir");
        this.addAttributes("bdo", "dir");
        this.addAttributes("blockquote", "cite");
        this.addAttributes("col", "align", "valign", "span");
        this.addAttributes("colgroup", "align", "valign", "span");
        this.addAttributes("del", "datetime");
        this.addAttributes("details", "open");
        this.addAttributes("font", "color", "size", "face");
        this.addAttributes("img", "align", "alt", "src", "title", "_src", "loadingclass", "data-latex");
        this.addAttributes("ins", "datetime");
        this.addAttributes("ol", "start");
        this.addAttributes("q", "cite");
        this.addAttributes("table", "summary", "border", "align", "valign");
        this.addAttributes("tbody", "align", "valign");
        this.addAttributes("tfoot", "align", "valign");
        this.addAttributes("thead", "align", "valign");
        this.addAttributes("td", "abbr", "axis", "colspan", "rowspan", "align", "valign");
        this.addAttributes("th", "abbr", "axis", "colspan", "rowspan", "scope", "align", "valign");
        this.addAttributes("tr", "rowspan", "align", "valign");
        this.addAttributes("video", "src", "autoplay", "controls", "loop", "muted", "poster", "preload");

        // Attributes permitted on every whitelisted tag.
        // NOTE(review): "style" is allowed everywhere and its value is not inspected here — confirm that is intended.
        this.addAttributes(":all", "id", "name", "class", "style", "height", "width", "type");

        // URL-valued attributes are limited to these protocols.
        this.addProtocols("a", "href", "ftp", "http", "https", "mailto", "tel");
        this.addProtocols("blockquote", "cite", "http", "https");
        // NOTE(review): no "cite" attribute is registered for the <cite> tag above, so this rule looks inert — confirm.
        this.addProtocols("cite", "cite", "http", "https");
        this.addProtocols("img", "src", "http", "https");
        this.addProtocols("q", "cite", "http", "https");
    }

    /**
     * Overrides the superclass attribute check. An attribute accepted by the
     * superclass is always accepted. An {@code <img src>} rejected by the
     * superclass is still accepted unless its value looks like a
     * {@code javascript:} URL. Every other rejected attribute stays rejected.
     *
     * @param tagName name of the tag being checked
     * @param el      element that carries the attribute
     * @param attr    attribute under test
     * @return {@code true} if the attribute may be kept
     */
    protected boolean isSafeAttribute(String tagName, Element el, Attribute attr) {
        if (super.isSafeAttribute(tagName, el, attr)) {
            return true;
        }
        if ("img".equals(tagName) && "src".equals(attr.getKey())) {
            return !hasJavascriptScheme(attr.getValue());
        }
        return false;
    }

    /**
     * Reports whether {@code value} contains {@code javascript:} once ASCII
     * tab, CR and LF are removed and the text is lower-cased.
     *
     * @param value raw attribute value, may be {@code null}
     * @return {@code true} if the marker is present; {@code false} for {@code null}
     */
    private static boolean hasJavascriptScheme(String value) {
        if (value == null) {
            return false;
        }
        // URL parsers drop ASCII tab/newline characters, so "java\tscript:" must be caught as well.
        String compact = value.replace("\t", "").replace("\n", "").replace("\r", "");
        // Locale.ROOT: under a Turkish default locale "I" lower-cases to a dotless "ı",
        // which would let "JAVASCRIPT:" slip past the check.
        return compact.toLowerCase(java.util.Locale.ROOT).contains("javascript:");
    }
}