防止存储式XSS攻击过滤HTML
package net.sf.xsshtmlfilter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
*
* HTML filtering utility for protecting against XSS (Cross Site Scripting).
*
* This code is licensed LGPLv3
*
* This code is a Java port of the original work in PHP by Cal Hendersen.
* http://code.iamcal.com/php/lib_filter/
*
* The trickiest part of the translation was handling the differences in regex handling
* between PHP and Java. These resources were helpful in the process:
*
* http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html
* http://us2.php.net/manual/en/reference.pcre.pattern.modifiers.php
* http://www.regular-expressions.info/modifiers.html
*
* A note on naming conventions: instance variables are prefixed with a "v"; global
* constants are in all caps.
*
* Sample use:
* String input = ...
* String clean = new HTMLFilter().filter( input );
*
* The class is not thread safe. Create a new instance if in doubt.
*
* If you find bugs or have suggestions on improvement (especially regarding
* performance), please contact us. The latest version of this
* source, and our contact details, can be found at http://xss-html-filter.sf.net
*
* @author Joseph O'Connell
* @author Cal Hendersen
* @author Michael Semb Wever
*/
public final class HTMLFilter {
/** regex flag union representing /si modifiers in php **/
private static final int REGEX_FLAGS_SI = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
private static final Pattern P_COMMENTS = Pattern.compile("<!--(.*?)-->", Pattern.DOTALL);
private static final Pattern P_COMMENT = Pattern.compile("^!--(.*)--$", REGEX_FLAGS_SI);
private static final Pattern P_TAGS = Pattern.compile("<(.*?)>", Pattern.DOTALL);
private static final Pattern P_END_TAG = Pattern.compile("^/([a-z0-9]+)", REGEX_FLAGS_SI);
private static final Pattern P_START_TAG = Pattern.compile("^([a-z0-9]+)(.*?)(/?)$", REGEX_FLAGS_SI);
private static final Pattern P_QUOTED_ATTRIBUTES = Pattern.compile("([a-z0-9]+)=([\"'])(.*?)\\2", REGEX_FLAGS_SI);
private static final Pattern P_UNQUOTED_ATTRIBUTES = Pattern.compile("([a-z0-9]+)(=)([^\"\\s']+)", REGEX_FLAGS_SI);
private static final Pattern P_PROTOCOL = Pattern.compile("^([^:]+):", REGEX_FLAGS_SI);
private static final Pattern P_ENTITY = Pattern.compile("&#(\\d+);?");
private static final Pattern P_ENTITY_UNICODE = Pattern.compile("&#x([0-9a-f]+);?");
private static final Pattern P_ENCODE = Pattern.compile("%([0-9a-f]{2});?");
private static final Pattern P_VALID_ENTITIES = Pattern.compile("&([^&;]*)(?=(;|&|$))");
private static final Pattern P_VALID_QUOTES = Pattern.compile("(>|^)([^<]+?)(<|$)", Pattern.DOTALL);
private static final Pattern P_END_ARROW = Pattern.compile("^>");
private static final Pattern P_BODY_TO_END = Pattern.compile("<([^>]*?)(?=<|$)");
private static final Pattern P_XML_CONTENT = Pattern.compile("(^|>)([^<]*?)(?=>)");
private static final Pattern P_STRAY_LEFT_ARROW = Pattern.compile("<([^>]*?)(?=<|$)");
private static final Pattern P_STRAY_RIGHT_ARROW = Pattern.compile("(^|>)([^<]*?)(?=>)");
private static final Pattern P_AMP = Pattern.compile("&");
private static final Pattern P_QUOTE = Pattern.compile("\"");
private static final Pattern P_LEFT_ARROW = Pattern.compile("<");
private static final Pattern P_RIGHT_ARROW = Pattern.compile(">");
private static final Pattern P_BOTH_ARROWS = Pattern.compile("<>");
// @xxx could grow large... maybe use sesat's ReferenceMap
private static final ConcurrentMap<String,Pattern> P_REMOVE_PAIR_BLANKS = new ConcurrentHashMap<String, Pattern>();
private static final ConcurrentMap<String,Pattern> P_REMOVE_SELF_BLANKS = new ConcurrentHashMap<String, Pattern>();
/** set of allowed html elements, along with allowed attributes for each element **/
private final Map<String, List<String>> vAllowed;
/** counts of open tags for each (allowable) html element **/
private final Map<String, Integer> vTagCounts