1、安装并引用 HtmlCleaner 和 Saxon
在 Maven 项目的 pom.xml 中添加以下依赖:
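<dependency>
    <groupId>net.sourceforge.htmlcleaner</groupId>
    <artifactId>htmlcleaner</artifactId>
    <!-- 请按需补充 <version> -->
</dependency>
<dependency>
    <groupId>net.sf.saxon</groupId>
    <artifactId>Saxon-HE</artifactId>
    <!-- 请按需补充 <version> -->
</dependency>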
2、使用 HtmlCleaner、Saxon 和 XPath(XPathEvaluator)的示例代码
package us.codecraft.webmagic.selector;
import net.sf.saxon.lib.NamespaceConstant;
import net.sf.saxon.xpath.XPathEvaluator;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import javax.xml.namespace.NamespaceContext;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
* 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
*