HtmlCleaner 结合 XPath 将 HTML 转为 XML 并读取

最新推荐文章于 2021-06-03 04:46:13 发布

无限寂寞

最新推荐文章于 2021-06-03 04:46:13 发布

阅读量1.6k

点赞数

分类专栏： html 文章标签： xml html string import class object

html 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

import java.io.IOException;  
import java.net.URL;  
import org.htmlcleaner.CleanerProperties;  
import org.htmlcleaner.HtmlCleaner;  
import org.htmlcleaner.PrettyXmlSerializer;  
import org.htmlcleaner.TagNode;  
import org.htmlcleaner.XPatherException;  
//import com.sun.xml.internal.txw2.output.XmlSerializer;  
  
/**
 * Small scraping demo built on HtmlCleaner: downloads an HTML page, writes it
 * out as pretty-printed XML, then queries the cleaned document with XPath and
 * prints the results to standard output.
 */
public class HtmlClean {

    /** Separator line printed between sections of the console output. */
    private static final String SEPARATOR =
            "*********************************************************";

    /** Number of printed matches after which a separator line is inserted. */
    private static final int GROUP_SIZE = 8;

    /**
     * Downloads {@code htmlurl}, saves a pretty-printed XML version of it to
     * {@code xmlurl}, and prints the page title followed by the inner HTML of
     * every node selected by {@code xpath}.
     *
     * <p>I/O failures (download or file write) are reported on standard error
     * and otherwise ignored, so the method returns normally in that case.
     *
     * @param htmlurl URL of the HTML page to fetch
     * @param xmlurl  path of the file the formatted XML is written to
     * @param xpath   XPath expression evaluated against the cleaned document
     * @throws XPatherException if the XPath expression cannot be evaluated
     */
    @SuppressWarnings("deprecation")
    public void cleanHtml(String htmlurl, String xmlurl, String xpath) throws XPatherException {
        try {
            // Fetch the target page and store it locally as formatted XML.
            HtmlCleaner cleaner = new HtmlCleaner();
            CleanerProperties props = cleaner.getProperties();
            props.setUseCdataForScriptAndStyle(false);
            props.setRecognizeUnicodeChars(true);
            props.setUseEmptyElementTags(true);
            props.setAdvancedXmlEscape(true);
            props.setTranslateSpecialEntities(true);
            props.setBooleanAttributeValues("empty");
            TagNode node = cleaner.clean(new URL(htmlurl));

            // One serializer is enough for both the file and the in-memory copy.
            PrettyXmlSerializer serializer = new PrettyXmlSerializer(props);
            serializer.writeXmlToFile(node, xmlurl);
            String result = serializer.getXmlAsString(node);
            System.out.println(SEPARATOR);

            // Re-parse the formatted XML string and extract the wanted data from it.
            TagNode xmlNode = cleaner.clean(result);
            Object[] titles = xmlNode.getElementsByName("title", true);
            if (titles.length > 0) {
                System.out.println("title=" + ((TagNode) titles[0]).getText());
            }

            Object[] matches = xmlNode.evaluateXPath(xpath);
            System.out.println(SEPARATOR);
            printMatches(cleaner, matches);
        } catch (IOException e) {
            // Best-effort tool: report the failure with context, do not propagate.
            System.err.println("Failed to fetch " + htmlurl + " or write " + xmlurl);
            e.printStackTrace();
        }
    }

    /**
     * Prints each XPath result on its own line, inserting a separator after
     * every {@link #GROUP_SIZE} entries. An empty result prints nothing.
     *
     * @param cleaner cleaner used to render the inner HTML of tag nodes
     * @param matches results returned by the XPath evaluation
     */
    private static void printMatches(HtmlCleaner cleaner, Object[] matches) {
        for (int i = 0; i < matches.length; i++) {
            Object match = matches[i];
            if (match instanceof TagNode) {
                System.out.println(cleaner.getInnerHtml((TagNode) match));
            } else {
                // The result array is typed Object[], so do not assume every
                // entry is a TagNode; print other values as plain text.
                System.out.println(String.valueOf(match));
            }
            if ((i + 1) % GROUP_SIZE == 0) {
                System.out.println(SEPARATOR);
            }
        }
    }

    /**
     * Runs the demo. Optional arguments override the defaults, in order:
     * page URL, output XML file, XPath expression.
     *
     * @param args optional {@code [htmlurl [xmlurl [xpath]]]}
     * @throws XPatherException if the XPath expression cannot be evaluated
     */
    public static void main(String[] args) throws XPatherException {
        String htmlUrl = args.length > 0 ? args[0] : "http://www.baidu.com";
        String xmlPath = args.length > 1 ? args[1] : "E://test.xml";
        String xpath = args.length > 2 ? args[2] : "//div";

        HtmlClean cleaner = new HtmlClean();
        cleaner.cleanHtml(htmlUrl, xmlPath, xpath);
    }
}