使用 HtmlCleaner 结合 XPath 将 HTML 转换为 XML 并读取其中的数据

import java.io.IOException;  
import java.net.URL;  
import org.htmlcleaner.CleanerProperties;  
import org.htmlcleaner.HtmlCleaner;  
import org.htmlcleaner.PrettyXmlSerializer;  
import org.htmlcleaner.TagNode;  
import org.htmlcleaner.XPatherException;  
//import com.sun.xml.internal.txw2.output.XmlSerializer;  
  
/**
 * Data scraping example: downloads an HTML page, normalizes it to
 * well-formed XML with HtmlCleaner, saves the XML to a file and prints
 * the parts of the document selected by an XPath expression.
 */
public class HtmlClean {

    /** Separator line printed between sections of the console output. */
    private static final String SEPARATOR =
            "*********************************************************";

    /** Number of XPath matches printed between two separator lines. */
    private static final int GROUP_SIZE = 8;

    /**
     * Fetches the page at {@code htmlurl}, writes it as pretty-printed XML to
     * {@code xmlurl}, then prints the page title (if any) and the inner HTML
     * of every node matched by {@code xpath} to standard output.
     *
     * <p>An {@link IOException} raised while fetching the page or writing the
     * file is reported on standard error and not rethrown (best-effort, as in
     * the original implementation).
     *
     * @param htmlurl URL of the HTML page to fetch
     * @param xmlurl  path of the file the pretty-printed XML is written to
     * @param xpath   XPath expression evaluated against the cleaned document
     * @throws XPatherException if {@code xpath} cannot be evaluated
     */
    @SuppressWarnings("deprecation") // writeXmlToFile / getXmlAsString are kept for compatibility
    public void cleanHtml(String htmlurl, String xmlurl, String xpath) throws XPatherException {
        try {
            // Fetch the target page and clean it into a well-formed tree.
            HtmlCleaner cleaner = new HtmlCleaner();
            CleanerProperties props = cleaner.getProperties();
            props.setUseCdataForScriptAndStyle(false);
            props.setRecognizeUnicodeChars(true);
            props.setUseEmptyElementTags(true);
            props.setAdvancedXmlEscape(true);
            props.setTranslateSpecialEntities(true);
            props.setBooleanAttributeValues("empty");
            TagNode node = cleaner.clean(new URL(htmlurl));

            // Serialize once-configured: save the formatted XML to disk and keep it as a string.
            PrettyXmlSerializer serializer = new PrettyXmlSerializer(props);
            serializer.writeXmlToFile(node, xmlurl);
            String result = serializer.getXmlAsString(node);
            System.out.println(SEPARATOR);

            // Re-parse the formatted XML string and extract the requested data from it.
            TagNode xmlNode = cleaner.clean(result);
            Object[] titles = xmlNode.getElementsByName("title", true);
            if (titles.length > 0) {
                System.out.println("title=" + ((TagNode) titles[0]).getText());
            }

            Object[] matches = xmlNode.evaluateXPath(xpath);
            System.out.println(SEPARATOR);
            // Iterating (instead of reading matches[0] up front) makes an XPath
            // with no matches a no-op rather than an ArrayIndexOutOfBoundsException.
            for (int i = 0; i < matches.length; i++) {
                Object match = matches[i];
                if (match instanceof TagNode) {
                    System.out.println(cleaner.getInnerHtml((TagNode) match));
                } else {
                    // NOTE(review): evaluateXPath presumably can return non-element
                    // results (e.g. attribute values); print them as text instead of
                    // failing with a ClassCastException - confirm against HtmlCleaner docs.
                    System.out.println(String.valueOf(match));
                }
                if ((i + 1) % GROUP_SIZE == 0) {
                    System.out.println(SEPARATOR);
                }
            }
        } catch (IOException e) {
            // Best-effort: report the failure with context but do not propagate it.
            System.err.println("Failed to fetch " + htmlurl + " or write " + xmlurl);
            e.printStackTrace();
        }
    }

    /**
     * Demo entry point: cleans a fixed page, stores it in a fixed file and
     * prints the inner HTML of every {@code div} element.
     *
     * @param args ignored
     * @throws XPatherException if the XPath expression cannot be evaluated
     */
    public static void main(String[] args) throws XPatherException {
        HtmlClean cleaner = new HtmlClean();
        cleaner.cleanHtml("http://www.baidu.com", "E://test.xml", "//div");
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值