import java.io.IOException;
import java.net.URL;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.PrettyXmlSerializer;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
//import com.sun.xml.internal.txw2.output.XmlSerializer;
/**
 * Scrapes a web page: downloads it, cleans it into XML with HtmlCleaner,
 * saves the pretty-printed XML to a file and prints the nodes selected by an
 * XPath expression.
 */
public class HtmlClean {

    /** Line printed to visually separate sections of the console output. */
    private static final String SEPARATOR =
            "*********************************************************";

    /** A separator line is printed after every this-many XPath results. */
    private static final int GROUP_SIZE = 8;

    /**
     * Downloads the page at {@code htmlurl}, writes its pretty-printed XML form
     * to {@code xmlurl}, then prints the page title and every result of
     * {@code xpath} to standard output.
     *
     * <p>An {@link IOException} raised while fetching or saving the page is
     * reported on standard error and otherwise ignored, so the method returns
     * normally in that case.
     *
     * @param htmlurl URL of the page to fetch
     * @param xmlurl  path of the file the pretty-printed XML is written to
     * @param xpath   XPath expression evaluated against the cleaned document
     * @throws XPatherException if {@code xpath} cannot be evaluated
     */
    @SuppressWarnings("deprecation")
    public void cleanHtml(String htmlurl, String xmlurl, String xpath) throws XPatherException {
        try {
            HtmlCleaner cleaner = new HtmlCleaner();
            CleanerProperties props = cleaner.getProperties();
            props.setUseCdataForScriptAndStyle(false);
            props.setRecognizeUnicodeChars(true);
            props.setUseEmptyElementTags(true);
            props.setAdvancedXmlEscape(true);
            props.setTranslateSpecialEntities(true);
            props.setBooleanAttributeValues("empty");

            // Fetch the target page and parse it into a node tree.
            TagNode node = cleaner.clean(new URL(htmlurl));

            // Save the formatted XML to disk and keep a copy in memory.
            PrettyXmlSerializer serializer = new PrettyXmlSerializer(props);
            serializer.writeXmlToFile(node, xmlurl);
            String result = serializer.getXmlAsString(node);
            System.out.println(SEPARATOR);

            // Query the already-formatted XML string rather than the raw page.
            TagNode xmlNode = cleaner.clean(result);

            Object[] titles = xmlNode.getElementsByName("title", true);
            if (titles.length > 0) {
                System.out.println("title=" + ((TagNode) titles[0]).getText());
            }

            Object[] matches = xmlNode.evaluateXPath(xpath);
            System.out.println(SEPARATOR);
            printMatches(cleaner, matches);
        } catch (IOException e) {
            // Best effort: report the failure but do not propagate it, because
            // callers only expect XPatherException from this method.
            System.err.println("Failed to fetch or save " + htmlurl + ": " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Prints each XPath result on its own line, followed by a separator line
     * after every {@link #GROUP_SIZE} results.
     *
     * <p>Element results are printed as their inner HTML. Any other result
     * object (an XPath expression is not required to select elements) is
     * printed via its string form instead of being cast, which would fail.
     * An empty result array prints nothing.
     */
    @SuppressWarnings("deprecation")
    private static void printMatches(HtmlCleaner cleaner, Object[] matches) {
        for (int i = 0; i < matches.length; i++) {
            Object match = matches[i];
            if (match instanceof TagNode) {
                System.out.println(cleaner.getInnerHtml((TagNode) match));
            } else {
                System.out.println(String.valueOf(match));
            }
            if ((i + 1) % GROUP_SIZE == 0) {
                System.out.println(SEPARATOR);
            }
        }
    }

    /**
     * Command-line entry point.
     *
     * <p>Optional arguments, in order: page URL, output XML file path, XPath
     * expression. Any argument that is omitted falls back to the built-in
     * default shown below.
     *
     * @param args optional {@code [htmlurl [xmlurl [xpath]]]}
     * @throws XPatherException if the XPath expression cannot be evaluated
     */
    public static void main(String[] args) throws XPatherException {
        String htmlUrl = args.length > 0 ? args[0] : "http://www.baidu.com";
        String xmlPath = args.length > 1 ? args[1] : "E://test.xml";
        String xpath = args.length > 2 ? args[2] : "//div";

        HtmlClean cleaner = new HtmlClean();
        cleaner.cleanHtml(htmlUrl, xmlPath, xpath);
    }
}
HtmlCleaner 结合 XPath 将 HTML 转为 XML 并读取
最新推荐文章于 2021-06-03 04:46:13 发布