import java.io.BufferedReader;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.net.URL;
import org.cyberneko.html.parsers.DOMParser;
import org.htmlparser.tags.Span;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
/**
 * Downloads a product page, parses it into a DOM with NekoHTML and prints the
 * value found inside the {@code <span id="countOfPrd">} element.
 *
 * <p>NOTE(review): the span presumably holds the product price; that is inferred
 * from the purpose of this file, not from the markup itself - confirm against
 * the live page.
 */
public class JavaTest {

    /** The {@code id} attribute of the span whose value is extracted. */
    private static final String TARGET_SPAN_ID = "countOfPrd";

    /**
     * Number of sibling hops from the target span's first child to the child
     * that wraps the value (i.e. the value sits under the fourth child node).
     */
    private static final int VALUE_SIBLING_OFFSET = 3;

    /**
     * Recursively walks the subtree under {@code root} and concatenates the value
     * of every {@code <span id="countOfPrd">} found in it.
     *
     * <p>All other text is deliberately dropped: text nodes and non-element nodes
     * contribute the empty string, and {@code STYLE}/{@code SCRIPT} subtrees are
     * skipped entirely. Tag names are compared case-insensitively so the method
     * works whether or not the parser upper-cases element names.
     *
     * @param root the node to start from; {@code null} yields an empty string
     * @return the concatenated target values, or {@code ""} if none were found
     */
    public static String TextExtractor(Node root) {
        // Only element nodes can contain the target span; plain text, comments
        // and a missing node contribute nothing.
        if (root == null || root.getNodeType() != Node.ELEMENT_NODE) {
            return "";
        }

        Element elmt = (Element) root;
        String tagName = elmt.getTagName();

        // Discard style sheets and scripts.
        if ("STYLE".equalsIgnoreCase(tagName) || "SCRIPT".equalsIgnoreCase(tagName)) {
            return "";
        }

        // getAttribute returns "" when the attribute is absent, so no separate
        // hasAttribute check is required.
        if ("SPAN".equalsIgnoreCase(tagName)
                && TARGET_SPAN_ID.equals(elmt.getAttribute("id"))) {
            return extractTargetValue(elmt);
        }

        NodeList children = elmt.getChildNodes();
        StringBuilder text = new StringBuilder();
        for (int i = 0; i < children.getLength(); i++) {
            text.append(TextExtractor(children.item(i)));
        }
        return text.toString();
    }

    /**
     * Reads the value stored under the fourth child of the target span.
     *
     * <p>Every step is null-checked so that a page whose markup differs from the
     * expected layout yields {@code ""} instead of a {@link NullPointerException}.
     */
    private static String extractTargetValue(Element span) {
        Node wrapper = span.getFirstChild();
        for (int hop = 0; wrapper != null && hop < VALUE_SIBLING_OFFSET; hop++) {
            wrapper = wrapper.getNextSibling();
        }
        if (wrapper == null) {
            return "";
        }

        Node valueNode = wrapper.getFirstChild();
        if (valueNode == null) {
            return "";
        }

        // getNodeValue is null for node types without a value (e.g. elements);
        // returning it as-is would make callers append the literal "null".
        String value = valueNode.getNodeValue();
        return value == null ? "" : value;
    }

    /**
     * Fetches the hard-coded product page, parses it and prints the extracted value.
     *
     * @param args unused
     * @throws Exception if the page cannot be downloaded or parsed
     */
    public static void main(String[] args) throws Exception {
        // Create the HTML parser.
        DOMParser parser = new DOMParser();
        // Set the default page encoding.
        // NOTE(review): NekoHTML appears to apply this property only to byte-stream
        // input; because a Reader is supplied below, the stream is decoded
        // explicitly with the same charset - confirm against the NekoHTML docs.
        parser.setProperty(
                "http://cyberneko.org/html/properties/default-encoding",
                "utf-8");

        // Input page. For offline testing, read a saved copy through a FileReader instead.
        URL a = new URL("http://www.suning.cn/webapp/wcs/stores/servlet/prd_10052_10051_-7_9173_196583_.html");
        BufferedReader in = new BufferedReader(new InputStreamReader(a.openStream(), "utf-8"));
        try {
            parser.parse(new InputSource(in));
        } finally {
            // Always release the network connection, even when parsing fails.
            in.close();
        }

        Document doc = parser.getDocument();
        // Use the BODY element as the root of the extraction.
        Node body = doc.getElementsByTagName("BODY").item(0);
        System.out.println(TextExtractor(body));
    }
}
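// This code parses pages scraped from major retail websites to extract their prices, enabling automated, competitive pricing aimed at winning customers. For questions or further discussion, contact QQ 526151410.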