import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.URL;
import java.nio.CharBuffer;
import java.util.HashMap;
import org.cyberneko.html.parsers.DOMParser;
import org.xml.sax.InputSource;
/**
 * Minimal crawler demo: downloads one page and prints the hyperlinks found in it.
 *
 * <p>The page text is fetched by {@link #getPage(String, String)}, parsed into a DOM
 * with the NekoHTML {@code DOMParser}, and every {@code A} element that carries an
 * {@code href} attribute is collected into a map of link target to anchor text.
 */
public class Crawler {

    /** Capacity, in chars, of the buffer used while reading a page. */
    private static final int BUFFER_SIZE = 20480;

    /**
     * Fetches a fixed start page and prints the links extracted from it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String url = "http://www.sina.com.cn";
        getLinksByNeko(getPage(url, "gbk"));
    }

    /**
     * Downloads the resource at {@code url} and decodes it as text.
     *
     * <p>I/O failures are reported on standard error and do not propagate: if the
     * resource cannot be opened an empty string is returned, and if reading fails
     * part-way the text read so far is returned.
     *
     * @param url      address of the resource to download
     * @param encoding name of the charset used to decode the response bytes
     * @return the decoded text; possibly empty or truncated after an I/O error,
     *         never {@code null}
     */
    public static String getPage(String url, String encoding) {
        StringBuilder sb = new StringBuilder();
        java.io.InputStream raw = null;
        BufferedReader in = null;
        try {
            raw = new URL(url).openStream();
            in = new BufferedReader(new InputStreamReader(raw, encoding));
            CharBuffer buffer = CharBuffer.allocate(BUFFER_SIZE);
            while (in.read(buffer) != -1) {
                buffer.flip();
                sb.append(buffer);
                // Reset position and limit; without this the limit left by flip()
                // caps every later read at the size of the previous one.
                buffer.clear();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Closing the reader also closes the stream it wraps; fall back to the
            // raw stream when the reader could not be constructed.
            java.io.Closeable resource = (in != null) ? in : raw;
            if (resource != null) {
                try {
                    resource.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return sb.toString();
    }

    /**
     * Parses {@code page} as HTML and prints the extracted links followed by their count.
     *
     * <p>Links are stored in a map keyed by {@code href} value, so when several
     * anchors share a target only the text of the last one is kept. Anchors
     * without an {@code href} attribute are skipped. Parse errors are reported on
     * standard error and whatever was collected before the error is still printed.
     *
     * @param page HTML text to scan for anchors
     */
    private static void getLinksByNeko(String page) {
        DOMParser parser = new DOMParser();
        HashMap<String, String> map = new HashMap<String, String>();
        try {
            parser.setFeature("http://xml.org/sax/features/namespaces", false);
            parser.parse(new InputSource(new StringReader(page)));
            org.w3c.dom.Document doc = parser.getDocument();
            // Upper-case "A": NekoHTML reports element names in upper case by default.
            org.w3c.dom.NodeList anchors = org.apache.xpath.XPathAPI
                    .selectNodeList(doc, "//A");
            for (int i = 0; i < anchors.getLength(); i++) {
                org.w3c.dom.Node anchor = anchors.item(i);
                org.w3c.dom.Node href = anchor.getAttributes().getNamedItem("href");
                if (href == null) {
                    // Named anchors (<a name="...">) have no target; skipping them
                    // keeps one such element from aborting the whole extraction.
                    continue;
                }
                map.put(href.getNodeValue(), anchor.getTextContent());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println(map.toString());
        System.out.println(map.size());
    }
}
利用neko抽取超链接及锚文本
最新推荐文章于 2024-08-05 15:12:37 发布