利用 NekoHTML 抽取超链接及锚文本


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.URL;
import java.nio.CharBuffer;
import java.util.HashMap;

import org.cyberneko.html.parsers.DOMParser;
import org.xml.sax.InputSource;


/**
 * Small demo crawler: downloads one HTML page and prints every hyperlink it
 * contains together with the link's anchor text, using the NekoHTML DOM parser
 * and an XPath query.
 */
public class Crawler {

    /** Size, in chars, of the buffer used while downloading a page. */
    private static final int READ_BUFFER_SIZE = 20480;

    /**
     * Fetches a fixed start page and prints the links found in it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String url = "http://www.sina.com.cn";
        getLinksByNeko(getPage(url, "gbk"));
    }

    /**
     * Downloads the resource at {@code url} and decodes it as text.
     *
     * <p>I/O failures are reported on standard error and do not propagate:
     * whatever was read before the failure is returned, which is the empty
     * string when the connection could not be opened at all.
     *
     * @param url      absolute URL of the page to download
     * @param encoding name of the charset used to decode the response bytes
     * @return the decoded page content read so far; never {@code null}
     */
    public static String getPage(String url, String encoding) {
        StringBuilder sb = new StringBuilder();
        BufferedReader in = null;
        try {
            // Opening and reading share one try block so that a failed open
            // can never be followed by a read on a null reader.
            in = new BufferedReader(new InputStreamReader(
                    new URL(url).openStream(), encoding));
            // A plain char[] is reused as-is on every iteration; there is no
            // position/limit state that could shrink the usable buffer.
            char[] buffer = new char[READ_BUFFER_SIZE];
            int count;
            while ((count = in.read(buffer, 0, buffer.length)) != -1) {
                sb.append(buffer, 0, count);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return sb.toString();
    }

    /**
     * Parses {@code page} as HTML, collects every anchor element that has an
     * {@code href} attribute into a map of href to anchor text, and prints the
     * map followed by its size.
     *
     * <p>Anchors without an {@code href} (for example {@code <a name="top">})
     * are skipped. When the same href occurs more than once, the text of the
     * last occurrence wins because the hrefs are used as map keys. Parser or
     * XPath failures are reported on standard error; the links collected up to
     * that point are still printed.
     *
     * @param page HTML source to scan for links
     */
    private static void getLinksByNeko(String page) {
        DOMParser parser = new DOMParser();
        HashMap<String, String> map = new HashMap<String, String>();
        try {
            parser.setFeature("http://xml.org/sax/features/namespaces", false);
            StringReader sin = new StringReader(page);
            parser.parse(new InputSource(sin));
            org.w3c.dom.Document doc = parser.getDocument();
            // The query uses upper-case "A"; presumably this relies on
            // NekoHTML's default of reporting element names in upper case
            // (see the NekoHTML settings documentation).
            org.w3c.dom.NodeList anchors = org.apache.xpath.XPathAPI
                    .selectNodeList(doc, "//A");
            for (int i = 0; i < anchors.getLength(); i++) {
                org.w3c.dom.Node anchor = anchors.item(i);
                org.w3c.dom.NamedNodeMap attributes = anchor.getAttributes();
                org.w3c.dom.Node href =
                        (attributes == null) ? null : attributes.getNamedItem("href");
                if (href == null) {
                    // Not a hyperlink; dereferencing it would throw and the
                    // catch below would abandon all remaining anchors.
                    continue;
                }
                map.put(href.getNodeValue(), anchor.getTextContent());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println(map.toString());
        System.out.println(map.size());
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值