利用neko抽取超链接及锚文本

最新推荐文章于 2024-08-05 15:12:37 发布

iteye_19224

最新推荐文章于 2024-08-05 15:12:37 发布

阅读量207

点赞数

文章标签： Java XML Apache .net HTML


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.URL;
import java.nio.CharBuffer;
import java.util.HashMap;

import org.cyberneko.html.parsers.DOMParser;
import org.xml.sax.InputSource;


/**
 * Minimal crawler example: downloads one HTML page and extracts every
 * hyperlink together with its anchor text using the NekoHTML DOM parser.
 */
public class Crawler {

	/** Number of characters read from the network stream per call. */
	private static final int BUFFER_SIZE = 20480;

	/**
	 * Downloads a page and prints the extracted links.
	 *
	 * @param args optional; {@code args[0]} overrides the URL to fetch and
	 *             {@code args[1]} overrides the character encoding used to
	 *             decode it. Defaults: {@code http://www.sina.com.cn} / {@code gbk}.
	 */
	public static void main(String[] args) {
		String url = (args != null && args.length > 0) ? args[0] : "http://www.sina.com.cn";
		String encoding = (args != null && args.length > 1) ? args[1] : "gbk";
		getLinksByNeko(getPage(url, encoding));
	}

	/**
	 * Fetches the resource at {@code url} and decodes it as text.
	 *
	 * <p>This is a best-effort operation: I/O failures are reported on
	 * standard error and do not propagate. If the stream cannot be opened
	 * an empty string is returned; if reading fails part-way, the text
	 * read up to that point is returned.
	 *
	 * @param url      absolute URL of the page to download
	 * @param encoding name of the charset used to decode the response body
	 * @return the decoded page content, possibly empty or partial; never {@code null}
	 */
	public static String getPage(String url, String encoding) {
		StringBuilder content = new StringBuilder();
		// Track the raw stream separately so it is closed even when wrapping
		// it in a reader fails (e.g. unsupported encoding name).
		java.io.InputStream raw = null;
		try {
			raw = new URL(url).openStream();
			BufferedReader in = new BufferedReader(new InputStreamReader(raw, encoding));
			char[] buffer = new char[BUFFER_SIZE];
			int count;
			while ((count = in.read(buffer)) != -1) {
				// Append only the characters actually read in this pass.
				content.append(buffer, 0, count);
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (raw != null) {
				try {
					raw.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return content.toString();
	}

	/**
	 * Parses {@code page} as HTML, collects every anchor element that has
	 * an {@code href} attribute, and prints the resulting href-to-text
	 * map followed by its size.
	 *
	 * <p>Anchors without an {@code href} (for example {@code <a name="top">})
	 * are skipped. When the same href occurs more than once, the text of the
	 * last occurrence wins. Parsing errors are reported on standard error and
	 * whatever was collected so far is still printed.
	 *
	 * @param page HTML source to scan
	 */
	private static void getLinksByNeko(String page) {
		DOMParser parser = new DOMParser();
		HashMap<String, String> links = new HashMap<String, String>();
		try {
			parser.setFeature("http://xml.org/sax/features/namespaces", false);
			parser.parse(new InputSource(new StringReader(page)));
			org.w3c.dom.Document doc = parser.getDocument();
			// The XPath uses an upper-case element name; NekoHTML's default
			// configuration is documented to report element names in upper
			// case and attribute names in lower case.
			org.w3c.dom.NodeList anchors = org.apache.xpath.XPathAPI
					.selectNodeList(doc, "//A");
			for (int i = 0; i < anchors.getLength(); i++) {
				org.w3c.dom.Node anchor = anchors.item(i);
				org.w3c.dom.NamedNodeMap attributes = anchor.getAttributes();
				org.w3c.dom.Node href = (attributes == null) ? null : attributes.getNamedItem("href");
				if (href == null) {
					// Not a hyperlink; skipping it keeps one such anchor from
					// aborting extraction of all the links that follow.
					continue;
				}
				links.put(href.getNodeValue(), anchor.getTextContent());
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		System.out.println(links.toString());
		System.out.println(links.size());
	}
}