Recommended: Jsoup (with a batch web-page crawling example)

Goal: build a simple website crawler (which sounds suspiciously like a virus...), visit the hyperlinks under a parent page, and extract the text content from each of them.


At first I hand-wrote the HTML tag parsing; part of that code is shown below:

	/** 
     * Get the parsed href list
     * 
     * @return List<String>
     * @throws IOException
     */ 
    public List<String> getHrefList() throws IOException { 
    	List<String> rawHrefList = parseHref();
    	hrefList = rewriteHref(rawHrefList);
        return hrefList; 
    }
    
	/** 
     * Parse the links in the page
     * 
     * @return List<String>
     * @throws IOException 
     */ 
    private List<String> parseHref() throws IOException {
    	List<String> rawHrefList = new ArrayList<String>();
        URL url = new URL(parentUrl); 
        HttpURLConnection connection = (HttpURLConnection) url.openConnection(); 
        connection.setDoOutput(true);
        InputStreamReader isr = new InputStreamReader( 
                connection.getInputStream(), getCharset(connection.getContentType())); 
        BufferedReader br = new BufferedReader(isr); 
        String str = null;
        List<String> rs = null; 
        while ((str = br.readLine()) != null) { 
            rs = getHref(str); 
            if (rs != null) 
            	rawHrefList.addAll(rs); 
        }
        br.close();
        isr.close();
        connection.disconnect();
        return rawHrefList;
    } 
    
    /** 
     * Get the page's character encoding from the Content-Type header
     * 
     * @param str
     * @return String
     */ 
    private String getCharset(String str) { 
        Pattern pattern = Pattern.compile("charset=.*"); 
        Matcher matcher = pattern.matcher(str); 
        if (matcher.find()) 
            return matcher.group(0).split("charset=")[1]; 
        // fall back to UTF-8 when the header does not declare a charset
        return "UTF-8"; 
    }
    
    /** 
     * Read multiple links from a single line of text
     * 
     * @param str
     * @return List<String>
     */ 
    private List<String> getHref(String str) { 
    	List<String> hrefSet = new ArrayList<String>();
        Pattern pattern = Pattern.compile("href=\".*?\"");
        final int URL_START_POSITION = 6;
        while(str.length() > 0) {
	        Matcher matcher = pattern.matcher(str); 
	        if (matcher.find()) {
	        	// extract the URL from the href attribute
	        	hrefSet.add(matcher.group(0).substring(URL_START_POSITION, matcher.group(0).length()-1));
	        	// drop the part of the string that has already been consumed
	        	str = str.substring(matcher.end());
	        } else {
	        	str = "";
	        }
        }
        if (hrefSet.size() > 0)
        	return hrefSet;
        return null; 
    }
    
    /** 
     * Rewrite the href list: de-duplicate, then keep only links under the parent URL
     * 
     * @param hrefList
     * @return List<String>
     */ 
    private List<String> rewriteHref(List<String> hrefList) {
    	List<String> distinctHrefList = distinct(hrefList);
    	return distinctByParent(distinctHrefList);
    }
    
    /** 
     * De-duplicate: remove repeated links
     * 
     * @param hrefList
     * @return List<String>
     */ 
    private List<String> distinct(List<String> hrefList) {
    	List<String> distinctHrefList = new ArrayList<String>();
    	Set<String> hrefSet = new HashSet<String>();
    	Iterator<String> hrefItr = hrefList.iterator();
    	while (hrefItr.hasNext()) {
    		String hrefStr = (String)hrefItr.next();
    		hrefSet.add(hrefStr);
    	}
    	distinctHrefList.addAll(hrefSet);
    	return distinctHrefList;
    }
    

As I went on it felt more and more tedious: doing the semantic analysis by hand means covering all kinds of cases, and my goal was not to build a tokenizer, so I needed a proper library. A quick search first turned up HTML Parser, but its code all dates from 2006, so I passed on it. Next I found Jsoup and Jericho HTML Parser; the two look roughly equivalent, but Jericho's name is harder to remember and Jsoup's website and documentation seem richer, so I settled on Jsoup. Once I started using it, it turned out to be excellent: the code is much cleaner and development is noticeably faster. Recommended! The complete code after switching to Jsoup is listed below (note: today's servers have protection mechanisms, and a burst of requests from the same IP will get blocked; see the short throttling sketch after the listing):

package j2seTest2;

import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class HtmlParser {
	
	final int THREAD_COUNT = 10;
	private CountDownLatch threadCompletedCounter = new CountDownLatch(THREAD_COUNT);
	AtomicInteger paragraphCounter = new AtomicInteger(0);
	
	public static void main(String[] args) {
//		testParseHref();
//		testGetHrefList();
//		testParseParagraph();
//		testCrawlParagraph();
		testCrawlParagraphInMultiThreads();
	}
	
	/** 
     * Get the parsed href list
     * 
     * @param urlStr
     * @return List<String>
     */ 
    public List<String> getHrefList(String urlStr) { 
    	Set<String> rawHrefList = parseHref(urlStr);
        return rewriteHref(rawHrefList, urlStr); 
    }
	
	/** 
     * Parse the page's links, removing duplicates
     * 
     * @param urlStr
     * @return Set<String>
     */ 
    private Set<String> parseHref(String urlStr) {
    	Set<String> rawHrefSet = new HashSet<String>();
    	Document doc;
		try {
			doc = Jsoup.connect(urlStr).timeout(5000).get();
	        Elements elements = doc.getElementsByAttribute("href");
	        Iterator<Element> eleItr = elements.iterator();
	        while(eleItr.hasNext()) {
	        	Element element = eleItr.next();
	        	rawHrefSet.add(element.attr("href"));
	        }
		} catch (IOException e) {
			e.printStackTrace();
		}
        return rawHrefSet;
    } 
    
    /** 
     * Rewrite the href list: keep only links under the parent URL
     * 
     * @param hrefList
     * @param parentUrl
     * @return List<String>
     */ 
    private List<String> rewriteHref(Set<String> hrefList, String parentUrl) {
    	return distinctByParent(hrefList, parentUrl);
    }
    
    /** 
     * Filter: keep only URLs that contain the parent URL
     * 
     * @param hrefList
     * @param parentUrl
     * @return List<String>
     */ 
    private List<String> distinctByParent(Set<String> hrefList, String parentUrl) {
    	ArrayList<String> distinctHrefList = new ArrayList<String>();
    	Iterator<String> hrefItr = hrefList.iterator();
    	while (hrefItr.hasNext()) {
    		String hrefStr = (String)hrefItr.next();
    		if (hrefStr.indexOf(parentUrl) >= 0) {
    			distinctHrefList.add(hrefStr);
    		}
    	}
    	return distinctHrefList;
    }
    
	/** 
     * Get the page's paragraph text
     * 
     * @param urlStr
     * @return String
     */ 
    public String getParagraph(String urlStr) { 
    	return parseParagraph(urlStr);
    }
    
    /** 
     * Parse the page's paragraph text
     * 
     * @param urlStr
     * @return String
     */ 
    private String parseParagraph(String urlStr) {
    	String paragraph = "";
    	final String HTML_TAG_P = "p";
    	final String HTML_TAG_FONT = "font";
        Document doc;
		try {
			doc = Jsoup.connect(urlStr).timeout(5000).get();
	        paragraph = paragraph.concat(getTextByTag(doc, HTML_TAG_P));
	        paragraph = paragraph.concat(getTextByTag(doc, HTML_TAG_FONT));
		} catch (IOException e) {
			e.printStackTrace();
		}
        return paragraph;
    }
    
    /** 
     * Get the text of all elements with the given tag
     * 
     * @param doc
     * @param tag
     * @return String
     */ 
    private String getTextByTag(Document doc, String tag) {
    	String text = "";
    	Elements elements = doc.getElementsByTag(tag);
        Iterator<Element> eleItr = elements.iterator();
        while(eleItr.hasNext()) {
        	Element element = eleItr.next();
        	text = text.concat(element.text());
        }
        return text;
    }
    
    /** 
     * Crawl elements' text with multiple threads
     * 
     * @param htmlParser
     * @param hrefList
     */ 
    private void crawlInMultiThreads(final HtmlParser htmlParser, final List<String> hrefList) {
    	ExecutorService executor = Executors.newFixedThreadPool(THREAD_COUNT);
    	final Map<Integer, List<String>> hrefMap = new HashMap<Integer, List<String>>();
    	int sizeOfHrefSubList = hrefList.size() / THREAD_COUNT + 1;
    	int startIndexOfHrefSubList = 0;
    	int endIndexOfHrefSubList = sizeOfHrefSubList;
    	Integer mapIndex = 0;
    	// split hrefList into sublists, one per thread, and store each sublist in the map
    	for(;;) {
    		int hrefSubListIndex = 0;
    		List<String> hrefSubList = new ArrayList<String>();
	    	for (int index = startIndexOfHrefSubList; index < endIndexOfHrefSubList; index++) {
	    		hrefSubList.add(hrefSubListIndex, hrefList.get(index));
	    		hrefSubListIndex++;
	    	}
	    	hrefMap.put(mapIndex, hrefSubList);
	    	mapIndex++;
	    	if (endIndexOfHrefSubList == hrefList.size()) {
	    		break;
	    	}
	    	startIndexOfHrefSubList = endIndexOfHrefSubList;
	    	if (hrefList.size() - endIndexOfHrefSubList > sizeOfHrefSubList) {
	    		endIndexOfHrefSubList += sizeOfHrefSubList;
	    	} else {
	    		endIndexOfHrefSubList = hrefList.size();
	    	}
    	}
    	
    	// one thread handles one entry of the map
    	for (int threadIndex = 0; threadIndex < THREAD_COUNT; threadIndex++) {
    		// guard against fewer chunks than threads: fall back to an empty sublist
    		List<String> hrefSubList = hrefMap.get(Integer.valueOf(threadIndex));
    		final Iterator<String> hrefItr =
    				(hrefSubList != null ? hrefSubList : Collections.<String>emptyList()).iterator();
    		executor.submit(new Runnable() {
				public void run() {
					// each entry holds a number of href URLs to process
					while (hrefItr.hasNext()) {
						synchronized (hrefItr) {
							String paragraph = htmlParser.getParagraph(hrefItr.next());
							paragraphCounter.incrementAndGet();
							System.out.println("paragraphCounter:" + paragraphCounter);
							System.out.println(paragraph);
							System.out.println("");
						}
					}
					threadCompletedCounter.countDown();
				}
			});
    	}
    	closeThreadPool(executor);
    }
    
    private void closeThreadPool(final ExecutorService executor) {
		try {
			threadCompletedCounter.await();
			executor.shutdown();
		} catch (InterruptedException e) {
			e.printStackTrace();
		}
	}
    
    
    private static void testParseHref() {
    	HtmlParser a = new HtmlParser();
    	Set<String> hrefStr;
		String parentUrl = "http://www.defense.gov/bios/";
		hrefStr = a.parseHref(parentUrl);
    	Iterator<String> hrefItr = hrefStr.iterator();
    	while (hrefItr.hasNext()) {
    		System.out.println(hrefItr.next());
    	}
    }
    private static void testGetHrefList() {
    	HtmlParser a = new HtmlParser(); 
        List<String> hrefList;
		hrefList = a.getHrefList("http://www.defense.gov/bios/");
		System.out.println("url count:" + hrefList.size());
        for (int i = 0; i < hrefList.size(); i++) 
            System.out.println(hrefList.get(i));
    }
    private static void testParseParagraph() {
    	HtmlParser a = new HtmlParser(); 
    	String paragraph;
//		paragraph = a.parseParagraph("http://www.defense.gov/bios/biographydetail.aspx?biographyid=430");  //<p>
		paragraph = a.parseParagraph("http://www.defense.gov/bios/biographydetail.aspx?biographyid=185");  //<font>
		System.out.println("paragraph:");
		System.out.println(paragraph);
    }
    private static void testCrawlParagraph() {
    	HtmlParser a = new HtmlParser(); 
        List<String> hrefList;
		hrefList = a.getHrefList("http://www.defense.gov/bios/");
		Iterator<String> hrefItr = hrefList.iterator();
		int hrefCounter = 1;
		while (hrefItr.hasNext()) {
			String hrefUrl = (String) hrefItr.next();
			String paragraph = a.getParagraph(hrefUrl);
			System.out.println("paragraph " + hrefCounter + " from " + hrefUrl + ":");
			System.out.println(paragraph);
			System.out.println("");
			hrefCounter++;
		}
    }
    private static void testCrawlParagraphInMultiThreads() {
    	HtmlParser a = new HtmlParser(); 
    	List<String> hrefList = a.getHrefList("http://www.defense.gov/bios/");
    	a.crawlInMultiThreads(a, hrefList);
    }

}
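
On the IP-blocking caveat mentioned above, here is a minimal sketch of how the fetches could be throttled with Jsoup; the 1-second delay, the user-agent string, and the PoliteFetcher class name are assumptions for illustration, not part of the original code:

package j2seTest2;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class PoliteFetcher {

    // assumed values: tune the delay and user agent to the target site's policy
    private static final long DELAY_MILLIS = 1000;
    private static final String USER_AGENT = "Mozilla/5.0 (crawler demo)";

    /** 
     * Fetch each URL in turn, pausing between requests to avoid being blocked
     */ 
    public static void fetchAll(List<String> urls) throws IOException, InterruptedException {
        for (String url : urls) {
            Document doc = Jsoup.connect(url)
                    .userAgent(USER_AGENT)
                    .timeout(5000)
                    .get();
            System.out.println(doc.title());
            Thread.sleep(DELAY_MILLIS); // simple fixed delay between requests
        }
    }

    public static void main(String[] args) throws Exception {
        fetchAll(Arrays.asList("http://www.defense.gov/bios/"));
    }
}

A fixed sleep is the simplest option; a per-host rate limiter or randomized jitter would be gentler on the server, but the idea is the same.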

Other references on crawlers:
Java crawler: http://blog.csdn.net/u012730840/article/details/19985897


