// The maximum number of pages to crawl is configurable, and links found in
// fetched pages are added to the crawl queue automatically.
package com.hudson.test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Web crawler 1.0.0.
 *
 * <p>Fetches pages starting from the URLs passed to {@link #addUrl(String)},
 * writes each fetched page to a numbered {@code .txt} file in the configured
 * directory, and queues every {@code href="http..."} link found in a page.
 *
 * <p>Known issue: the page charset sometimes cannot be determined from the
 * Content-Type header; in that case the page is decoded with the platform
 * default charset, so the stored text may be mis-encoded.
 *
 * <p>This class keeps mutable state in plain collections and performs no
 * synchronization.
 *
 * author: hudson
 */
public class WebCrawler {
    /** Marker that precedes a double-quoted link target in the page text. */
    private static final String HREF_PREFIX = "href=\"";
    /** Name of the Content-Type parameter that carries the charset. */
    private static final String CHARSET_PARAM = "charset=";
    /** Connect and read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 3000;

    /** URLs waiting to be fetched, in discovery order (may contain duplicates). */
    private final List<String> urls;
    /** URLs that have already been fetched. */
    private final Set<String> finishedUrls;
    /** Number of calls made to {@link #accessUrl(String)}; also names the saved file. */
    private int count;
    /** Directory that receives the saved pages. */
    private String fileDir;
    /** Upper bound for {@link #count} at which {@link #work()} stops. */
    private int max;

    /** Creates a crawler that saves to {@code c://} and stops after 100 fetches. */
    public WebCrawler() {
        urls = new ArrayList<String>();
        finishedUrls = new HashSet<String>();
        count = 0;
        fileDir = "c://";
        max = 100;
    }

    /**
     * Fetches one page, queues the absolute links it contains, and returns the
     * page text.
     *
     * <p>Fetching is best-effort: any failure (malformed URL, non-HTTP URL,
     * I/O error, timeout) is reported on standard error and whatever was read
     * up to that point (possibly the empty string) is returned. The fetch
     * counter is incremented on every call, successful or not.
     *
     * @param url address of the page to fetch
     * @return the page text with lines terminated by {@code '\n'}; never {@code null}
     */
    public String accessUrl(String url) {
        StringBuilder page = new StringBuilder();
        HttpURLConnection connection = null;
        BufferedReader reader = null;
        try {
            connection = (HttpURLConnection) new URL(url).openConnection();
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            InputStream in = connection.getInputStream();
            // The header is often missing or lacks a charset parameter.
            Charset charset = resolveCharset(getCharset(connection.getContentType()));
            reader = new BufferedReader(charset == null
                    ? new InputStreamReader(in)
                    : new InputStreamReader(in, charset));
            String line;
            while ((line = reader.readLine()) != null) {
                // readLine() strips the terminator; put one back so the saved
                // page keeps its line structure.
                page.append(line).append('\n');
            }
        } catch (Exception e) {
            // Deliberately broad: one bad URL must not stop the crawl.
            System.err.println("Failed to fetch " + url);
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (connection != null) {
                connection.disconnect();
            }
        }
        String content = page.toString();
        collectLinks(content);
        count++;
        return content;
    }

    /**
     * Queues every {@code href="..."} target in {@code html} that starts with
     * {@code http}. Scanning stops at the first unterminated attribute.
     */
    private void collectLinks(String html) {
        int from = 0;
        while (true) {
            int marker = html.indexOf(HREF_PREFIX, from);
            if (marker == -1) {
                return;
            }
            int start = marker + HREF_PREFIX.length();
            int end = html.indexOf('"', start);
            if (end == -1) {
                // No closing quote: nothing reliable left to extract.
                return;
            }
            String link = html.substring(start, end);
            if (link.startsWith("http")) {
                urls.add(link);
            }
            // Resume right after the opening quote of this attribute.
            from = start;
        }
    }

    /**
     * Maps a charset name to a {@link Charset}.
     *
     * @return the charset, or {@code null} when the name is empty, illegal, or
     *         not supported by this JVM
     */
    private static Charset resolveCharset(String name) {
        if (name.length() == 0) {
            return null;
        }
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException e) {
            // Covers both illegal and unsupported names; fall back to the default.
            return null;
        }
    }

    /**
     * Extracts the charset name from a Content-Type header value.
     *
     * <p>The parameter name is matched case-insensitively and surrounding
     * single or double quotes are removed from the value. If the parameter
     * occurs more than once, the last occurrence wins.
     *
     * @param s Content-Type header value, e.g. {@code "text/html; charset=UTF-8"};
     *          may be {@code null}
     * @return the charset name, or the empty string when {@code s} is
     *         {@code null} or carries no charset parameter
     */
    public String getCharset(String s) {
        if (s == null) {
            return "";
        }
        String charset = "";
        for (String value : s.split(";")) {
            String param = value.trim();
            // regionMatches(ignoreCase) is locale-independent, unlike toLowerCase().
            if (param.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
                charset = stripQuotes(param.substring(CHARSET_PARAM.length()).trim());
            }
        }
        return charset;
    }

    /** Removes one pair of matching surrounding quotes, if present. */
    private static String stripQuotes(String value) {
        int length = value.length();
        if (length >= 2) {
            char first = value.charAt(0);
            char last = value.charAt(length - 1);
            if (first == last && (first == '"' || first == '\'')) {
                return value.substring(1, length - 1);
            }
        }
        return value;
    }

    /**
     * Appends a URL to the end of the crawl queue.
     *
     * @param url address to fetch later
     */
    public void addUrl(String url) {
        urls.add(url);
    }

    /**
     * Sets the directory that receives the saved pages.
     *
     * @param dir directory path, with or without a trailing separator
     */
    public void setFileDir(String dir) {
        fileDir = dir;
    }

    /**
     * Sets the fetch count at which {@link #work()} stops.
     *
     * @param m maximum number of fetches
     */
    public void setMax(int m) {
        max = m;
    }

    /**
     * Fetches and saves queued pages until the fetch counter reaches the
     * maximum or the queue is empty. URLs that were already fetched are
     * skipped without counting.
     */
    public void work() {
        while (count < max && !urls.isEmpty()) {
            String url = urls.remove(0);
            if (!finishedUrls.add(url)) {
                // Already fetched earlier.
                continue;
            }
            save(accessUrl(url));
        }
    }

    /**
     * Writes page content to {@code <count>.txt} inside the configured
     * directory, creating the directory if needed and overwriting any
     * existing file. I/O failures are reported on standard error.
     *
     * @param s page content; {@code null} is written as an empty file
     */
    public void save(String s) {
        File dir = new File(fileDir);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        // File(parent, child) inserts the separator when fileDir lacks one.
        File target = new File(dir, count + ".txt");
        FileWriter writer = null;
        try {
            writer = new FileWriter(target, false);
            writer.write(s == null ? "" : s);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Crawls from a fixed start page into a fixed directory. */
    public static void main(String[] args) {
        WebCrawler wc = new WebCrawler();
        wc.setFileDir("c://mytest//");
        wc.addUrl("http://www.hao123.com");
        wc.work();
        System.out.println("finished");
    }
}