jsoup httpclient 爬取网页并下载google图标

最新推荐文章于 2022-06-24 21:21:25 发布

爬虫仔蛙

最新推荐文章于 2022-06-24 21:21:25 发布

阅读量901

点赞数

分类专栏： java

java 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

jsoup httpclient

jsoup下载地址 http://www.jsoup.org

httpclient下载地址 http://hc.apache.org/downloads.cgi

其他jar包见附件

          Java代码   
          
        
 package jsoup;  
   
 import java.io.File;  
 import java.io.FileOutputStream;  
 import java.io.IOException;  
 import java.io.InputStream;  
 import java.util.HashMap;  
 import java.util.Map;  
   
 import org.apache.commons.io.FileUtils;  
 import org.apache.commons.io.IOUtils;  
 import org.apache.http.HttpEntity;  
 import org.apache.http.HttpResponse;  
 import org.apache.http.HttpStatus;  
 import org.apache.http.client.methods.HttpGet;  
 import org.apache.http.impl.client.DefaultHttpClient;  
 import org.apache.http.params.HttpProtocolParams;  
 import org.apache.http.util.EntityUtils;  
   
 import com.google.api.translate.Language;  
 import com.google.api.translate.Translate;  
   
 /** 
  * google logo 下载程序 
  */  
 public abstract class Crawler {  
   
     /** 
      * 使用google 翻译api 
      *  
      * @param en 
      * @return 
      */  
     public String translateEnToCinese(String en) {  
         Translate.setHttpReferrer("http://www.xxx.com");  
         try {  
             return Translate.execute(en, Language.ENGLISH, Language.CHINESE);  
         } catch (Exception e) {  
             e.printStackTrace();  
         }  
         return "";  
     }  
   
     /** 
      * 获取一个Map 
      *  
      * @return 
      */  
     public Map<String, Object> getMap() {  
         return new HashMap<String, Object>(0);  
     }  
   
     /** 
      * 下载文件 
      *  
      * @param url 
      *            文件http地址 
      * @param dir 
      *            目标文件 
      * @throws IOException 
      */  
     public void downloadFile(String url, String dir) throws Exception {  
         DefaultHttpClient httpClient = new DefaultHttpClient();  
         HttpProtocolParams.setUserAgent(httpClient.getParams(),  
                         "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");  
         HttpGet httpGet = new HttpGet();  
         httpGet.setURI(new java.net.URI(url));  
           
         InputStream input = null;  
         FileOutputStream output = null;  
         try {  
             HttpResponse response = httpClient.execute(httpGet);  
             HttpEntity entity = response.getEntity();  
             input = entity.getContent();  
             File file = new File(dir);  
             output = FileUtils.openOutputStream(file);  
             IOUtils.copy(input, output);  
         } catch (Exception e){  
             e.printStackTrace();  
         } finally {  
             IOUtils.closeQuietly(output);  
             IOUtils.closeQuietly(input);  
         }  
     }  
   
     /** 
      * 处理GET请求，返回整个页面 
      *  
      * @param url 
      *            访问地址 
      * @param params 
      *            编码参数 
      * @return 
      * @throws Exception 
      */  
     public synchronized String doGet(String url, String... params)  
             throws Exception {  
         DefaultHttpClient httpClient = new DefaultHttpClient(); // 创建httpClient实例  
         HttpProtocolParams.setUserAgent(httpClient.getParams(),  
                         "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");  
         String charset = "UTF-8";  
         if (null != params && params.length >= 1) {  
             charset = params[0];  
         }  
         HttpGet httpGet = new HttpGet(); // 创建get方法实例  
         String content = "";  
         httpGet.setURI(new java.net.URI(url));  
         try {  
             HttpResponse response = httpClient.execute(httpGet); // 执行请求，得到response对象  
             int resStatu = response.getStatusLine().getStatusCode(); // 得到返回的状态码  
             if (resStatu == HttpStatus.SC_OK) { // 200正常  
                 HttpEntity entity = response.getEntity(); // 获得相应的实体  
                 if (entity != null) {  
                     // 使用EntityUtils的toString方法，传递默认编码，在EntityUtils中的默认编码是ISO-8859-1  
                     content = EntityUtils.toString(entity, charset);  
                 }  
             }  
         } catch (Exception e) {  
             System.out.println("访问【" + url + "】出现异常!");  
             e.printStackTrace();  
         } finally {  
             // 关闭资源  
             httpGet.abort();  
             httpClient.getConnectionManager().shutdown();  
         }  
         return content;  
     }  
 }  

          Java代码   
          
        
 package jsoup;  
   
 import java.io.File;  
 import java.io.IOException;  
 import java.util.ArrayList;  
 import java.util.Date;  
 import java.util.List;  
 import java.util.Map;  
   
 import org.apache.commons.io.FileUtils;  
 import org.apache.commons.lang.StringUtils;  
 import org.json.JSONArray;  
 import org.json.JSONObject;  
 import org.jsoup.Jsoup;  
 import org.jsoup.nodes.Document;  
 import org.jsoup.nodes.Element;  
 import org.jsoup.select.Elements;  
   
 /** 
  * google logo 下载程序 
  */  
 public class GoogleLogoCrawler extends Crawler {  
       
     private static final String URL = "http://www.logocollect.com/google/year.php?key=%y&page=%p";   
   
     private static final String LOGO_URL = "http://www.logocollect.com/google/";  
   
     private static final String[] YEARS = new String[] {   
             //"1998", "1999", "2000",  
             //"2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008",   
             "2009", "2010", "2011", "2012" };  
   
     private static final String INDEX = "http://www.logocollect.com/google/year.php?key=%y";   
   
     private static final String DIR_PATH = "D:\\googlelogos\\";  
   
     public void doStart() {  
         JSONArray array = new JSONArray();  
         for (String year : YEARS) {  
             String ind = INDEX.replaceAll("%y", year);  
             int pageCount = getPageCount(ind);  
             for (int i = 1; i < pageCount+1; i++) {  
                 String url = URL.replaceAll("%y", year).replaceAll("%p", i + "");  
                 String path = year + "_" + i;  
                 start(url, array, DIR_PATH + path + "\\", path);  
             }  
         }  
         try {  
             FileUtils.writeStringToFile(new File(DIR_PATH + "json"), array.toString(), "UTF-8");  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
         System.out.println(array);  
     }  
       
     public int getPageCount(String url) {  
         int pageCount = 1;  
         try {  
             org.jsoup.nodes.Document doc = Jsoup.connect(url).get();  
               
             String els = doc.html().toString();  
             int start = els.indexOf("总页数") + 4;  
             String temp = els.substring(start);  
             int end = temp.indexOf("，");  
             pageCount = Integer.parseInt(els.substring(start,start+end));  
             System.out.println(pageCount);  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
         return pageCount;  
     }  
   
     public void start(String url, JSONArray array, String dir, String path) {  
         try {  
             String content = super.doGet(url);  
             Document doc = Jsoup.parse(content);  
             Elements dds = doc.select(".img img");  
             List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(0);  
             for (int i = 0; i < dds.size(); i++) {  
                 Element img = dds.get(i);  
                 String src = img.select("img").first().attr("src");  
                 String title = img.select("img").first().attr("title");  
                 Map<String, Object> map = super.getMap();  
                   
                 map.put("url", LOGO_URL + src);  
                 map.put("title", title);  
                   
                 list.add(map);  
             }  
             JSONArray tempJsonArray = new JSONArray();  
             for (Map<String, Object> map : list) {  
                 JSONObject jsonObject = new JSONObject();  
                 String proxy = StringUtils.substringAfterLast(map.get("url")  
                         .toString(), ".");  
                 long date = new Date().getTime();  
                 String name = date + "." + proxy;  
                 jsonObject.put("url", map.get("url").toString());  
                 jsonObject.put("dir", name);  
                 jsonObject.put("title", map.get("title").toString());  
                   
                 // 翻译  
 //              String dateZh = super.translateEnToCinese(map.get("date")  
 //                      .toString());  
 //              String titleZh = super.translateEnToCinese(map.get("title")  
 //                      .toString());  
 //              json.put("title_zh_cn", dateZh + " - " + titleZh);  
                   
                 // 下载图片  
                 super.downloadFile(map.get("url").toString(), dir + name);  
                 tempJsonArray.put(jsonObject);  
             }  
             array.put(new JSONObject().put(path, tempJsonArray));  
         } catch (Exception e) {  
             e.printStackTrace();  
         }  
     }  
   
     public static void main(String[] args) throws Exception {  
         new GoogleLogoCrawler().doStart();  
     }  
   
 }  

jsoup httpclient

jsoup下载地址 http://www.jsoup.org

httpclient下载地址 http://hc.apache.org/downloads.cgi

其他jar包见附件

        Java代码   
        
      
 package jsoup;  
   
 import java.io.File;  
 import java.io.FileOutputStream;  
 import java.io.IOException;  
 import java.io.InputStream;  
 import java.util.HashMap;  
 import java.util.Map;  
   
 import org.apache.commons.io.FileUtils;  
 import org.apache.commons.io.IOUtils;  
 import org.apache.http.HttpEntity;  
 import org.apache.http.HttpResponse;  
 import org.apache.http.HttpStatus;  
 import org.apache.http.client.methods.HttpGet;  
 import org.apache.http.impl.client.DefaultHttpClient;  
 import org.apache.http.params.HttpProtocolParams;  
 import org.apache.http.util.EntityUtils;  
   
 import com.google.api.translate.Language;  
 import com.google.api.translate.Translate;  
   
 /** 
  * google logo 下载程序 
  */  
 public abstract class Crawler {  
   
     /** 
      * 使用google 翻译api 
      *  
      * @param en 
      * @return 
      */  
     public String translateEnToCinese(String en) {  
         Translate.setHttpReferrer("http://www.xxx.com");  
         try {  
             return Translate.execute(en, Language.ENGLISH, Language.CHINESE);  
         } catch (Exception e) {  
             e.printStackTrace();  
         }  
         return "";  
     }  
   
     /** 
      * 获取一个Map 
      *  
      * @return 
      */  
     public Map<String, Object> getMap() {  
         return new HashMap<String, Object>(0);  
     }  
   
     /** 
      * 下载文件 
      *  
      * @param url 
      *            文件http地址 
      * @param dir 
      *            目标文件 
      * @throws IOException 
      */  
     public void downloadFile(String url, String dir) throws Exception {  
         DefaultHttpClient httpClient = new DefaultHttpClient();  
         HttpProtocolParams.setUserAgent(httpClient.getParams(),  
                         "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");  
         HttpGet httpGet = new HttpGet();  
         httpGet.setURI(new java.net.URI(url));  
           
         InputStream input = null;  
         FileOutputStream output = null;  
         try {  
             HttpResponse response = httpClient.execute(httpGet);  
             HttpEntity entity = response.getEntity();  
             input = entity.getContent();  
             File file = new File(dir);  
             output = FileUtils.openOutputStream(file);  
             IOUtils.copy(input, output);  
         } catch (Exception e){  
             e.printStackTrace();  
         } finally {  
             IOUtils.closeQuietly(output);  
             IOUtils.closeQuietly(input);  
         }  
     }  
   
     /** 
      * 处理GET请求，返回整个页面 
      *  
      * @param url 
      *            访问地址 
      * @param params 
      *            编码参数 
      * @return 
      * @throws Exception 
      */  
     public synchronized String doGet(String url, String... params)  
             throws Exception {  
         DefaultHttpClient httpClient = new DefaultHttpClient(); // 创建httpClient实例  
         HttpProtocolParams.setUserAgent(httpClient.getParams(),  
                         "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");  
         String charset = "UTF-8";  
         if (null != params && params.length >= 1) {  
             charset = params[0];  
         }  
         HttpGet httpGet = new HttpGet(); // 创建get方法实例  
         String content = "";  
         httpGet.setURI(new java.net.URI(url));  
         try {  
             HttpResponse response = httpClient.execute(httpGet); // 执行请求，得到response对象  
             int resStatu = response.getStatusLine().getStatusCode(); // 得到返回的状态码  
             if (resStatu == HttpStatus.SC_OK) { // 200正常  
                 HttpEntity entity = response.getEntity(); // 获得相应的实体  
                 if (entity != null) {  
                     // 使用EntityUtils的toString方法，传递默认编码，在EntityUtils中的默认编码是ISO-8859-1  
                     content = EntityUtils.toString(entity, charset);  
                 }  
             }  
         } catch (Exception e) {  
             System.out.println("访问【" + url + "】出现异常!");  
             e.printStackTrace();  
         } finally {  
             // 关闭资源  
             httpGet.abort();  
             httpClient.getConnectionManager().shutdown();  
         }  
         return content;  
     }  
 }  

        Java代码   
        
      
 package jsoup;  
   
 import java.io.File;  
 import java.io.IOException;  
 import java.util.ArrayList;  
 import java.util.Date;  
 import java.util.List;  
 import java.util.Map;  
   
 import org.apache.commons.io.FileUtils;  
 import org.apache.commons.lang.StringUtils;  
 import org.json.JSONArray;  
 import org.json.JSONObject;  
 import org.jsoup.Jsoup;  
 import org.jsoup.nodes.Document;  
 import org.jsoup.nodes.Element;  
 import org.jsoup.select.Elements;  
   
 /** 
  * google logo 下载程序 
  */  
 public class GoogleLogoCrawler extends Crawler {  
       
     private static final String URL = "http://www.logocollect.com/google/year.php?key=%y&page=%p";   
   
     private static final String LOGO_URL = "http://www.logocollect.com/google/";  
   
     private static final String[] YEARS = new String[] {   
             //"1998", "1999", "2000",  
             //"2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008",   
             "2009", "2010", "2011", "2012" };  
   
     private static final String INDEX = "http://www.logocollect.com/google/year.php?key=%y";   
   
     private static final String DIR_PATH = "D:\\googlelogos\\";  
   
     public void doStart() {  
         JSONArray array = new JSONArray();  
         for (String year : YEARS) {  
             String ind = INDEX.replaceAll("%y", year);  
             int pageCount = getPageCount(ind);  
             for (int i = 1; i < pageCount+1; i++) {  
                 String url = URL.replaceAll("%y", year).replaceAll("%p", i + "");  
                 String path = year + "_" + i;  
                 start(url, array, DIR_PATH + path + "\\", path);  
             }  
         }  
         try {  
             FileUtils.writeStringToFile(new File(DIR_PATH + "json"), array.toString(), "UTF-8");  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
         System.out.println(array);  
     }  
       
     public int getPageCount(String url) {  
         int pageCount = 1;  
         try {  
             org.jsoup.nodes.Document doc = Jsoup.connect(url).get();  
               
             String els = doc.html().toString();  
             int start = els.indexOf("总页数") + 4;  
             String temp = els.substring(start);  
             int end = temp.indexOf("，");  
             pageCount = Integer.parseInt(els.substring(start,start+end));  
             System.out.println(pageCount);  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
         return pageCount;  
     }  
   
     public void start(String url, JSONArray array, String dir, String path) {  
         try {  
             String content = super.doGet(url);  
             Document doc = Jsoup.parse(content);  
             Elements dds = doc.select(".img img");  
             List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(0);  
             for (int i = 0; i < dds.size(); i++) {  
                 Element img = dds.get(i);  
                 String src = img.select("img").first().attr("src");  
                 String title = img.select("img").first().attr("title");  
                 Map<String, Object> map = super.getMap();  
                   
                 map.put("url", LOGO_URL + src);  
                 map.put("title", title);  
                   
                 list.add(map);  
             }  
             JSONArray tempJsonArray = new JSONArray();  
             for (Map<String, Object> map : list) {  
                 JSONObject jsonObject = new JSONObject();  
                 String proxy = StringUtils.substringAfterLast(map.get("url")  
                         .toString(), ".");  
                 long date = new Date().getTime();  
                 String name = date + "." + proxy;  
                 jsonObject.put("url", map.get("url").toString());  
                 jsonObject.put("dir", name);  
                 jsonObject.put("title", map.get("title").toString());  
                   
                 // 翻译  
 //              String dateZh = super.translateEnToCinese(map.get("date")  
 //                      .toString());  
 //              String titleZh = super.translateEnToCinese(map.get("title")  
 //                      .toString());  
 //              json.put("title_zh_cn", dateZh + " - " + titleZh);  
                   
                 // 下载图片  
                 super.downloadFile(map.get("url").toString(), dir + name);  
                 tempJsonArray.put(jsonObject);  
             }  
             array.put(new JSONObject().put(path, tempJsonArray));  
         } catch (Exception e) {  
             e.printStackTrace();  
         }  
     }  
   
     public static void main(String[] args) throws Exception {  
         new GoogleLogoCrawler().doStart();  
     }  
   
 }  

jsoup httpclient

jsoup下载地址 http://www.jsoup.org

httpclient下载地址 http://hc.apache.org/downloads.cgi

其他jar包见附件

        Java代码   
        
      
 package jsoup;  
   
 import java.io.File;  
 import java.io.FileOutputStream;  
 import java.io.IOException;  
 import java.io.InputStream;  
 import java.util.HashMap;  
 import java.util.Map;  
   
 import org.apache.commons.io.FileUtils;  
 import org.apache.commons.io.IOUtils;  
 import org.apache.http.HttpEntity;  
 import org.apache.http.HttpResponse;  
 import org.apache.http.HttpStatus;  
 import org.apache.http.client.methods.HttpGet;  
 import org.apache.http.impl.client.DefaultHttpClient;  
 import org.apache.http.params.HttpProtocolParams;  
 import org.apache.http.util.EntityUtils;  
   
 import com.google.api.translate.Language;  
 import com.google.api.translate.Translate;  
   
 /** 
  * google logo 下载程序 
  */  
 public abstract class Crawler {  
   
     /** 
      * 使用google 翻译api 
      *  
      * @param en 
      * @return 
      */  
     public String translateEnToCinese(String en) {  
         Translate.setHttpReferrer("http://www.xxx.com");  
         try {  
             return Translate.execute(en, Language.ENGLISH, Language.CHINESE);  
         } catch (Exception e) {  
             e.printStackTrace();  
         }  
         return "";  
     }  
   
     /** 
      * 获取一个Map 
      *  
      * @return 
      */  
     public Map<String, Object> getMap() {  
         return new HashMap<String, Object>(0);  
     }  
   
     /** 
      * 下载文件 
      *  
      * @param url 
      *            文件http地址 
      * @param dir 
      *            目标文件 
      * @throws IOException 
      */  
     public void downloadFile(String url, String dir) throws Exception {  
         DefaultHttpClient httpClient = new DefaultHttpClient();  
         HttpProtocolParams.setUserAgent(httpClient.getParams(),  
                         "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");  
         HttpGet httpGet = new HttpGet();  
         httpGet.setURI(new java.net.URI(url));  
           
         InputStream input = null;  
         FileOutputStream output = null;  
         try {  
             HttpResponse response = httpClient.execute(httpGet);  
             HttpEntity entity = response.getEntity();  
             input = entity.getContent();  
             File file = new File(dir);  
             output = FileUtils.openOutputStream(file);  
             IOUtils.copy(input, output);  
         } catch (Exception e){  
             e.printStackTrace();  
         } finally {  
             IOUtils.closeQuietly(output);  
             IOUtils.closeQuietly(input);  
         }  
     }  
   
     /** 
      * 处理GET请求，返回整个页面 
      *  
      * @param url 
      *            访问地址 
      * @param params 
      *            编码参数 
      * @return 
      * @throws Exception 
      */  
     public synchronized String doGet(String url, String... params)  
             throws Exception {  
         DefaultHttpClient httpClient = new DefaultHttpClient(); // 创建httpClient实例  
         HttpProtocolParams.setUserAgent(httpClient.getParams(),  
                         "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");  
         String charset = "UTF-8";  
         if (null != params && params.length >= 1) {  
             charset = params[0];  
         }  
         HttpGet httpGet = new HttpGet(); // 创建get方法实例  
         String content = "";  
         httpGet.setURI(new java.net.URI(url));  
         try {  
             HttpResponse response = httpClient.execute(httpGet); // 执行请求，得到response对象  
             int resStatu = response.getStatusLine().getStatusCode(); // 得到返回的状态码  
             if (resStatu == HttpStatus.SC_OK) { // 200正常  
                 HttpEntity entity = response.getEntity(); // 获得相应的实体  
                 if (entity != null) {  
                     // 使用EntityUtils的toString方法，传递默认编码，在EntityUtils中的默认编码是ISO-8859-1  
                     content = EntityUtils.toString(entity, charset);  
                 }  
             }  
         } catch (Exception e) {  
             System.out.println("访问【" + url + "】出现异常!");  
             e.printStackTrace();  
         } finally {  
             // 关闭资源  
             httpGet.abort();  
             httpClient.getConnectionManager().shutdown();  
         }  
         return content;  
     }  
 }  

        Java代码   
        
      
 package jsoup;  
   
 import java.io.File;  
 import java.io.IOException;  
 import java.util.ArrayList;  
 import java.util.Date;  
 import java.util.List;  
 import java.util.Map;  
   
 import org.apache.commons.io.FileUtils;  
 import org.apache.commons.lang.StringUtils;  
 import org.json.JSONArray;  
 import org.json.JSONObject;  
 import org.jsoup.Jsoup;  
 import org.jsoup.nodes.Document;  
 import org.jsoup.nodes.Element;  
 import org.jsoup.select.Elements;  
   
 /** 
  * google logo 下载程序 
  */  
 public class GoogleLogoCrawler extends Crawler {  
       
     private static final String URL = "http://www.logocollect.com/google/year.php?key=%y&page=%p";   
   
     private static final String LOGO_URL = "http://www.logocollect.com/google/";  
   
     private static final String[] YEARS = new String[] {   
             //"1998", "1999", "2000",  
             //"2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008",   
             "2009", "2010", "2011", "2012" };  
   
     private static final String INDEX = "http://www.logocollect.com/google/year.php?key=%y";   
   
     private static final String DIR_PATH = "D:\\googlelogos\\";  
   
     public void doStart() {  
         JSONArray array = new JSONArray();  
         for (String year : YEARS) {  
             String ind = INDEX.replaceAll("%y", year);  
             int pageCount = getPageCount(ind);  
             for (int i = 1; i < pageCount+1; i++) {  
                 String url = URL.replaceAll("%y", year).replaceAll("%p", i + "");  
                 String path = year + "_" + i;  
                 start(url, array, DIR_PATH + path + "\\", path);  
             }  
         }  
         try {  
             FileUtils.writeStringToFile(new File(DIR_PATH + "json"), array.toString(), "UTF-8");  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
         System.out.println(array);  
     }  
       
     public int getPageCount(String url) {  
         int pageCount = 1;  
         try {  
             org.jsoup.nodes.Document doc = Jsoup.connect(url).get();  
               
             String els = doc.html().toString();  
             int start = els.indexOf("总页数") + 4;  
             String temp = els.substring(start);  
             int end = temp.indexOf("，");  
             pageCount = Integer.parseInt(els.substring(start,start+end));  
             System.out.println(pageCount);  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
         return pageCount;  
     }  
   
     public void start(String url, JSONArray array, String dir, String path) {  
         try {  
             String content = super.doGet(url);  
             Document doc = Jsoup.parse(content);  
             Elements dds = doc.select(".img img");  
             List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(0);  
             for (int i = 0; i < dds.size(); i++) {  
                 Element img = dds.get(i);  
                 String src = img.select("img").first().attr("src");  
                 String title = img.select("img").first().attr("title");  
                 Map<String, Object> map = super.getMap();  
                   
                 map.put("url", LOGO_URL + src);  
                 map.put("title", title);  
                   
                 list.add(map);  
             }  
             JSONArray tempJsonArray = new JSONArray();  
             for (Map<String, Object> map : list) {  
                 JSONObject jsonObject = new JSONObject();  
                 String proxy = StringUtils.substringAfterLast(map.get("url")  
                         .toString(), ".");  
                 long date = new Date().getTime();  
                 String name = date + "." + proxy;  
                 jsonObject.put("url", map.get("url").toString());  
                 jsonObject.put("dir", name);  
                 jsonObject.put("title", map.get("title").toString());  
                   
                 // 翻译  
 //              String dateZh = super.translateEnToCinese(map.get("date")  
 //                      .toString());  
 //              String titleZh = super.translateEnToCinese(map.get("title")  
 //                      .toString());  
 //              json.put("title_zh_cn", dateZh + " - " + titleZh);  
                   
                 // 下载图片  
                 super.downloadFile(map.get("url").toString(), dir + name);  
                 tempJsonArray.put(jsonObject);  
             }  
             array.put(new JSONObject().put(path, tempJsonArray));  
         } catch (Exception e) {  
             e.printStackTrace();  
         }  
     }  
   
     public static void main(String[] args) throws Exception {  
         new GoogleLogoCrawler().doStart();  
     }  
   
 }  

jsoup httpclient

jsoup下载地址 http://www.jsoup.org

httpclient下载地址 http://hc.apache.org/downloads.cgi

其他jar包见附件

        Java代码   
        
      
 package jsoup;  
   
 import java.io.File;  
 import java.io.FileOutputStream;  
 import java.io.IOException;  
 import java.io.InputStream;  
 import java.util.HashMap;  
 import java.util.Map;  
   
 import org.apache.commons.io.FileUtils;  
 import org.apache.commons.io.IOUtils;  
 import org.apache.http.HttpEntity;  
 import org.apache.http.HttpResponse;  
 import org.apache.http.HttpStatus;  
 import org.apache.http.client.methods.HttpGet;  
 import org.apache.http.impl.client.DefaultHttpClient;  
 import org.apache.http.params.HttpProtocolParams;  
 import org.apache.http.util.EntityUtils;  
   
 import com.google.api.translate.Language;  
 import com.google.api.translate.Translate;  
   
 /** 
  * google logo 下载程序 
  */  
 public abstract class Crawler {  
   
     /** 
      * 使用google 翻译api 
      *  
      * @param en 
      * @return 
      */  
     public String translateEnToCinese(String en) {  
         Translate.setHttpReferrer("http://www.xxx.com");  
         try {  
             return Translate.execute(en, Language.ENGLISH, Language.CHINESE);  
         } catch (Exception e) {  
             e.printStackTrace();  
         }  
         return "";  
     }  
   
     /** 
      * 获取一个Map 
      *  
      * @return 
      */  
     public Map<String, Object> getMap() {  
         return new HashMap<String, Object>(0);  
     }  
   
     /** 
      * 下载文件 
      *  
      * @param url 
      *            文件http地址 
      * @param dir 
      *            目标文件 
      * @throws IOException 
      */  
     public void downloadFile(String url, String dir) throws Exception {  
         DefaultHttpClient httpClient = new DefaultHttpClient();  
         HttpProtocolParams.setUserAgent(httpClient.getParams(),  
                         "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");  
         HttpGet httpGet = new HttpGet();  
         httpGet.setURI(new java.net.URI(url));  
           
         InputStream input = null;  
         FileOutputStream output = null;  
         try {  
             HttpResponse response = httpClient.execute(httpGet);  
             HttpEntity entity = response.getEntity();  
             input = entity.getContent();  
             File file = new File(dir);  
             output = FileUtils.openOutputStream(file);  
             IOUtils.copy(input, output);  
         } catch (Exception e){  
             e.printStackTrace();  
         } finally {  
             IOUtils.closeQuietly(output);  
             IOUtils.closeQuietly(input);  
         }  
     }  
   
     /** 
      * 处理GET请求，返回整个页面 
      *  
      * @param url 
      *            访问地址 
      * @param params 
      *            编码参数 
      * @return 
      * @throws Exception 
      */  
     public synchronized String doGet(String url, String... params)  
             throws Exception {  
         DefaultHttpClient httpClient = new DefaultHttpClient(); // 创建httpClient实例  
         HttpProtocolParams.setUserAgent(httpClient.getParams(),  
                         "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");  
         String charset = "UTF-8";  
         if (null != params && params.length >= 1) {  
             charset = params[0];  
         }  
         HttpGet httpGet = new HttpGet(); // 创建get方法实例  
         String content = "";  
         httpGet.setURI(new java.net.URI(url));  
         try {  
             HttpResponse response = httpClient.execute(httpGet); // 执行请求，得到response对象  
             int resStatu = response.getStatusLine().getStatusCode(); // 得到返回的状态码  
             if (resStatu == HttpStatus.SC_OK) { // 200正常  
                 HttpEntity entity = response.getEntity(); // 获得相应的实体  
                 if (entity != null) {  
                     // 使用EntityUtils的toString方法，传递默认编码，在EntityUtils中的默认编码是ISO-8859-1  
                     content = EntityUtils.toString(entity, charset);  
                 }  
             }  
         } catch (Exception e) {  
             System.out.println("访问【" + url + "】出现异常!");  
             e.printStackTrace();  
         } finally {  
             // 关闭资源  
             httpGet.abort();  
             httpClient.getConnectionManager().shutdown();  
         }  
         return content;  
     }  
 }  

        Java代码   
        
      
 package jsoup;  
   
 import java.io.File;  
 import java.io.IOException;  
 import java.util.ArrayList;  
 import java.util.Date;  
 import java.util.List;  
 import java.util.Map;  
   
 import org.apache.commons.io.FileUtils;  
 import org.apache.commons.lang.StringUtils;  
 import org.json.JSONArray;  
 import org.json.JSONObject;  
 import org.jsoup.Jsoup;  
 import org.jsoup.nodes.Document;  
 import org.jsoup.nodes.Element;  
 import org.jsoup.select.Elements;  
   
 /** 
  * google logo 下载程序 
  */  
 public class GoogleLogoCrawler extends Crawler {  
       
     private static final String URL = "http://www.logocollect.com/google/year.php?key=%y&page=%p";   
   
     private static final String LOGO_URL = "http://www.logocollect.com/google/";  
   
     private static final String[] YEARS = new String[] {   
             //"1998", "1999", "2000",  
             //"2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008",   
             "2009", "2010", "2011", "2012" };  
   
     private static final String INDEX = "http://www.logocollect.com/google/year.php?key=%y";   
   
     private static final String DIR_PATH = "D:\\googlelogos\\";  
   
     public void doStart() {  
         JSONArray array = new JSONArray();  
         for (String year : YEARS) {  
             String ind = INDEX.replaceAll("%y", year);  
             int pageCount = getPageCount(ind);  
             for (int i = 1; i < pageCount+1; i++) {  
                 String url = URL.replaceAll("%y", year).replaceAll("%p", i + "");  
                 String path = year + "_" + i;  
                 start(url, array, DIR_PATH + path + "\\", path);  
             }  
         }  
         try {  
             FileUtils.writeStringToFile(new File(DIR_PATH + "json"), array.toString(), "UTF-8");  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
         System.out.println(array);  
     }  
       
     public int getPageCount(String url) {  
         int pageCount = 1;  
         try {  
             org.jsoup.nodes.Document doc = Jsoup.connect(url).get();  
               
             String els = doc.html().toString();  
             int start = els.indexOf("总页数") + 4;  
             String temp = els.substring(start);  
             int end = temp.indexOf("，");  
             pageCount = Integer.parseInt(els.substring(start,start+end));  
             System.out.println(pageCount);  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
         return pageCount;  
     }  
   
     public void start(String url, JSONArray array, String dir, String path) {  
         try {  
             String content = super.doGet(url);  
             Document doc = Jsoup.parse(content);  
             Elements dds = doc.select(".img img");  
             List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(0);  
             for (int i = 0; i < dds.size(); i++) {  
                 Element img = dds.get(i);  
                 String src = img.select("img").first().attr("src");  
                 String title = img.select("img").first().attr("title");  
                 Map<String, Object> map = super.getMap();  
                   
                 map.put("url", LOGO_URL + src);  
                 map.put("title", title);  
                   
                 list.add(map);  
             }  
             JSONArray tempJsonArray = new JSONArray();  
             for (Map<String, Object> map : list) {  
                 JSONObject jsonObject = new JSONObject();  
                 String proxy = StringUtils.substringAfterLast(map.get("url")  
                         .toString(), ".");  
                 long date = new Date().getTime();  
                 String name = date + "." + proxy;  
                 jsonObject.put("url", map.get("url").toString());  
                 jsonObject.put("dir", name);  
                 jsonObject.put("title", map.get("title").toString());  
                   
                 // 翻译  
 //              String dateZh = super.translateEnToCinese(map.get("date")  
 //                      .toString());  
 //              String titleZh = super.translateEnToCinese(map.get("title")  
 //                      .toString());  
 //              json.put("title_zh_cn", dateZh + " - " + titleZh);  
                   
                 // 下载图片  
                 super.downloadFile(map.get("url").toString(), dir + name);  
                 tempJsonArray.put(jsonObject);  
             }  
             array.put(new JSONObject().put(path, tempJsonArray));  
         } catch (Exception e) {  
             e.printStackTrace();  
         }  
     }  
   
     public static void main(String[] args) throws Exception {  
         new GoogleLogoCrawler().doStart();  
     }  
   
 }  

jsoup httpclient

jsoup下载地址 http://www.jsoup.org

httpclient下载地址 http://hc.apache.org/downloads.cgi

其他jar包见附件

      Java代码   
      
    
 package jsoup;  
   
 import java.io.File;  
 import java.io.FileOutputStream;  
 import java.io.IOException;  
 import java.io.InputStream;  
 import java.util.HashMap;  
 import java.util.Map;  
   
 import org.apache.commons.io.FileUtils;  
 import org.apache.commons.io.IOUtils;  
 import org.apache.http.HttpEntity;  
 import org.apache.http.HttpResponse;  
 import org.apache.http.HttpStatus;  
 import org.apache.http.client.methods.HttpGet;  
 import org.apache.http.impl.client.DefaultHttpClient;  
 import org.apache.http.params.HttpProtocolParams;  
 import org.apache.http.util.EntityUtils;  
   
 import com.google.api.translate.Language;  
 import com.google.api.translate.Translate;  
   
 /** 
  * google logo 下载程序 
  */  
 public abstract class Crawler {  
   
     /** 
      * 使用google 翻译api 
      *  
      * @param en 
      * @return 
      */  
     public String translateEnToCinese(String en) {  
         Translate.setHttpReferrer("http://www.xxx.com");  
         try {  
             return Translate.execute(en, Language.ENGLISH, Language.CHINESE);  
         } catch (Exception e) {  
             e.printStackTrace();  
         }  
         return "";  
     }  
   
     /** 
      * 获取一个Map 
      *  
      * @return 
      */  
     public Map<String, Object> getMap() {  
         return new HashMap<String, Object>(0);  
     }  
   
     /** 
      * 下载文件 
      *  
      * @param url 
      *            文件http地址 
      * @param dir 
      *            目标文件 
      * @throws IOException 
      */  
     public void downloadFile(String url, String dir) throws Exception {  
         DefaultHttpClient httpClient = new DefaultHttpClient();  
         HttpProtocolParams.setUserAgent(httpClient.getParams(),  
                         "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");  
         HttpGet httpGet = new HttpGet();  
         httpGet.setURI(new java.net.URI(url));  
           
         InputStream input = null;  
         FileOutputStream output = null;  
         try {  
             HttpResponse response = httpClient.execute(httpGet);  
             HttpEntity entity = response.getEntity();  
             input = entity.getContent();  
             File file = new File(dir);  
             output = FileUtils.openOutputStream(file);  
             IOUtils.copy(input, output);  
         } catch (Exception e){  
             e.printStackTrace();  
         } finally {  
             IOUtils.closeQuietly(output);  
             IOUtils.closeQuietly(input);  
         }  
     }  
   
     /** 
      * 处理GET请求，返回整个页面 
      *  
      * @param url 
      *            访问地址 
      * @param params 
      *            编码参数 
      * @return 
      * @throws Exception 
      */  
     public synchronized String doGet(String url, String... params)  
             throws Exception {  
         DefaultHttpClient httpClient = new DefaultHttpClient(); // 创建httpClient实例  
         HttpProtocolParams.setUserAgent(httpClient.getParams(),  
                         "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");  
         String charset = "UTF-8";  
         if (null != params && params.length >= 1) {  
             charset = params[0];  
         }  
         HttpGet httpGet = new HttpGet(); // 创建get方法实例  
         String content = "";  
         httpGet.setURI(new java.net.URI(url));  
         try {  
             HttpResponse response = httpClient.execute(httpGet); // 执行请求，得到response对象  
             int resStatu = response.getStatusLine().getStatusCode(); // 得到返回的状态码  
             if (resStatu == HttpStatus.SC_OK) { // 200正常  
                 HttpEntity entity = response.getEntity(); // 获得相应的实体  
                 if (entity != null) {  
                     // 使用EntityUtils的toString方法，传递默认编码，在EntityUtils中的默认编码是ISO-8859-1  
                     content = EntityUtils.toString(entity, charset);  
                 }  
             }  
         } catch (Exception e) {  
             System.out.println("访问【" + url + "】出现异常!");  
             e.printStackTrace();  
         } finally {  
             // 关闭资源  
             httpGet.abort();  
             httpClient.getConnectionManager().shutdown();  
         }  
         return content;  
     }  
 }  

      Java代码   
      
    
 package jsoup;  
   
 import java.io.File;  
 import java.io.IOException;  
 import java.util.ArrayList;  
 import java.util.Date;  
 import java.util.List;  
 import java.util.Map;  
   
 import org.apache.commons.io.FileUtils;  
 import org.apache.commons.lang.StringUtils;  
 import org.json.JSONArray;  
 import org.json.JSONObject;  
 import org.jsoup.Jsoup;  
 import org.jsoup.nodes.Document;  
 import org.jsoup.nodes.Element;  
 import org.jsoup.select.Elements;  
   
 /** 
  * google logo 下载程序 
  */  
 public class GoogleLogoCrawler extends Crawler {  
       
     private static final String URL = "http://www.logocollect.com/google/year.php?key=%y&page=%p";   
   
     private static final String LOGO_URL = "http://www.logocollect.com/google/";  
   
     private static final String[] YEARS = new String[] {   
             //"1998", "1999", "2000",  
             //"2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008",   
             "2009", "2010", "2011", "2012" };  
   
     private static final String INDEX = "http://www.logocollect.com/google/year.php?key=%y";   
   
     private static final String DIR_PATH = "D:\\googlelogos\\";  
   
     public void doStart() {  
         JSONArray array = new JSONArray();  
         for (String year : YEARS) {  
             String ind = INDEX.replaceAll("%y", year);  
             int pageCount = getPageCount(ind);  
             for (int i = 1; i < pageCount+1; i++) {  
                 String url = URL.replaceAll("%y", year).replaceAll("%p", i + "");  
                 String path = year + "_" + i;  
                 start(url, array, DIR_PATH + path + "\\", path);  
             }  
         }  
         try {  
             FileUtils.writeStringToFile(new File(DIR_PATH + "json"), array.toString(), "UTF-8");  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
         System.out.println(array);  
     }  
       
     public int getPageCount(String url) {  
         int pageCount = 1;  
         try {  
             org.jsoup.nodes.Document doc = Jsoup.connect(url).get();  
               
             String els = doc.html().toString();  
             int start = els.indexOf("总页数") + 4;  
             String temp = els.substring(start);  
             int end = temp.indexOf("，");  
             pageCount = Integer.parseInt(els.substring(start,start+end));  
             System.out.println(pageCount);  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
         return pageCount;  
     }  
   
     public void start(String url, JSONArray array, String dir, String path) {  
         try {  
             String content = super.doGet(url);  
             Document doc = Jsoup.parse(content);  
             Elements dds = doc.select(".img img");  
             List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(0);  
             for (int i = 0; i < dds.size(); i++) {  
                 Element img = dds.get(i);  
                 String src = img.select("img").first().attr("src");  
                 String title = img.select("img").first().attr("title");  
                 Map<String, Object> map = super.getMap();  
                   
                 map.put("url", LOGO_URL + src);  
                 map.put("title", title);  
                   
                 list.add(map);  
             }  
             JSONArray tempJsonArray = new JSONArray();  
             for (Map<String, Object> map : list) {  
                 JSONObject jsonObject = new JSONObject();  
                 String proxy = StringUtils.substringAfterLast(map.get("url")  
                         .toString(), ".");  
                 long date = new Date().getTime();  
                 String name = date + "." + proxy;  
                 jsonObject.put("url", map.get("url").toString());  
                 jsonObject.put("dir", name);  
                 jsonObject.put("title", map.get("title").toString());  
                   
                 // 翻译  
 //              String dateZh = super.translateEnToCinese(map.get("date")  
 //                      .toString());  
 //              String titleZh = super.translateEnToCinese(map.get("title")  
 //                      .toString());  
 //              json.put("title_zh_cn", dateZh + " - " + titleZh);  
                   
                 // 下载图片  
                 super.downloadFile(map.get("url").toString(), dir + name);  
                 tempJsonArray.put(jsonObject);  
             }  
             array.put(new JSONObject().put(path, tempJsonArray));  
         } catch (Exception e) {  
             e.printStackTrace();  
         }  
     }  
   
     public static void main(String[] args) throws Exception {  
         new GoogleLogoCrawler().doStart();  
     }  
   
 }  

jsoup httpclient

jsoup下载地址 http://www.jsoup.org

httpclient下载地址 http://hc.apache.org/downloads.cgi

其他jar包见附件

      Java代码   
      
    
 package jsoup;  
   
 import java.io.File;  
 import java.io.FileOutputStream;  
 import java.io.IOException;  
 import java.io.InputStream;  
 import java.util.HashMap;  
 import java.util.Map;  
   
 import org.apache.commons.io.FileUtils;  
 import org.apache.commons.io.IOUtils;  
 import org.apache.http.HttpEntity;  
 import org.apache.http.HttpResponse;  
 import org.apache.http.HttpStatus;  
 import org.apache.http.client.methods.HttpGet;  
 import org.apache.http.impl.client.DefaultHttpClient;  
 import org.apache.http.params.HttpProtocolParams;  
 import org.apache.http.util.EntityUtils;  
   
 import com.google.api.translate.Language;  
 import com.google.api.translate.Translate;  
   
 /** 
  * google logo 下载程序 
  */  
 public abstract class Crawler {  
   
     /** 
      * 使用google 翻译api 
      *  
      * @param en 
      * @return 
      */  
     public String translateEnToCinese(String en) {  
         Translate.setHttpReferrer("http://www.xxx.com");  
         try {  
             return Translate.execute(en, Language.ENGLISH, Language.CHINESE);  
         } catch (Exception e) {  
             e.printStackTrace();  
         }  
         return "";  
     }  
   
     /** 
      * 获取一个Map 
      *  
      * @return 
      */  
     public Map<String, Object> getMap() {  
         return new HashMap<String, Object>(0);  
     }  
   
     /** 
      * 下载文件 
      *  
      * @param url 
      *            文件http地址 
      * @param dir 
      *            目标文件 
      * @throws IOException 
      */  
     public void downloadFile(String url, String dir) throws Exception {  
         DefaultHttpClient httpClient = new DefaultHttpClient();  
         HttpProtocolParams.setUserAgent(httpClient.getParams(),  
                         "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");  
         HttpGet httpGet = new HttpGet();  
         httpGet.setURI(new java.net.URI(url));  
           
         InputStream input = null;  
         FileOutputStream output = null;  
         try {  
             HttpResponse response = httpClient.execute(httpGet);  
             HttpEntity entity = response.getEntity();  
             input = entity.getContent();  
             File file = new File(dir);  
             output = FileUtils.openOutputStream(file);  
             IOUtils.copy(input, output);  
         } catch (Exception e){  
             e.printStackTrace();  
         } finally {  
             IOUtils.closeQuietly(output);  
             IOUtils.closeQuietly(input);  
         }  
     }  
   
     /** 
      * 处理GET请求，返回整个页面 
      *  
      * @param url 
      *            访问地址 
      * @param params 
      *            编码参数 
      * @return 
      * @throws Exception 
      */  
     public synchronized String doGet(String url, String... params)  
             throws Exception {  
         DefaultHttpClient httpClient = new DefaultHttpClient(); // 创建httpClient实例  
         HttpProtocolParams.setUserAgent(httpClient.getParams(),  
                         "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");  
         String charset = "UTF-8";  
         if (null != params && params.length >= 1) {  
             charset = params[0];  
         }  
         HttpGet httpGet = new HttpGet(); // 创建get方法实例  
         String content = "";  
         httpGet.setURI(new java.net.URI(url));  
         try {  
             HttpResponse response = httpClient.execute(httpGet); // 执行请求，得到response对象  
             int resStatu = response.getStatusLine().getStatusCode(); // 得到返回的状态码  
             if (resStatu == HttpStatus.SC_OK) { // 200正常  
                 HttpEntity entity = response.getEntity(); // 获得相应的实体  
                 if (entity != null) {  
                     // 使用EntityUtils的toString方法，传递默认编码，在EntityUtils中的默认编码是ISO-8859-1  
                     content = EntityUtils.toString(entity, charset);  
                 }  
             }  
         } catch (Exception e) {  
             System.out.println("访问【" + url + "】出现异常!");  
             e.printStackTrace();  
         } finally {  
             // 关闭资源  
             httpGet.abort();  
             httpClient.getConnectionManager().shutdown();  
         }  
         return content;  
     }  
 }  

      Java代码   
      
    
 package jsoup;  
   
 import java.io.File;  
 import java.io.IOException;  
 import java.util.ArrayList;  
 import java.util.Date;  
 import java.util.List;  
 import java.util.Map;  
   
 import org.apache.commons.io.FileUtils;  
 import org.apache.commons.lang.StringUtils;  
 import org.json.JSONArray;  
 import org.json.JSONObject;  
 import org.jsoup.Jsoup;  
 import org.jsoup.nodes.Document;  
 import org.jsoup.nodes.Element;  
 import org.jsoup.select.Elements;  
   
 /** 
  * google logo 下载程序 
  */  
 public class GoogleLogoCrawler extends Crawler {  
       
     private static final String URL = "http://www.logocollect.com/google/year.php?key=%y&page=%p";   
   
     private static final String LOGO_URL = "http://www.logocollect.com/google/";  
   
     private static final String[] YEARS = new String[] {   
             //"1998", "1999", "2000",  
             //"2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008",   
             "2009", "2010", "2011", "2012" };  
   
     private static final String INDEX = "http://www.logocollect.com/google/year.php?key=%y";   
   
     private static final String DIR_PATH = "D:\\googlelogos\\";  
   
     public void doStart() {  
         JSONArray array = new JSONArray();  
         for (String year : YEARS) {  
             String ind = INDEX.replaceAll("%y", year);  
             int pageCount = getPageCount(ind);  
             for (int i = 1; i < pageCount+1; i++) {  
                 String url = URL.replaceAll("%y", year).replaceAll("%p", i + "");  
                 String path = year + "_" + i;  
                 start(url, array, DIR_PATH + path + "\\", path);  
             }  
         }  
         try {  
             FileUtils.writeStringToFile(new File(DIR_PATH + "json"), array.toString(), "UTF-8");  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
         System.out.println(array);  
     }  
       
     public int getPageCount(String url) {  
         int pageCount = 1;  
         try {  
             org.jsoup.nodes.Document doc = Jsoup.connect(url).get();  
               
             String els = doc.html().toString();  
             int start = els.indexOf("总页数") + 4;  
             String temp = els.substring(start);  
             int end = temp.indexOf("，");  
             pageCount = Integer.parseInt(els.substring(start,start+end));  
             System.out.println(pageCount);  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
         return pageCount;  
     }  
   
     public void start(String url, JSONArray array, String dir, String path) {  
         try {  
             String content = super.doGet(url);  
             Document doc = Jsoup.parse(content);  
             Elements dds = doc.select(".img img");  
             List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(0);  
             for (int i = 0; i < dds.size(); i++) {  
                 Element img = dds.get(i);  
                 String src = img.select("img").first().attr("src");  
                 String title = img.select("img").first().attr("title");  
                 Map<String, Object> map = super.getMap();  
                   
                 map.put("url", LOGO_URL + src);  
                 map.put("title", title);  
                   
                 list.add(map);  
             }  
             JSONArray tempJsonArray = new JSONArray();  
             for (Map<String, Object> map : list) {  
                 JSONObject jsonObject = new JSONObject();  
                 String proxy = StringUtils.substringAfterLast(map.get("url")  
                         .toString(), ".");  
                 long date = new Date().getTime();  
                 String name = date + "." + proxy;  
                 jsonObject.put("url", map.get("url").toString());  
                 jsonObject.put("dir", name);  
                 jsonObject.put("title", map.get("title").toString());  
                   
                 // 翻译  
 //              String dateZh = super.translateEnToCinese(map.get("date")  
 //                      .toString());  
 //              String titleZh = super.translateEnToCinese(map.get("title")  
 //                      .toString());  
 //              json.put("title_zh_cn", dateZh + " - " + titleZh);  
                   
                 // 下载图片  
                 super.downloadFile(map.get("url").toString(), dir + name);  
                 tempJsonArray.put(jsonObject);  
             }  
             array.put(new JSONObject().put(path, tempJsonArray));  
         } catch (Exception e) {  
             e.printStackTrace();  
         }  
     }  
   
     public static void main(String[] args) throws Exception {  
         new GoogleLogoCrawler().doStart();  
     }  
   
 }  

爬虫仔蛙

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
jsoup httpclient 爬取网页并下载google图标

jsoup httpclient：jsoup下载地址 http://www.jsoup.org ；httpclient下载地址 http://hc.apache.org/downloads.cgi ；其他jar包见附件。Java代码 package jsoup; import java.io.File; import java.
复制链接

扫一扫