使用HttpClient4 和jsoup下载Google 1998年到2010年的logo(原创)

我个人比较喜欢Google的logo,感觉设计得很好。以前都是看到喜欢的logo就手动下载下来(通过浏览器右键图片另存为)。最近看了Java的jsoup(HTML解析器),突发奇想:能不能把Google的logo全部拿下来?通过分析Google的页面发现并不复杂,所以就决定把它们都拿下来了。由于Google logos页面上的说明都是英文的,于是想到再利用Google翻译API对logo说明进行翻译,最终把图片信息以及翻译结果用JSON保存下来。
最终将这些logo放到了我的导航189站点上,供喜欢Google logo的网友欣赏,哈哈。
地址:
[url]http://www.dh189.com/p/logos/google/10_3.html[/url]

具体代码实现如下:
Crawler.java

package com.googlelogo;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.HttpProtocolParams;
import org.apache.http.util.EntityUtils;

import com.google.api.translate.Language;
import com.google.api.translate.Translate;

/**
 * Base class for the Google logo downloader.
 *
 * <p>Bundles the helpers shared by concrete crawlers: fetching a page as text,
 * downloading a file to disk (both via HttpClient 4) and translating English text
 * to Chinese through the Google Translate API.
 *
 * http://www.dh189.com/
 * @author ZZJ
 */
public abstract class Crawler {

    /** User-Agent header sent with every HTTP request issued by this class. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9";

    /**
     * Translates English text to Chinese using the Google Translate API.
     *
     * <p>This is best-effort: if the translation call throws, the stack trace is
     * printed and an empty string is returned instead of propagating the failure.
     *
     * @param en the English text to translate
     * @return the Chinese translation, or {@code ""} if the translation call failed
     */
    public String translateEnToCinese(String en) {
        Translate.setHttpReferrer("http://www.dh189.com");
        try {
            return Translate.execute(en, Language.ENGLISH, Language.CHINESE);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Creates a new, empty map.
     *
     * @return a fresh mutable {@code HashMap} created with an initial capacity of 0
     */
    public Map<String, Object> getMap() {
        return new HashMap<String, Object>(0);
    }

    /**
     * Downloads the resource at {@code url} and writes its body to the file {@code dir}.
     *
     * <p>The HTTP status code is not inspected; whatever body the server returns is
     * written to the target file.
     *
     * @param url HTTP address of the file to download
     * @param dir path of the target file (despite the name, this is a file path,
     *            not a directory)
     * @throws IOException if the response carries no body, or if reading the body or
     *                     writing the file fails
     * @throws Exception   if the URL is malformed or the HTTP request fails
     */
    public void downloadFile(String url, String dir) throws Exception {
        DefaultHttpClient httpclient = new DefaultHttpClient();
        try {
            HttpProtocolParams.setUserAgent(httpclient.getParams(), USER_AGENT);
            HttpGet httpget = new HttpGet();
            httpget.setURI(new java.net.URI(url));
            HttpResponse response = httpclient.execute(httpget);
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                // Fail with a descriptive error instead of a NullPointerException below.
                throw new IOException("No response body returned for " + url);
            }
            InputStream input = null;
            try {
                input = entity.getContent();
                FileOutputStream output = FileUtils.openOutputStream(new File(dir));
                try {
                    IOUtils.copy(input, output);
                } finally {
                    IOUtils.closeQuietly(output);
                }
            } finally {
                IOUtils.closeQuietly(input);
            }
        } finally {
            // Each call creates its own client, so its connection manager must always
            // be shut down, on success and on failure alike.
            httpclient.getConnectionManager().shutdown();
        }
    }

    /**
     * Performs a GET request and returns the whole response body as a string.
     *
     * @param url    address to fetch
     * @param params optional; if at least one value is given, the first one is the
     *               charset name passed to {@code EntityUtils.toString}; otherwise
     *               {@code "UTF-8"} is used
     * @return the response body, or {@code ""} if the response has no entity
     * @throws Exception if the URL is malformed, the request fails or the body cannot
     *                   be read
     */
    public synchronized String doGet(String url, String... params) throws Exception {
        String charset = "UTF-8";
        if (null != params && params.length >= 1) {
            charset = params[0];
        }
        DefaultHttpClient httpclient = new DefaultHttpClient();
        HttpGet httpget = new HttpGet();
        try {
            HttpProtocolParams.setUserAgent(httpclient.getParams(), USER_AGENT);
            httpget.setURI(new java.net.URI(url));
            HttpResponse response = httpclient.execute(httpget);
            HttpEntity entity = response.getEntity();
            String content = "";
            if (entity != null) {
                // Pass the charset explicitly: per the original author's note,
                // EntityUtils otherwise falls back to ISO-8859-1.
                content = EntityUtils.toString(entity, charset);
            }
            return content;
        } finally {
            // Release the connection even when execute() or toString() throws.
            httpget.abort();
            httpclient.getConnectionManager().shutdown();
        }
    }
}


GoogleLogoCrawler.java

package com.googlelogo;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Google logo downloader.
 *
 * <p>Walks the quarterly Google logo pages for each year in {@code YEARS}, plus the
 * logo index page. For every logo found it downloads the image, translates its date
 * and caption to Chinese, and records the result as JSON.
 *
 * http://www.dh189.com/
 * @author ZZJ
 */
public class GoogleLogoCrawler extends Crawler {

    /** Page URL template; {@code %y} is the two-digit year, {@code %n} the page number (1-4). */
    private static final String URL = "http://www.google.com.hk/logos/logos%y-%n.html";

    /** Prefix prepended to the {@code src} attribute of each logo image. */
    private static final String LOGO_URL = "http://www.google.com.hk";

    /** Two-digit years to crawl. */
    private static final String[] YEARS = new String[] { "98", "99", "00", "01", "02", "03", "04", "05",
            "06", "07", "08", "09", "10" };

    /** Logo index page, crawled in addition to the per-year pages. */
    private static final String INDEX = "http://www.google.com.hk/logos/index.html";

    /** Root directory that receives the downloaded images and the JSON summary. */
    private static final String DIR_PATH="D:\\googlelogos\\";

    /**
     * Crawls every configured page, then writes the collected metadata to the file
     * {@code json} under {@code DIR_PATH} (UTF-8) and prints it to standard output.
     */
    public void doStart() {
        JSONArray array = new JSONArray();
        for (String year : YEARS) {
            for (int i = 1; i < 5; i++) {
                // Literal placeholder substitution; no regular expression is needed.
                String url = URL.replace("%y", year).replace("%n", String.valueOf(i));
                String path = year + "_" + i;
                start(url, array, DIR_PATH + path + "\\", path);
            }
        }
        // The index page is stored under the "10_3" key and directory
        // (presumably it listed the then-current quarter; verify before reuse).
        start(INDEX, array, DIR_PATH + "10_3\\", "10_3");
        try {
            FileUtils.writeStringToFile(new File(DIR_PATH + "json"), array.toString(), "UTF-8");
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println(array);
    }

    /**
     * Processes one logo page: downloads every logo on it and appends a JSON object
     * of the form {@code {path: [logo, ...]}} to {@code array}.
     *
     * <p>A failure while handling a single logo is printed and that logo is skipped,
     * so the remaining logos of the page are still recorded. A failure to fetch or
     * parse the page itself is printed and nothing is appended.
     *
     * @param url   address of the logo page
     * @param array accumulator that receives this page's result
     * @param dir   directory prefix (with trailing separator) for the downloaded images
     * @param path  key under which this page's logos are stored in the JSON output
     */
    public void start(String url, JSONArray array, String dir, String path) {
        try {
            Document doc = Jsoup.parse(super.doGet(url));
            JSONArray logos = new JSONArray();
            for (Map<String, Object> entry : parseEntries(doc)) {
                try {
                    logos.put(processEntry(entry, dir));
                } catch (Exception e) {
                    // One broken logo must not discard the records of the whole page.
                    e.printStackTrace();
                }
            }
            array.put(new JSONObject().put(path, logos));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts the date, image URL and caption of every logo listed in the document.
     * Entries are read from the {@code .doodles dt} / {@code .doodles dd} pairs;
     * unpaired elements and {@code dd} elements without an image are skipped.
     */
    private List<Map<String, Object>> parseEntries(Document doc) {
        Elements dts = doc.select(".doodles dt");
        Elements dds = doc.select(".doodles dd");
        // Guard against pages where the dt and dd counts differ.
        int count = Math.min(dts.size(), dds.size());
        List<Map<String, Object>> entries = new ArrayList<Map<String, Object>>(count);
        for (int i = 0; i < count; i++) {
            Element dt = dts.get(i);
            Element dd = dds.get(i);
            Element img = dd.select("img").first();
            if (img == null) {
                // No image to download for this entry.
                continue;
            }
            Map<String, Object> entry = super.getMap();
            entry.put("date", dt.text());
            entry.put("url", LOGO_URL + img.attr("src"));
            entry.put("title", dd.text());
            entries.add(entry);
        }
        return entries;
    }

    /**
     * Builds the JSON record for one logo, translates its date and caption, and
     * downloads the image into {@code dir} under a timestamp-based file name.
     */
    private JSONObject processEntry(Map<String, Object> entry, String dir) throws Exception {
        String logoUrl = entry.get("url").toString();
        String extension = StringUtils.substringAfterLast(logoUrl, ".");
        // The local file name is the current time in milliseconds plus the original extension.
        String name = new Date().getTime() + "." + extension;

        JSONObject json = new JSONObject();
        json.put("url", logoUrl);
        json.put("dir", name);
        json.put("title_en", entry.get("title"));
        json.put("date", entry.get("date"));

        // Translate
        String dateZh = super.translateEnToCinese(entry.get("date").toString());
        String titleZh = super.translateEnToCinese(entry.get("title").toString());
        json.put("title_zh_cn", dateZh + " - " + titleZh);

        // Download the image
        super.downloadFile(logoUrl, dir + name);
        return json;
    }

    public static void main(String[] args) throws Exception {
        new GoogleLogoCrawler().doStart();
    }

}


下载下来的文件如下:

[img]http://dl.iteye.com/upload/attachment/293076/96065e9c-526a-33af-adc8-14ff38bd0ca8.jpg[/img]

最终保存的json数据:


[img]http://dl.iteye.com/upload/attachment/293081/8cd17da1-507f-3aa1-ba3b-cdbb5e47b45e.jpg[/img]

在导航189上的显示(2010年)

[img]http://dl.iteye.com/upload/attachment/293092/89c45f3b-3c63-38ed-967f-c812e7a555f1.jpg[/img]
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值