jsoup下载地址 http://www.jsoup.org
httpclient下载地址 http://hc.apache.org/downloads.cgi
其他jar包见附件
- package jsoup;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.HashMap;
- import java.util.Map;
- import org.apache.commons.io.FileUtils;
- import org.apache.commons.io.IOUtils;
- import org.apache.http.HttpEntity;
- import org.apache.http.HttpResponse;
- import org.apache.http.HttpStatus;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.DefaultHttpClient;
- import org.apache.http.params.HttpProtocolParams;
- import org.apache.http.util.EntityUtils;
- import com.google.api.translate.Language;
- import com.google.api.translate.Translate;
- /**
- * google logo 下载程序
- */
- public abstract class Crawler {
- /**
- * 使用google 翻译api
- *
- * @param en
- * @return
- */
- public String translateEnToCinese(String en) {
- Translate.setHttpReferrer("http://www.xxx.com");
- try {
- return Translate.execute(en, Language.ENGLISH, Language.CHINESE);
- } catch (Exception e) {
- e.printStackTrace();
- }
- return "";
- }
- /**
- * 获取一个Map
- *
- * @return
- */
- public Map<String, Object> getMap() {
- return new HashMap<String, Object>(0);
- }
- /**
- * 下载文件
- *
- * @param url
- * 文件http地址
- * @param dir
- * 目标文件
- * @throws IOException
- */
- public void downloadFile(String url, String dir) throws Exception {
- DefaultHttpClient httpClient = new DefaultHttpClient();
- HttpProtocolParams.setUserAgent(httpClient.getParams(),
- "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");
- HttpGet httpGet = new HttpGet();
- httpGet.setURI(new java.net.URI(url));
- InputStream input = null;
- FileOutputStream output = null;
- try {
- HttpResponse response = httpClient.execute(httpGet);
- HttpEntity entity = response.getEntity();
- input = entity.getContent();
- File file = new File(dir);
- output = FileUtils.openOutputStream(file);
- IOUtils.copy(input, output);
- } catch (Exception e){
- e.printStackTrace();
- } finally {
- IOUtils.closeQuietly(output);
- IOUtils.closeQuietly(input);
- }
- }
- /**
- * 处理GET请求,返回整个页面
- *
- * @param url
- * 访问地址
- * @param params
- * 编码参数
- * @return
- * @throws Exception
- */
- public synchronized String doGet(String url, String... params)
- throws Exception {
- DefaultHttpClient httpClient = new DefaultHttpClient(); // 创建httpClient实例
- HttpProtocolParams.setUserAgent(httpClient.getParams(),
- "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");
- String charset = "UTF-8";
- if (null != params && params.length >= 1) {
- charset = params[0];
- }
- HttpGet httpGet = new HttpGet(); // 创建get方法实例
- String content = "";
- httpGet.setURI(new java.net.URI(url));
- try {
- HttpResponse response = httpClient.execute(httpGet); // 执行请求,得到response对象
- int resStatu = response.getStatusLine().getStatusCode(); // 得到返回的状态码
- if (resStatu == HttpStatus.SC_OK) { // 200正常
- HttpEntity entity = response.getEntity(); // 获得相应的实体
- if (entity != null) {
- // 使用EntityUtils的toString方法,传递默认编码,在EntityUtils中的默认编码是ISO-8859-1
- content = EntityUtils.toString(entity, charset);
- }
- }
- } catch (Exception e) {
- System.out.println("访问【" + url + "】出现异常!");
- e.printStackTrace();
- } finally {
- // 关闭资源
- httpGet.abort();
- httpClient.getConnectionManager().shutdown();
- }
- return content;
- }
- }
- package jsoup;
- import java.io.File;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.List;
- import java.util.Map;
- import org.apache.commons.io.FileUtils;
- import org.apache.commons.lang.StringUtils;
- import org.json.JSONArray;
- import org.json.JSONObject;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- /**
- * google logo 下载程序
- */
- public class GoogleLogoCrawler extends Crawler {
- private static final String URL = "http://www.logocollect.com/google/year.php?key=%y&page=%p";
- private static final String LOGO_URL = "http://www.logocollect.com/google/";
- private static final String[] YEARS = new String[] {
- //"1998", "1999", "2000",
- //"2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008",
- "2009", "2010", "2011", "2012" };
- private static final String INDEX = "http://www.logocollect.com/google/year.php?key=%y";
- private static final String DIR_PATH = "D:\\googlelogos\\";
- public void doStart() {
- JSONArray array = new JSONArray();
- for (String year : YEARS) {
- String ind = INDEX.replaceAll("%y", year);
- int pageCount = getPageCount(ind);
- for (int i = 1; i < pageCount+1; i++) {
- String url = URL.replaceAll("%y", year).replaceAll("%p", i + "");
- String path = year + "_" + i;
- start(url, array, DIR_PATH + path + "\\", path);
- }
- }
- try {
- FileUtils.writeStringToFile(new File(DIR_PATH + "json"), array.toString(), "UTF-8");
- } catch (IOException e) {
- e.printStackTrace();
- }
- System.out.println(array);
- }
- public int getPageCount(String url) {
- int pageCount = 1;
- try {
- org.jsoup.nodes.Document doc = Jsoup.connect(url).get();
- String els = doc.html().toString();
- int start = els.indexOf("总页数") + 4;
- String temp = els.substring(start);
- int end = temp.indexOf(",");
- pageCount = Integer.parseInt(els.substring(start,start+end));
- System.out.println(pageCount);
- } catch (IOException e) {
- e.printStackTrace();
- }
- return pageCount;
- }
- public void start(String url, JSONArray array, String dir, String path) {
- try {
- String content = super.doGet(url);
- Document doc = Jsoup.parse(content);
- Elements dds = doc.select(".img img");
- List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(0);
- for (int i = 0; i < dds.size(); i++) {
- Element img = dds.get(i);
- String src = img.select("img").first().attr("src");
- String title = img.select("img").first().attr("title");
- Map<String, Object> map = super.getMap();
- map.put("url", LOGO_URL + src);
- map.put("title", title);
- list.add(map);
- }
- JSONArray tempJsonArray = new JSONArray();
- for (Map<String, Object> map : list) {
- JSONObject jsonObject = new JSONObject();
- String proxy = StringUtils.substringAfterLast(map.get("url")
- .toString(), ".");
- long date = new Date().getTime();
- String name = date + "." + proxy;
- jsonObject.put("url", map.get("url").toString());
- jsonObject.put("dir", name);
- jsonObject.put("title", map.get("title").toString());
- // 翻译
- // String dateZh = super.translateEnToCinese(map.get("date")
- // .toString());
- // String titleZh = super.translateEnToCinese(map.get("title")
- // .toString());
- // json.put("title_zh_cn", dateZh + " - " + titleZh);
- // 下载图片
- super.downloadFile(map.get("url").toString(), dir + name);
- tempJsonArray.put(jsonObject);
- }
- array.put(new JSONObject().put(path, tempJsonArray));
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- public static void main(String[] args) throws Exception {
- new GoogleLogoCrawler().doStart();
- }
- }
jsoup下载地址 http://www.jsoup.org
httpclient下载地址 http://hc.apache.org/downloads.cgi
其他jar包见附件
- package jsoup;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.HashMap;
- import java.util.Map;
- import org.apache.commons.io.FileUtils;
- import org.apache.commons.io.IOUtils;
- import org.apache.http.HttpEntity;
- import org.apache.http.HttpResponse;
- import org.apache.http.HttpStatus;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.DefaultHttpClient;
- import org.apache.http.params.HttpProtocolParams;
- import org.apache.http.util.EntityUtils;
- import com.google.api.translate.Language;
- import com.google.api.translate.Translate;
- /**
- * google logo 下载程序
- */
- public abstract class Crawler {
- /**
- * 使用google 翻译api
- *
- * @param en
- * @return
- */
- public String translateEnToCinese(String en) {
- Translate.setHttpReferrer("http://www.xxx.com");
- try {
- return Translate.execute(en, Language.ENGLISH, Language.CHINESE);
- } catch (Exception e) {
- e.printStackTrace();
- }
- return "";
- }
- /**
- * 获取一个Map
- *
- * @return
- */
- public Map<String, Object> getMap() {
- return new HashMap<String, Object>(0);
- }
- /**
- * 下载文件
- *
- * @param url
- * 文件http地址
- * @param dir
- * 目标文件
- * @throws IOException
- */
- public void downloadFile(String url, String dir) throws Exception {
- DefaultHttpClient httpClient = new DefaultHttpClient();
- HttpProtocolParams.setUserAgent(httpClient.getParams(),
- "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");
- HttpGet httpGet = new HttpGet();
- httpGet.setURI(new java.net.URI(url));
- InputStream input = null;
- FileOutputStream output = null;
- try {
- HttpResponse response = httpClient.execute(httpGet);
- HttpEntity entity = response.getEntity();
- input = entity.getContent();
- File file = new File(dir);
- output = FileUtils.openOutputStream(file);
- IOUtils.copy(input, output);
- } catch (Exception e){
- e.printStackTrace();
- } finally {
- IOUtils.closeQuietly(output);
- IOUtils.closeQuietly(input);
- }
- }
- /**
- * 处理GET请求,返回整个页面
- *
- * @param url
- * 访问地址
- * @param params
- * 编码参数
- * @return
- * @throws Exception
- */
- public synchronized String doGet(String url, String... params)
- throws Exception {
- DefaultHttpClient httpClient = new DefaultHttpClient(); // 创建httpClient实例
- HttpProtocolParams.setUserAgent(httpClient.getParams(),
- "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");
- String charset = "UTF-8";
- if (null != params && params.length >= 1) {
- charset = params[0];
- }
- HttpGet httpGet = new HttpGet(); // 创建get方法实例
- String content = "";
- httpGet.setURI(new java.net.URI(url));
- try {
- HttpResponse response = httpClient.execute(httpGet); // 执行请求,得到response对象
- int resStatu = response.getStatusLine().getStatusCode(); // 得到返回的状态码
- if (resStatu == HttpStatus.SC_OK) { // 200正常
- HttpEntity entity = response.getEntity(); // 获得相应的实体
- if (entity != null) {
- // 使用EntityUtils的toString方法,传递默认编码,在EntityUtils中的默认编码是ISO-8859-1
- content = EntityUtils.toString(entity, charset);
- }
- }
- } catch (Exception e) {
- System.out.println("访问【" + url + "】出现异常!");
- e.printStackTrace();
- } finally {
- // 关闭资源
- httpGet.abort();
- httpClient.getConnectionManager().shutdown();
- }
- return content;
- }
- }
- package jsoup;
- import java.io.File;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.List;
- import java.util.Map;
- import org.apache.commons.io.FileUtils;
- import org.apache.commons.lang.StringUtils;
- import org.json.JSONArray;
- import org.json.JSONObject;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- /**
- * google logo 下载程序
- */
- public class GoogleLogoCrawler extends Crawler {
- private static final String URL = "http://www.logocollect.com/google/year.php?key=%y&page=%p";
- private static final String LOGO_URL = "http://www.logocollect.com/google/";
- private static final String[] YEARS = new String[] {
- //"1998", "1999", "2000",
- //"2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008",
- "2009", "2010", "2011", "2012" };
- private static final String INDEX = "http://www.logocollect.com/google/year.php?key=%y";
- private static final String DIR_PATH = "D:\\googlelogos\\";
- public void doStart() {
- JSONArray array = new JSONArray();
- for (String year : YEARS) {
- String ind = INDEX.replaceAll("%y", year);
- int pageCount = getPageCount(ind);
- for (int i = 1; i < pageCount+1; i++) {
- String url = URL.replaceAll("%y", year).replaceAll("%p", i + "");
- String path = year + "_" + i;
- start(url, array, DIR_PATH + path + "\\", path);
- }
- }
- try {
- FileUtils.writeStringToFile(new File(DIR_PATH + "json"), array.toString(), "UTF-8");
- } catch (IOException e) {
- e.printStackTrace();
- }
- System.out.println(array);
- }
- public int getPageCount(String url) {
- int pageCount = 1;
- try {
- org.jsoup.nodes.Document doc = Jsoup.connect(url).get();
- String els = doc.html().toString();
- int start = els.indexOf("总页数") + 4;
- String temp = els.substring(start);
- int end = temp.indexOf(",");
- pageCount = Integer.parseInt(els.substring(start,start+end));
- System.out.println(pageCount);
- } catch (IOException e) {
- e.printStackTrace();
- }
- return pageCount;
- }
- public void start(String url, JSONArray array, String dir, String path) {
- try {
- String content = super.doGet(url);
- Document doc = Jsoup.parse(content);
- Elements dds = doc.select(".img img");
- List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(0);
- for (int i = 0; i < dds.size(); i++) {
- Element img = dds.get(i);
- String src = img.select("img").first().attr("src");
- String title = img.select("img").first().attr("title");
- Map<String, Object> map = super.getMap();
- map.put("url", LOGO_URL + src);
- map.put("title", title);
- list.add(map);
- }
- JSONArray tempJsonArray = new JSONArray();
- for (Map<String, Object> map : list) {
- JSONObject jsonObject = new JSONObject();
- String proxy = StringUtils.substringAfterLast(map.get("url")
- .toString(), ".");
- long date = new Date().getTime();
- String name = date + "." + proxy;
- jsonObject.put("url", map.get("url").toString());
- jsonObject.put("dir", name);
- jsonObject.put("title", map.get("title").toString());
- // 翻译
- // String dateZh = super.translateEnToCinese(map.get("date")
- // .toString());
- // String titleZh = super.translateEnToCinese(map.get("title")
- // .toString());
- // json.put("title_zh_cn", dateZh + " - " + titleZh);
- // 下载图片
- super.downloadFile(map.get("url").toString(), dir + name);
- tempJsonArray.put(jsonObject);
- }
- array.put(new JSONObject().put(path, tempJsonArray));
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- public static void main(String[] args) throws Exception {
- new GoogleLogoCrawler().doStart();
- }
- }
jsoup下载地址 http://www.jsoup.org
httpclient下载地址 http://hc.apache.org/downloads.cgi
其他jar包见附件
- package jsoup;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.HashMap;
- import java.util.Map;
- import org.apache.commons.io.FileUtils;
- import org.apache.commons.io.IOUtils;
- import org.apache.http.HttpEntity;
- import org.apache.http.HttpResponse;
- import org.apache.http.HttpStatus;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.DefaultHttpClient;
- import org.apache.http.params.HttpProtocolParams;
- import org.apache.http.util.EntityUtils;
- import com.google.api.translate.Language;
- import com.google.api.translate.Translate;
- /**
- * google logo 下载程序
- */
- public abstract class Crawler {
- /**
- * 使用google 翻译api
- *
- * @param en
- * @return
- */
- public String translateEnToCinese(String en) {
- Translate.setHttpReferrer("http://www.xxx.com");
- try {
- return Translate.execute(en, Language.ENGLISH, Language.CHINESE);
- } catch (Exception e) {
- e.printStackTrace();
- }
- return "";
- }
- /**
- * 获取一个Map
- *
- * @return
- */
- public Map<String, Object> getMap() {
- return new HashMap<String, Object>(0);
- }
- /**
- * 下载文件
- *
- * @param url
- * 文件http地址
- * @param dir
- * 目标文件
- * @throws IOException
- */
- public void downloadFile(String url, String dir) throws Exception {
- DefaultHttpClient httpClient = new DefaultHttpClient();
- HttpProtocolParams.setUserAgent(httpClient.getParams(),
- "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");
- HttpGet httpGet = new HttpGet();
- httpGet.setURI(new java.net.URI(url));
- InputStream input = null;
- FileOutputStream output = null;
- try {
- HttpResponse response = httpClient.execute(httpGet);
- HttpEntity entity = response.getEntity();
- input = entity.getContent();
- File file = new File(dir);
- output = FileUtils.openOutputStream(file);
- IOUtils.copy(input, output);
- } catch (Exception e){
- e.printStackTrace();
- } finally {
- IOUtils.closeQuietly(output);
- IOUtils.closeQuietly(input);
- }
- }
- /**
- * 处理GET请求,返回整个页面
- *
- * @param url
- * 访问地址
- * @param params
- * 编码参数
- * @return
- * @throws Exception
- */
- public synchronized String doGet(String url, String... params)
- throws Exception {
- DefaultHttpClient httpClient = new DefaultHttpClient(); // 创建httpClient实例
- HttpProtocolParams.setUserAgent(httpClient.getParams(),
- "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");
- String charset = "UTF-8";
- if (null != params && params.length >= 1) {
- charset = params[0];
- }
- HttpGet httpGet = new HttpGet(); // 创建get方法实例
- String content = "";
- httpGet.setURI(new java.net.URI(url));
- try {
- HttpResponse response = httpClient.execute(httpGet); // 执行请求,得到response对象
- int resStatu = response.getStatusLine().getStatusCode(); // 得到返回的状态码
- if (resStatu == HttpStatus.SC_OK) { // 200正常
- HttpEntity entity = response.getEntity(); // 获得相应的实体
- if (entity != null) {
- // 使用EntityUtils的toString方法,传递默认编码,在EntityUtils中的默认编码是ISO-8859-1
- content = EntityUtils.toString(entity, charset);
- }
- }
- } catch (Exception e) {
- System.out.println("访问【" + url + "】出现异常!");
- e.printStackTrace();
- } finally {
- // 关闭资源
- httpGet.abort();
- httpClient.getConnectionManager().shutdown();
- }
- return content;
- }
- }
- package jsoup;
- import java.io.File;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.List;
- import java.util.Map;
- import org.apache.commons.io.FileUtils;
- import org.apache.commons.lang.StringUtils;
- import org.json.JSONArray;
- import org.json.JSONObject;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- /**
- * google logo 下载程序
- */
- public class GoogleLogoCrawler extends Crawler {
- private static final String URL = "http://www.logocollect.com/google/year.php?key=%y&page=%p";
- private static final String LOGO_URL = "http://www.logocollect.com/google/";
- private static final String[] YEARS = new String[] {
- //"1998", "1999", "2000",
- //"2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008",
- "2009", "2010", "2011", "2012" };
- private static final String INDEX = "http://www.logocollect.com/google/year.php?key=%y";
- private static final String DIR_PATH = "D:\\googlelogos\\";
- public void doStart() {
- JSONArray array = new JSONArray();
- for (String year : YEARS) {
- String ind = INDEX.replaceAll("%y", year);
- int pageCount = getPageCount(ind);
- for (int i = 1; i < pageCount+1; i++) {
- String url = URL.replaceAll("%y", year).replaceAll("%p", i + "");
- String path = year + "_" + i;
- start(url, array, DIR_PATH + path + "\\", path);
- }
- }
- try {
- FileUtils.writeStringToFile(new File(DIR_PATH + "json"), array.toString(), "UTF-8");
- } catch (IOException e) {
- e.printStackTrace();
- }
- System.out.println(array);
- }
- public int getPageCount(String url) {
- int pageCount = 1;
- try {
- org.jsoup.nodes.Document doc = Jsoup.connect(url).get();
- String els = doc.html().toString();
- int start = els.indexOf("总页数") + 4;
- String temp = els.substring(start);
- int end = temp.indexOf(",");
- pageCount = Integer.parseInt(els.substring(start,start+end));
- System.out.println(pageCount);
- } catch (IOException e) {
- e.printStackTrace();
- }
- return pageCount;
- }
- public void start(String url, JSONArray array, String dir, String path) {
- try {
- String content = super.doGet(url);
- Document doc = Jsoup.parse(content);
- Elements dds = doc.select(".img img");
- List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(0);
- for (int i = 0; i < dds.size(); i++) {
- Element img = dds.get(i);
- String src = img.select("img").first().attr("src");
- String title = img.select("img").first().attr("title");
- Map<String, Object> map = super.getMap();
- map.put("url", LOGO_URL + src);
- map.put("title", title);
- list.add(map);
- }
- JSONArray tempJsonArray = new JSONArray();
- for (Map<String, Object> map : list) {
- JSONObject jsonObject = new JSONObject();
- String proxy = StringUtils.substringAfterLast(map.get("url")
- .toString(), ".");
- long date = new Date().getTime();
- String name = date + "." + proxy;
- jsonObject.put("url", map.get("url").toString());
- jsonObject.put("dir", name);
- jsonObject.put("title", map.get("title").toString());
- // 翻译
- // String dateZh = super.translateEnToCinese(map.get("date")
- // .toString());
- // String titleZh = super.translateEnToCinese(map.get("title")
- // .toString());
- // json.put("title_zh_cn", dateZh + " - " + titleZh);
- // 下载图片
- super.downloadFile(map.get("url").toString(), dir + name);
- tempJsonArray.put(jsonObject);
- }
- array.put(new JSONObject().put(path, tempJsonArray));
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- public static void main(String[] args) throws Exception {
- new GoogleLogoCrawler().doStart();
- }
- }
jsoup下载地址 http://www.jsoup.org
httpclient下载地址 http://hc.apache.org/downloads.cgi
其他jar包见附件
- package jsoup;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.HashMap;
- import java.util.Map;
- import org.apache.commons.io.FileUtils;
- import org.apache.commons.io.IOUtils;
- import org.apache.http.HttpEntity;
- import org.apache.http.HttpResponse;
- import org.apache.http.HttpStatus;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.DefaultHttpClient;
- import org.apache.http.params.HttpProtocolParams;
- import org.apache.http.util.EntityUtils;
- import com.google.api.translate.Language;
- import com.google.api.translate.Translate;
- /**
- * google logo 下载程序
- */
- public abstract class Crawler {
- /**
- * 使用google 翻译api
- *
- * @param en
- * @return
- */
- public String translateEnToCinese(String en) {
- Translate.setHttpReferrer("http://www.xxx.com");
- try {
- return Translate.execute(en, Language.ENGLISH, Language.CHINESE);
- } catch (Exception e) {
- e.printStackTrace();
- }
- return "";
- }
- /**
- * 获取一个Map
- *
- * @return
- */
- public Map<String, Object> getMap() {
- return new HashMap<String, Object>(0);
- }
- /**
- * 下载文件
- *
- * @param url
- * 文件http地址
- * @param dir
- * 目标文件
- * @throws IOException
- */
- public void downloadFile(String url, String dir) throws Exception {
- DefaultHttpClient httpClient = new DefaultHttpClient();
- HttpProtocolParams.setUserAgent(httpClient.getParams(),
- "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");
- HttpGet httpGet = new HttpGet();
- httpGet.setURI(new java.net.URI(url));
- InputStream input = null;
- FileOutputStream output = null;
- try {
- HttpResponse response = httpClient.execute(httpGet);
- HttpEntity entity = response.getEntity();
- input = entity.getContent();
- File file = new File(dir);
- output = FileUtils.openOutputStream(file);
- IOUtils.copy(input, output);
- } catch (Exception e){
- e.printStackTrace();
- } finally {
- IOUtils.closeQuietly(output);
- IOUtils.closeQuietly(input);
- }
- }
- /**
- * 处理GET请求,返回整个页面
- *
- * @param url
- * 访问地址
- * @param params
- * 编码参数
- * @return
- * @throws Exception
- */
- public synchronized String doGet(String url, String... params)
- throws Exception {
- DefaultHttpClient httpClient = new DefaultHttpClient(); // 创建httpClient实例
- HttpProtocolParams.setUserAgent(httpClient.getParams(),
- "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");
- String charset = "UTF-8";
- if (null != params && params.length >= 1) {
- charset = params[0];
- }
- HttpGet httpGet = new HttpGet(); // 创建get方法实例
- String content = "";
- httpGet.setURI(new java.net.URI(url));
- try {
- HttpResponse response = httpClient.execute(httpGet); // 执行请求,得到response对象
- int resStatu = response.getStatusLine().getStatusCode(); // 得到返回的状态码
- if (resStatu == HttpStatus.SC_OK) { // 200正常
- HttpEntity entity = response.getEntity(); // 获得相应的实体
- if (entity != null) {
- // 使用EntityUtils的toString方法,传递默认编码,在EntityUtils中的默认编码是ISO-8859-1
- content = EntityUtils.toString(entity, charset);
- }
- }
- } catch (Exception e) {
- System.out.println("访问【" + url + "】出现异常!");
- e.printStackTrace();
- } finally {
- // 关闭资源
- httpGet.abort();
- httpClient.getConnectionManager().shutdown();
- }
- return content;
- }
- }
- package jsoup;
- import java.io.File;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.List;
- import java.util.Map;
- import org.apache.commons.io.FileUtils;
- import org.apache.commons.lang.StringUtils;
- import org.json.JSONArray;
- import org.json.JSONObject;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- /**
- * google logo 下载程序
- */
- public class GoogleLogoCrawler extends Crawler {
- private static final String URL = "http://www.logocollect.com/google/year.php?key=%y&page=%p";
- private static final String LOGO_URL = "http://www.logocollect.com/google/";
- private static final String[] YEARS = new String[] {
- //"1998", "1999", "2000",
- //"2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008",
- "2009", "2010", "2011", "2012" };
- private static final String INDEX = "http://www.logocollect.com/google/year.php?key=%y";
- private static final String DIR_PATH = "D:\\googlelogos\\";
- public void doStart() {
- JSONArray array = new JSONArray();
- for (String year : YEARS) {
- String ind = INDEX.replaceAll("%y", year);
- int pageCount = getPageCount(ind);
- for (int i = 1; i < pageCount+1; i++) {
- String url = URL.replaceAll("%y", year).replaceAll("%p", i + "");
- String path = year + "_" + i;
- start(url, array, DIR_PATH + path + "\\", path);
- }
- }
- try {
- FileUtils.writeStringToFile(new File(DIR_PATH + "json"), array.toString(), "UTF-8");
- } catch (IOException e) {
- e.printStackTrace();
- }
- System.out.println(array);
- }
- public int getPageCount(String url) {
- int pageCount = 1;
- try {
- org.jsoup.nodes.Document doc = Jsoup.connect(url).get();
- String els = doc.html().toString();
- int start = els.indexOf("总页数") + 4;
- String temp = els.substring(start);
- int end = temp.indexOf(",");
- pageCount = Integer.parseInt(els.substring(start,start+end));
- System.out.println(pageCount);
- } catch (IOException e) {
- e.printStackTrace();
- }
- return pageCount;
- }
- public void start(String url, JSONArray array, String dir, String path) {
- try {
- String content = super.doGet(url);
- Document doc = Jsoup.parse(content);
- Elements dds = doc.select(".img img");
- List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(0);
- for (int i = 0; i < dds.size(); i++) {
- Element img = dds.get(i);
- String src = img.select("img").first().attr("src");
- String title = img.select("img").first().attr("title");
- Map<String, Object> map = super.getMap();
- map.put("url", LOGO_URL + src);
- map.put("title", title);
- list.add(map);
- }
- JSONArray tempJsonArray = new JSONArray();
- for (Map<String, Object> map : list) {
- JSONObject jsonObject = new JSONObject();
- String proxy = StringUtils.substringAfterLast(map.get("url")
- .toString(), ".");
- long date = new Date().getTime();
- String name = date + "." + proxy;
- jsonObject.put("url", map.get("url").toString());
- jsonObject.put("dir", name);
- jsonObject.put("title", map.get("title").toString());
- // 翻译
- // String dateZh = super.translateEnToCinese(map.get("date")
- // .toString());
- // String titleZh = super.translateEnToCinese(map.get("title")
- // .toString());
- // json.put("title_zh_cn", dateZh + " - " + titleZh);
- // 下载图片
- super.downloadFile(map.get("url").toString(), dir + name);
- tempJsonArray.put(jsonObject);
- }
- array.put(new JSONObject().put(path, tempJsonArray));
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- public static void main(String[] args) throws Exception {
- new GoogleLogoCrawler().doStart();
- }
- }
jsoup下载地址 http://www.jsoup.org
httpclient下载地址 http://hc.apache.org/downloads.cgi
其他jar包见附件
- package jsoup;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.HashMap;
- import java.util.Map;
- import org.apache.commons.io.FileUtils;
- import org.apache.commons.io.IOUtils;
- import org.apache.http.HttpEntity;
- import org.apache.http.HttpResponse;
- import org.apache.http.HttpStatus;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.DefaultHttpClient;
- import org.apache.http.params.HttpProtocolParams;
- import org.apache.http.util.EntityUtils;
- import com.google.api.translate.Language;
- import com.google.api.translate.Translate;
- /**
- * google logo 下载程序
- */
- public abstract class Crawler {
- /**
- * 使用google 翻译api
- *
- * @param en
- * @return
- */
- public String translateEnToCinese(String en) {
- Translate.setHttpReferrer("http://www.xxx.com");
- try {
- return Translate.execute(en, Language.ENGLISH, Language.CHINESE);
- } catch (Exception e) {
- e.printStackTrace();
- }
- return "";
- }
- /**
- * 获取一个Map
- *
- * @return
- */
- public Map<String, Object> getMap() {
- return new HashMap<String, Object>(0);
- }
- /**
- * 下载文件
- *
- * @param url
- * 文件http地址
- * @param dir
- * 目标文件
- * @throws IOException
- */
- public void downloadFile(String url, String dir) throws Exception {
- DefaultHttpClient httpClient = new DefaultHttpClient();
- HttpProtocolParams.setUserAgent(httpClient.getParams(),
- "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");
- HttpGet httpGet = new HttpGet();
- httpGet.setURI(new java.net.URI(url));
- InputStream input = null;
- FileOutputStream output = null;
- try {
- HttpResponse response = httpClient.execute(httpGet);
- HttpEntity entity = response.getEntity();
- input = entity.getContent();
- File file = new File(dir);
- output = FileUtils.openOutputStream(file);
- IOUtils.copy(input, output);
- } catch (Exception e){
- e.printStackTrace();
- } finally {
- IOUtils.closeQuietly(output);
- IOUtils.closeQuietly(input);
- }
- }
- /**
- * 处理GET请求,返回整个页面
- *
- * @param url
- * 访问地址
- * @param params
- * 编码参数
- * @return
- * @throws Exception
- */
- public synchronized String doGet(String url, String... params)
- throws Exception {
- DefaultHttpClient httpClient = new DefaultHttpClient(); // 创建httpClient实例
- HttpProtocolParams.setUserAgent(httpClient.getParams(),
- "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");
- String charset = "UTF-8";
- if (null != params && params.length >= 1) {
- charset = params[0];
- }
- HttpGet httpGet = new HttpGet(); // 创建get方法实例
- String content = "";
- httpGet.setURI(new java.net.URI(url));
- try {
- HttpResponse response = httpClient.execute(httpGet); // 执行请求,得到response对象
- int resStatu = response.getStatusLine().getStatusCode(); // 得到返回的状态码
- if (resStatu == HttpStatus.SC_OK) { // 200正常
- HttpEntity entity = response.getEntity(); // 获得相应的实体
- if (entity != null) {
- // 使用EntityUtils的toString方法,传递默认编码,在EntityUtils中的默认编码是ISO-8859-1
- content = EntityUtils.toString(entity, charset);
- }
- }
- } catch (Exception e) {
- System.out.println("访问【" + url + "】出现异常!");
- e.printStackTrace();
- } finally {
- // 关闭资源
- httpGet.abort();
- httpClient.getConnectionManager().shutdown();
- }
- return content;
- }
- }
- package jsoup;
- import java.io.File;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.List;
- import java.util.Map;
- import org.apache.commons.io.FileUtils;
- import org.apache.commons.lang.StringUtils;
- import org.json.JSONArray;
- import org.json.JSONObject;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- /**
- * google logo 下载程序
- */
- public class GoogleLogoCrawler extends Crawler {
- private static final String URL = "http://www.logocollect.com/google/year.php?key=%y&page=%p";
- private static final String LOGO_URL = "http://www.logocollect.com/google/";
- private static final String[] YEARS = new String[] {
- //"1998", "1999", "2000",
- //"2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008",
- "2009", "2010", "2011", "2012" };
- private static final String INDEX = "http://www.logocollect.com/google/year.php?key=%y";
- private static final String DIR_PATH = "D:\\googlelogos\\";
- public void doStart() {
- JSONArray array = new JSONArray();
- for (String year : YEARS) {
- String ind = INDEX.replaceAll("%y", year);
- int pageCount = getPageCount(ind);
- for (int i = 1; i < pageCount+1; i++) {
- String url = URL.replaceAll("%y", year).replaceAll("%p", i + "");
- String path = year + "_" + i;
- start(url, array, DIR_PATH + path + "\\", path);
- }
- }
- try {
- FileUtils.writeStringToFile(new File(DIR_PATH + "json"), array.toString(), "UTF-8");
- } catch (IOException e) {
- e.printStackTrace();
- }
- System.out.println(array);
- }
- public int getPageCount(String url) {
- int pageCount = 1;
- try {
- org.jsoup.nodes.Document doc = Jsoup.connect(url).get();
- String els = doc.html().toString();
- int start = els.indexOf("总页数") + 4;
- String temp = els.substring(start);
- int end = temp.indexOf(",");
- pageCount = Integer.parseInt(els.substring(start,start+end));
- System.out.println(pageCount);
- } catch (IOException e) {
- e.printStackTrace();
- }
- return pageCount;
- }
- public void start(String url, JSONArray array, String dir, String path) {
- try {
- String content = super.doGet(url);
- Document doc = Jsoup.parse(content);
- Elements dds = doc.select(".img img");
- List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(0);
- for (int i = 0; i < dds.size(); i++) {
- Element img = dds.get(i);
- String src = img.select("img").first().attr("src");
- String title = img.select("img").first().attr("title");
- Map<String, Object> map = super.getMap();
- map.put("url", LOGO_URL + src);
- map.put("title", title);
- list.add(map);
- }
- JSONArray tempJsonArray = new JSONArray();
- for (Map<String, Object> map : list) {
- JSONObject jsonObject = new JSONObject();
- String proxy = StringUtils.substringAfterLast(map.get("url")
- .toString(), ".");
- long date = new Date().getTime();
- String name = date + "." + proxy;
- jsonObject.put("url", map.get("url").toString());
- jsonObject.put("dir", name);
- jsonObject.put("title", map.get("title").toString());
- // 翻译
- // String dateZh = super.translateEnToCinese(map.get("date")
- // .toString());
- // String titleZh = super.translateEnToCinese(map.get("title")
- // .toString());
- // json.put("title_zh_cn", dateZh + " - " + titleZh);
- // 下载图片
- super.downloadFile(map.get("url").toString(), dir + name);
- tempJsonArray.put(jsonObject);
- }
- array.put(new JSONObject().put(path, tempJsonArray));
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- public static void main(String[] args) throws Exception {
- new GoogleLogoCrawler().doStart();
- }
- }
jsoup下载地址 http://www.jsoup.org
httpclient下载地址 http://hc.apache.org/downloads.cgi
其他jar包见附件
- package jsoup;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.HashMap;
- import java.util.Map;
- import org.apache.commons.io.FileUtils;
- import org.apache.commons.io.IOUtils;
- import org.apache.http.HttpEntity;
- import org.apache.http.HttpResponse;
- import org.apache.http.HttpStatus;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.DefaultHttpClient;
- import org.apache.http.params.HttpProtocolParams;
- import org.apache.http.util.EntityUtils;
- import com.google.api.translate.Language;
- import com.google.api.translate.Translate;
- /**
- * google logo 下载程序
- */
- public abstract class Crawler {
- /**
- * 使用google 翻译api
- *
- * @param en
- * @return
- */
- public String translateEnToCinese(String en) {
- Translate.setHttpReferrer("http://www.xxx.com");
- try {
- return Translate.execute(en, Language.ENGLISH, Language.CHINESE);
- } catch (Exception e) {
- e.printStackTrace();
- }
- return "";
- }
- /**
- * 获取一个Map
- *
- * @return
- */
- public Map<String, Object> getMap() {
- return new HashMap<String, Object>(0);
- }
- /**
- * 下载文件
- *
- * @param url
- * 文件http地址
- * @param dir
- * 目标文件
- * @throws IOException
- */
- public void downloadFile(String url, String dir) throws Exception {
- DefaultHttpClient httpClient = new DefaultHttpClient();
- HttpProtocolParams.setUserAgent(httpClient.getParams(),
- "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");
- HttpGet httpGet = new HttpGet();
- httpGet.setURI(new java.net.URI(url));
- InputStream input = null;
- FileOutputStream output = null;
- try {
- HttpResponse response = httpClient.execute(httpGet);
- HttpEntity entity = response.getEntity();
- input = entity.getContent();
- File file = new File(dir);
- output = FileUtils.openOutputStream(file);
- IOUtils.copy(input, output);
- } catch (Exception e){
- e.printStackTrace();
- } finally {
- IOUtils.closeQuietly(output);
- IOUtils.closeQuietly(input);
- }
- }
- /**
- * 处理GET请求,返回整个页面
- *
- * @param url
- * 访问地址
- * @param params
- * 编码参数
- * @return
- * @throws Exception
- */
- public synchronized String doGet(String url, String... params)
- throws Exception {
- DefaultHttpClient httpClient = new DefaultHttpClient(); // 创建httpClient实例
- HttpProtocolParams.setUserAgent(httpClient.getParams(),
- "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9");
- String charset = "UTF-8";
- if (null != params && params.length >= 1) {
- charset = params[0];
- }
- HttpGet httpGet = new HttpGet(); // 创建get方法实例
- String content = "";
- httpGet.setURI(new java.net.URI(url));
- try {
- HttpResponse response = httpClient.execute(httpGet); // 执行请求,得到response对象
- int resStatu = response.getStatusLine().getStatusCode(); // 得到返回的状态码
- if (resStatu == HttpStatus.SC_OK) { // 200正常
- HttpEntity entity = response.getEntity(); // 获得相应的实体
- if (entity != null) {
- // 使用EntityUtils的toString方法,传递默认编码,在EntityUtils中的默认编码是ISO-8859-1
- content = EntityUtils.toString(entity, charset);
- }
- }
- } catch (Exception e) {
- System.out.println("访问【" + url + "】出现异常!");
- e.printStackTrace();
- } finally {
- // 关闭资源
- httpGet.abort();
- httpClient.getConnectionManager().shutdown();
- }
- return content;
- }
- }
- package jsoup;
- import java.io.File;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.List;
- import java.util.Map;
- import org.apache.commons.io.FileUtils;
- import org.apache.commons.lang.StringUtils;
- import org.json.JSONArray;
- import org.json.JSONObject;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- /**
- * google logo 下载程序
- */
- public class GoogleLogoCrawler extends Crawler {
- private static final String URL = "http://www.logocollect.com/google/year.php?key=%y&page=%p";
- private static final String LOGO_URL = "http://www.logocollect.com/google/";
- private static final String[] YEARS = new String[] {
- //"1998", "1999", "2000",
- //"2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008",
- "2009", "2010", "2011", "2012" };
- private static final String INDEX = "http://www.logocollect.com/google/year.php?key=%y";
- private static final String DIR_PATH = "D:\\googlelogos\\";
- public void doStart() {
- JSONArray array = new JSONArray();
- for (String year : YEARS) {
- String ind = INDEX.replaceAll("%y", year);
- int pageCount = getPageCount(ind);
- for (int i = 1; i < pageCount+1; i++) {
- String url = URL.replaceAll("%y", year).replaceAll("%p", i + "");
- String path = year + "_" + i;
- start(url, array, DIR_PATH + path + "\\", path);
- }
- }
- try {
- FileUtils.writeStringToFile(new File(DIR_PATH + "json"), array.toString(), "UTF-8");
- } catch (IOException e) {
- e.printStackTrace();
- }
- System.out.println(array);
- }
- public int getPageCount(String url) {
- int pageCount = 1;
- try {
- org.jsoup.nodes.Document doc = Jsoup.connect(url).get();
- String els = doc.html().toString();
- int start = els.indexOf("总页数") + 4;
- String temp = els.substring(start);
- int end = temp.indexOf(",");
- pageCount = Integer.parseInt(els.substring(start,start+end));
- System.out.println(pageCount);
- } catch (IOException e) {
- e.printStackTrace();
- }
- return pageCount;
- }
- public void start(String url, JSONArray array, String dir, String path) {
- try {
- String content = super.doGet(url);
- Document doc = Jsoup.parse(content);
- Elements dds = doc.select(".img img");
- List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(0);
- for (int i = 0; i < dds.size(); i++) {
- Element img = dds.get(i);
- String src = img.select("img").first().attr("src");
- String title = img.select("img").first().attr("title");
- Map<String, Object> map = super.getMap();
- map.put("url", LOGO_URL + src);
- map.put("title", title);
- list.add(map);
- }
- JSONArray tempJsonArray = new JSONArray();
- for (Map<String, Object> map : list) {
- JSONObject jsonObject = new JSONObject();
- String proxy = StringUtils.substringAfterLast(map.get("url")
- .toString(), ".");
- long date = new Date().getTime();
- String name = date + "." + proxy;
- jsonObject.put("url", map.get("url").toString());
- jsonObject.put("dir", name);
- jsonObject.put("title", map.get("title").toString());
- // 翻译
- // String dateZh = super.translateEnToCinese(map.get("date")
- // .toString());
- // String titleZh = super.translateEnToCinese(map.get("title")
- // .toString());
- // json.put("title_zh_cn", dateZh + " - " + titleZh);
- // 下载图片
- super.downloadFile(map.get("url").toString(), dir + name);
- tempJsonArray.put(jsonObject);
- }
- array.put(new JSONObject().put(path, tempJsonArray));
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- public static void main(String[] args) throws Exception {
- new GoogleLogoCrawler().doStart();
- }
- }