A friend needed to pull some ID photos from Baidu image search for work today, so I wrote him a simple auto-downloader in Java.
I didn't dig into why the results contain so many duplicate images; I just dedupe them with a map, which should be fine for pulling a few thousand pictures.
Straight to the code. It runs on JDK 1.8, and I removed the getters and setters (a sketch of them is at the end).
The lyq_config.properties file looks like this:
#search URL; note that the paging field found in the link must be replaced with the placeholder pageNum
url=https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%E8%AF%81%E4%BB%B6%E7%85%A7&pn=pageNum&gsm=1000000000000000050&ct=&ic=0&lm=-1&width=0&height=0
#number of pages to crawl
pageCount=50
#local save directory
downloadLocalPath=D://localpath/
#whether to name files by timestamp
ifTimetoName=true
#####to extend the extraction regexes, add your own keys; only the defaults Baidu needs are built in (see the example below)
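One note on that last comment: init() in the code below reads two optional keys, regex1 and regex2, which override the built-in extraction patterns. For example, these lines just restate the built-in defaults, so you'd only add them if Baidu's page format changes:

#optional overrides: regex1 matches the thumbURL fragment, regex2 pulls the image URL out of it
regex1=thumbURL.*?",
regex2=http.*?.jpg

Then the main service class: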
package com.lyq.capture.service;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.lyq.capture.param.CaptureParam;
/**
 * Core crawler: loads each result page, extracts the image URLs, and downloads them.
 *
 * @author Administrator
 *
 */
public class CaptureService {
    // matches each "thumbURL":"..." fragment in the page source
    private static String IMGStr_REG = "thumbURL.*?\",";
    // extracts the image URL itself from that fragment
    private static String IMGURL_REG = "http.*?.jpg";
    private static final String configName = "lyq_config.properties";
    private List<String> srcList;         // image URLs found on the current page
    private Map<String, String> checkMap; // seen URLs, for cross-page dedup
    private CaptureParam param;           // values loaded from lyq_config.properties
    private String bodyHtml;              // raw HTML of the current page
    void init() throws Exception {
        checkMap = new HashMap<String, String>();
        param = new CaptureParam();
        // load lyq_config.properties from the classpath
        URL path = Thread.currentThread().getContextClassLoader().getResource(configName);
        Properties prop = new Properties();
        prop.load(path.openStream());
        param.setUrl(prop.getProperty("url"));
        param.setDownloadLocalPath(prop.getProperty("downloadLocalPath"));
        param.setPageCount(Integer.parseInt(prop.getProperty("pageCount")));
        param.setIfTimetoName(Boolean.parseBoolean(prop.getProperty("ifTimetoName")));
        // optional overrides for the two extraction regexes
        if (prop.getProperty("regex1") != null) {
            IMGStr_REG = prop.getProperty("regex1");
        }
        if (prop.getProperty("regex2") != null) {
            IMGURL_REG = prop.getProperty("regex2");
        }
    }
    public void execute() throws Exception {
        init();
        for (int i = 1; i <= param.getPageCount(); i++) {
            srcList = new ArrayList<String>();
            System.out.println("begin execute index = " + i);
            // Baidu's flip pages step the pn parameter by 20 per page
            String url = param.getUrl().replaceAll("pageNum", Integer.toString(i * 20));
            loadBodyHtml(url);
            regexHtml();
            download();
        }
    }
    // download every extracted URL, skipping ones we've already seen
    private void download() {
        for (String url : srcList) {
            // the map doubles as a cross-page dedup set
            if (checkMap.containsKey(url)) {
                System.out.println("skip duplicate: " + url);
                continue;
            }
            checkMap.put(url, null);
            try {
                String suffix = url.substring(url.lastIndexOf("."));
                String filename;
                if (param.isIfTimetoName()) {
                    // millisecond timestamp; two downloads in the same millisecond would collide
                    filename = new Date().getTime() + suffix;
                } else {
                    filename = url.substring(url.lastIndexOf("/") + 1);
                }
                URL uri = new URL(url);
                String cookie = "cookie string to send"; // fill in your own cookie if needed
                URLConnection conn = uri.openConnection();
                conn.setRequestProperty("Cookie", cookie);
                conn.connect();
                // copy the response body to the local file, closing both streams even on failure
                try (InputStream in = conn.getInputStream();
                        FileOutputStream fo = new FileOutputStream(new File(param.getDownloadLocalPath() + filename))) {
                    byte[] buf = new byte[1024];
                    int length;
                    while ((length = in.read(buf, 0, buf.length)) != -1) {
                        fo.write(buf, 0, length);
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
                System.out.println("download failed: " + url);
            }
        }
    }
    // pull the thumbURL fragments out of the page source, then extract the image URL from each
    public void regexHtml() {
        Matcher matcherParent = Pattern.compile(IMGStr_REG).matcher(bodyHtml);
        List<String> list = new ArrayList<String>();
        while (matcherParent.find()) {
            list.add(matcherParent.group());
        }
        for (int i = 0; i < list.size(); i++) {
            Matcher matcherUrl = Pattern.compile(IMGURL_REG).matcher(list.get(i));
            while (matcherUrl.find()) {
                srcList.add(matcherUrl.group());
            }
        }
        System.out.println("URLs found on this page = " + srcList.size());
    }
    // fetch the raw HTML of one search-result page into bodyHtml
    public void loadBodyHtml(String sendUrl) {
        StringBuilder html = new StringBuilder();
        BufferedReader read = null;
        try {
            URL url = new URL(sendUrl);
            URLConnection conn = url.openConnection();
            conn.connect();
            read = new BufferedReader(new InputStreamReader(conn.getInputStream()));
            String line;
            while ((line = read.readLine()) != null) {
                html.append(line);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (read != null) {
                try {
                    read.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        this.bodyHtml = html.toString();
    }
}
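Since the getters and setters were stripped above, here is a minimal sketch of what CaptureParam has to carry for CaptureService to compile; the field names are inferred from the setters used in init(), not taken from the original class.

package com.lyq.capture.param;

/**
 * Config holder: one field per key in lyq_config.properties.
 */
public class CaptureParam {
    private String url;               // search URL template containing pageNum
    private String downloadLocalPath; // local save directory
    private int pageCount;            // number of pages to crawl
    private boolean ifTimetoName;     // name files by timestamp instead of by URL

    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }
    public String getDownloadLocalPath() { return downloadLocalPath; }
    public void setDownloadLocalPath(String downloadLocalPath) { this.downloadLocalPath = downloadLocalPath; }
    public int getPageCount() { return pageCount; }
    public void setPageCount(int pageCount) { this.pageCount = pageCount; }
    public boolean isIfTimetoName() { return ifTimetoName; }
    public void setIfTimetoName(boolean ifTimetoName) { this.ifTimetoName = ifTimetoName; }
}

To run it, put lyq_config.properties on the classpath and call execute(); the runner class name here is my own choice:

package com.lyq.capture;

import com.lyq.capture.service.CaptureService;

public class CaptureMain {
    public static void main(String[] args) throws Exception {
        new CaptureService().execute(); // loads the config, then crawls page by page
    }
}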