A friend needed to pull some ID photos from Baidu image search for work today, so I wrote him a simple auto-downloader in Java.
I didn't dig into why the results contain so many duplicate images; I just dedupe them with a map, which should be fine for pulling a few thousand pictures.
Straight to the code. It runs on JDK 1.8, and I removed the getters and setters (a sketch of them is at the end).
The lyq_config.properties file looks like this:
#search URL; note that the paging field found in the link must be replaced with the placeholder pageNum
url=https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%E8%AF%81%E4%BB%B6%E7%85%A7&pn=pageNum&gsm=1000000000000000050&ct=&ic=0&lm=-1&width=0&height=0
#number of pages to crawl
pageCount=50
#local save directory
downloadLocalPath=D://localpath/
#whether to name files by timestamp
ifTimetoName=true
#####to extend the extraction regexes, add your own keys; only the defaults Baidu needs are built in (see the example below)
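One note on that last comment: init() in the code below reads two optional keys, regex1 and regex2, which override the built-in extraction patterns. For example, these lines just restate the built-in defaults, so you'd only add them if Baidu's page format changes:

#optional overrides: regex1 matches the thumbURL fragment, regex2 pulls the image URL out of it
regex1=thumbURL.*?",
regex2=http.*?.jpg

Then the main service class: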
package com.lyq.capture.service;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.lyq.capture.param.CaptureParam;
/**
 * Core crawler: loads each result page, extracts the image URLs, and downloads them.
 *
 * @author Administrator
 *
 */
public class CaptureService {
    // matches each "thumbURL":"..." fragment in the page source
    private static String IMGStr_REG = "thumbURL.*?\",";
    // extracts the image URL itself from that fragment
    private static String IMGURL_REG = "http.*?.jpg";
    private static final String configName = "lyq_config.properties";
    private List<String> srcList;         // image URLs found on the current page
    private Map<String, String> checkMap; // seen URLs, for cross-page dedup
    private CaptureParam param;           // values loaded from lyq_config.properties
    private String bodyHtml;              // raw HTML of the current page
    void init() throws Exception {
        checkMap = new HashMap<String, String>();
        param = new CaptureParam();
        // load lyq_config.properties from the classpath
        URL path = Thread.currentThread().getContextClassLoader().getResource(configName);
        Properties prop = new Properties();
        prop.load(path.openStream());
        param.setUrl(prop.getProperty("url"));
        param.setDownloadLocalPath(prop.getProperty("downloadLocalPath"));
        param.setPageCount(Integer.parseInt(prop.getProperty("pageCount")));
        param.setIfTimetoName(Boolean.parseBoolean(prop.getProperty("ifTimetoName")));
        // optional overrides for the two extraction regexes
        if (prop.getProperty("regex1") != null) {
            IMGStr_REG = prop.getProperty("regex1");
        }
        if (prop.getProperty("regex2") != null) {
            IMGURL_REG = prop.getProperty("regex2");
        }
    }
    public void execute() throws Exception {
        init();
        for (int i = 1; i <= param.getPageCount(); i++) {
            srcList = new ArrayList<String>();
            System.out.println("begin execute index = " + i);
            // Baidu's flip pages step the pn parameter by 20 per page
            String url = param.getUrl().replaceAll("pageNum", Integer.toString(i * 20));
            loadBodyHtml(url);
            regexHtml();
            download();
        }
    }
    // download every extracted URL, skipping ones we've already seen
    private void download() {
        for (String url : srcList) {
            // the map doubles as a cross-page dedup set
            if (checkMap.containsKey(url)) {
                System.out.println("skip duplicate: " + url);
                continue;
            }
            checkMap.put(url, null);
            try {
                String suffix = url.substring(url.lastIndexOf("."));
                String filename;
                if (param.isIfTimetoName()) {
                    // millisecond timestamp; two downloads in the same millisecond would collide
                    filename = new Date().getTime() + suffix;
                } else {
                    filename = url.substring(url.lastIndexOf("/") + 1);
                }
                URL uri = new URL(url);
                String cookie = "cookie string to send"; // fill in your own cookie if needed
                URLConnection conn = uri.openConnection();
                conn.setRequestProperty("Cookie", cookie);
                conn.connect();
                // copy the response body to the local file, closing both streams even on failure
                try (InputStream in = conn.getInputStream();
                        FileOutputStream fo = new FileOutputStream(new File(param.getDownloadLocalPath() + filename))) {
                    byte[] buf = new byte[1024];
                    int length;
                    while ((length = in.read(buf, 0, buf.length)) != -1) {
                        fo.write(buf, 0, length);
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
                System.out.println("download failed: " + url);
            }
        }
    }
    // pull the thumbURL fragments out of the page source, then extract the image URL from each
    public void regexHtml() {
        Matcher matcherParent = Pattern.compile(IMGStr_REG).matcher(bodyHtml);
        List<String> list = new ArrayList<String>();
        while (matcherParent.find()) {
            list.add(matcherParent.group());
        }
        for (int i = 0; i < list.size(); i++) {
            Matcher matcherUrl = Pattern.compile(IMGURL_REG).matcher(list.get(i));
            while (matcherUrl.find()) {
                srcList.add(matcherUrl.group());
            }
        }
        System.out.println("URLs found on this page = " + srcList.size());
    }
    // fetch the raw HTML of one search-result page into bodyHtml
    public void loadBodyHtml(String sendUrl) {
        StringBuilder html = new StringBuilder();
        BufferedReader read = null;
        try {
            URL url = new URL(sendUrl);
            URLConnection conn = url.openConnection();
            conn.connect();
            read = new BufferedReader(new InputStreamReader(conn.getInputStream()));
            String line;
            while ((line = read.readLine()) != null) {
                html.append(line);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (read != null) {
                try {
                    read.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        this.bodyHtml = html.toString();
    }
}
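Since the getters and setters were stripped above, here is a minimal sketch of what CaptureParam has to carry for CaptureService to compile; the field names are inferred from the setters used in init(), not taken from the original class.

package com.lyq.capture.param;

/**
 * Config holder: one field per key in lyq_config.properties.
 */
public class CaptureParam {
    private String url;               // search URL template containing pageNum
    private String downloadLocalPath; // local save directory
    private int pageCount;            // number of pages to crawl
    private boolean ifTimetoName;     // name files by timestamp instead of by URL

    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }
    public String getDownloadLocalPath() { return downloadLocalPath; }
    public void setDownloadLocalPath(String downloadLocalPath) { this.downloadLocalPath = downloadLocalPath; }
    public int getPageCount() { return pageCount; }
    public void setPageCount(int pageCount) { this.pageCount = pageCount; }
    public boolean isIfTimetoName() { return ifTimetoName; }
    public void setIfTimetoName(boolean ifTimetoName) { this.ifTimetoName = ifTimetoName; }
}

To run it, put lyq_config.properties on the classpath and call execute(); the runner class name here is my own choice:

package com.lyq.capture;

import com.lyq.capture.service.CaptureService;

public class CaptureMain {
    public static void main(String[] args) throws Exception {
        new CaptureService().execute(); // loads the config, then crawls page by page
    }
}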