1.1 环境及技术
- java8
- jsoup
- 文件流
1.2 分析访问路径
总体上说技术难度不高, 最耗时的部分是反复试验, 弄清图片请求链接中各个参数的作用。
以下面的链接为例:
https://cn.bing.com/images/async?q=%E5%B9%BB%E6%83%B3%E4%B9%A1&first=277&count=35&relp=35&scenario=ImageBasicHover&datsrc=N_I&layout=RowBased&mmasync=1&dgState=x*0_y*0_h*0_c*6_i*246_r*46&IG=AEDE9443E91045D58E57983441FB36E3&SFX=8&iid=images.5754
这里面关键的参数是q,first.
q是搜索的关键词;
first是指从第几张图片开始加载;
mmasync=1 这个参数及其取值是固定的, 必须带上, 否则会有图片无法加载。
在程序中最关键的就是这条链接了。
1.3 程序
package com.nikolazhang.spider;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URLEncoder;
import java.util.Date;
import java.util.Iterator;
import java.util.Scanner;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.nikolazhang.util.HttpRequestUtil;
/**
 * Command-line tool that downloads images from Bing image search.
 *
 * <p>Reads one line of the form {@code [directory] [keyword] [count]} from
 * standard input, requests result pages from Bing's async image endpoint and
 * saves the images referenced by the returned {@code <img>} elements into the
 * given directory.
 */
public class SpiderPicFromBing {

    /** Attributes of an {@code <img>} element that are checked for the image URL, in this order. */
    private final static String[] strs = {"src", "data-src"};

    /**
     * Prompts for the download parameters and starts the download.
     *
     * @param args unused; the parameters are read from standard input
     * @throws IOException kept for signature compatibility
     */
    public static void main(String[] args) throws IOException {
        // The scanner wraps System.in; closing it here is safe because the program ends afterwards.
        try (Scanner scanner = new Scanner(System.in)) {
            System.out.println("=================================================");
            System.out.println("= catch pictures from Bing =");
            System.out.println("= 【2019年1月20日 下午1:08:47 NikolaZhang】 =");
            System.out.println("=================================================");
            System.out.println("请输入参数: [文件路径] [关键词] [下载数量]后回车");
            System.out.println("参数使用空格分割, 如: F:\\火影\\ 火影忍者 200");
            if (!scanner.hasNextLine()) {
                System.out.println("参数错误: 需要 [文件路径] [关键词] [下载数量] 三个参数");
                return;
            }
            // Tolerate leading/trailing blanks and repeated separators between parameters.
            String[] params = scanner.nextLine().trim().split("\\s+");
            if (params.length < 3) {
                System.out.println("参数错误: 需要 [文件路径] [关键词] [下载数量] 三个参数");
                return;
            }
            System.out.println("开始下载----------");
            System.out.println(params[0]);
            System.out.println(params[1]);
            System.out.println(params[2]);
            visitHtml(params);
        }
    }

    /**
     * Requests result pages one after another until the requested number of
     * images has been saved, a page fails, or a page yields no image.
     *
     * @param args {@code args[0]} target directory, {@code args[1]} search keyword,
     *             {@code args[2]} number of images to download
     */
    private static void visitHtml(String[] args) {
        int count;
        try {
            count = Integer.parseInt(args[2]);
        } catch (NumberFormatException e) {
            System.out.println("下载数量必须是整数: " + args[2]);
            return;
        }
        int start = 1;
        int downloaded = 0;
        while (downloaded < count) {
            int res = downloadPic(args, start, count - downloaded);
            if (res == -1) {
                // The message announces an exit, so actually stop instead of retrying.
                System.out.println("***********下载出错!程序退出!");
                return;
            }
            if (res == 0) {
                // Nothing was saved from this page; stop rather than loop without progress.
                System.out.println("没有获取到更多图片, 停止下载");
                return;
            }
            downloaded += res;
            // Same offset step as before: images saved from this page plus 2.
            start += res + 2;
        }
    }

    /**
     * Downloads the images of one result page.
     *
     * @param args  {@code args[0]} target directory, {@code args[1]} search keyword
     * @param start value of the {@code first} request parameter (index of the first image to load)
     * @param limit maximum number of images to save from this page
     * @return number of images saved, or -1 if the page could not be processed
     */
    private static int downloadPic(String[] args, int start, int limit) {
        String filepath = args[0];
        String params = args[1];
        int saved = 0;
        try {
            // Encode the keyword so characters such as '&', '#' or '+' cannot corrupt the query string.
            String url = "https://cn.bing.com/images/async?q=" + URLEncoder.encode(params, "UTF-8")
                    + "&first=" + start + "&mmasync=1";
            File dir = new File(filepath);
            if (!dir.isDirectory() && !dir.mkdirs()) {
                System.out.println("无法创建目录: " + filepath);
                return -1;
            }
            Connection conn = Jsoup.connect(url);
            Document doc = conn.get();
            Elements imgTags = doc.getElementsByTag("img");
            for (Element img : imgTags) {
                if (saved >= limit) {
                    break;
                }
                String attr = getImgUrl(img);
                if (attr.isEmpty() || !attr.startsWith("https")) {
                    continue;
                }
                System.out.println("获取图片: " + attr);
                InputStream requestIO = HttpRequestUtil.httpRequestIO(attr);
                // Timestamp plus a strictly increasing index keeps names unique even within one millisecond.
                String name = System.currentTimeMillis() + "_" + (start + saved) + ".png";
                // Resolve against the directory so a path without a trailing separator still works.
                File target = new File(dir, name);
                if (saveImageToDisk(requestIO, target.getPath())) {
                    saved++;
                }
            }
            System.out.println("==== INFORMATION =========================");
            System.out.println("下载路径: " + url);
            System.out.println("存储路径: " + filepath);
            System.out.println("获取资源: " + params);
            System.out.println("获取图片数量: " + saved);
            System.out.println("===== END ========================");
        } catch (IOException e) {
            e.printStackTrace();
            return -1;
        }
        return saved;
    }

    /**
     * Extracts the image URL from an {@code <img>} element.
     *
     * <p>The first attribute from {@link #strs} whose value contains {@code '&'}
     * is used, truncated just before the first {@code '&'}.
     *
     * @param img the element to inspect
     * @return the truncated URL, or an empty string if no attribute qualifies
     */
    private static String getImgUrl(Element img) {
        for (String str : strs) {
            String attr = img.attr(str);
            if (attr == null) {
                continue;
            }
            int amp = attr.indexOf('&');
            if (amp != -1) {
                return attr.substring(0, amp);
            }
        }
        return "";
    }

    /**
     * Copies the stream into the given file and closes the stream.
     *
     * @param inputStream image data; may be {@code null}, in which case nothing is written
     * @param filepath    path of the file to create
     * @return {@code true} if the whole stream was written, {@code false} otherwise
     */
    private static boolean saveImageToDisk(InputStream inputStream, String filepath) {
        if (inputStream == null) {
            System.out.println("图片下载失败, 已跳过: " + filepath);
            return false;
        }
        byte[] data = new byte[1024];
        // try-with-resources closes both streams, including when opening the output file fails.
        try (InputStream in = inputStream;
             FileOutputStream out = new FileOutputStream(filepath)) {
            int len;
            while ((len = in.read(data)) != -1) {
                out.write(data, 0, len);
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }
}
2 打包
为了方便以后使用可以将程序打成可执行jar包
使用方法如下:
end
程序获取:
https://github.com/NikolaZhang/PickPic
先来500张日常。啊~ 爽啊~