1、抓取方式
一般来说,应该用jsoup来抓取网页中的Image,但是,由于百度图片的加载不再是翻页形式的了,而是使用ajax的方式动态加载,所以使用jsoup的话,只能拿到第一页的内容,想要获取后面页码的内容就无能为力了。不过,既然是ajax的方式,我们自然也可以直接请求对应的ajax接口来获取图片内容。
2、验证
可以看到,响应的json中hoverURL字段就是对应的图片地址,那么,我们就可以直接从这个json串中获取图片地址进行下载了。
3、实现
3.1、具体请求
请求示例:
String url = "https://image.baidu.com/search/acjson?tn=resultjson_com&logid=10660754132115598609&ipn=rj&ct=201326592&is=&fp=result&" +
"queryWord="+keyword+"&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&" +
"word="+keyword+"&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&" +
"pn="+page+"&rn=30&gsm=5a&1606053649620=";
说明:queryWord和word就是我们要查询的关键字,pn是30的整数倍,因为一页有30张图片,故pn=n*30
3.2、代码实现
package xyz.xfcloud.test.demo.service;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.springframework.stereotype.Service;
import org.springframework.web.client.RestTemplate;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
@Service
public class MyService {

    private final RestTemplate restTemplate = new RestTemplate();

    /** Root directory for downloads; a sub-directory is created per keyword. */
    private final String path = "D:\\pictures";

    /**
     * Fetches one page (30 images) of Baidu image-search results for the given
     * keyword and downloads each result's {@code hoverURL} into
     * {@code path\keyword\}.
     *
     * @param keyword the search term (fills both the queryWord and word parameters)
     * @param page    the result offset; must be a multiple of 30 (pn = n * 30)
     */
    public void downImage(String keyword, int page) {
        // FIX: the original string contained "©right=" — an HTML-entity
        // mojibake of "&copyright=" ("&copy" was rendered as "©" when the
        // code was published), which corrupted the query string.
        String url = "https://image.baidu.com/search/acjson?tn=resultjson_com&logid=10660754132115598609&ipn=rj&ct=201326592&is=&fp=result&" +
                "queryWord=" + keyword + "&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&hd=&latest=&copyright=&" +
                "word=" + keyword + "&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&expermode=&force=&" +
                "pn=" + page + "&rn=30&gsm=5a&1606053649620=";
        String result = restTemplate.getForObject(url, String.class);

        // Build the target directory {path}\{keyword}\ and create it up front,
        // so download() never has to choose between "create dir" and "download".
        String tempPath = path.endsWith("\\") ? path : path + "\\";
        tempPath = tempPath + keyword + "\\";
        File dir = new File(tempPath);
        if (!dir.exists() && !dir.mkdirs()) {
            sop("cannot create directory: " + tempPath);
            return;
        }

        JSONObject jsonObject = JSONObject.parseObject(result);
        JSONArray array = jsonObject.getJSONArray("data");
        if (array == null) {
            // Defensive: the payload may lack a "data" array (e.g. throttled
            // or error response); dump it for diagnosis instead of NPE-ing.
            sop(result);
            return;
        }
        for (int i = 0; i < array.size(); i++) {
            JSONObject o = array.getJSONObject(i);
            try {
                download(o.getString("hoverURL"), tempPath);
            } catch (Exception e) {
                e.printStackTrace();
            }
            System.out.println(o.getString("middleURL"));
        }
        System.out.println(result);
    }

    /** Shorthand for System.out.println, used throughout this class. */
    private void sop(Object obj) {
        System.out.println(obj);
    }

    /**
     * Downloads the image at {@code url} into directory {@code path}, using the
     * last path segment of the URL as the file name.
     *
     * Fixes two defects of the original implementation:
     * 1) directory creation and the actual download were mutually exclusive
     *    if/else branches, so the first call for a brand-new directory created
     *    the directory but downloaded nothing;
     * 2) streams were not reliably closed (the InputStream leaked, and
     *    fos.close() in finally could NPE) — both are now managed by
     *    try-with-resources. Non-http URLs no longer leave behind empty files.
     *
     * @param url  image URL; ignored (with a message) unless it starts with "http"
     * @param path target directory, expected to end with a path separator
     */
    private void download(String url, String path) {
        if (url == null || !url.startsWith("http")) {
            sop("找不到该网络图片....");
            return;
        }
        // Safety net in case the caller did not pre-create the directory.
        File dirFile = new File(path);
        if (!dirFile.exists() && path.length() > 0 && dirFile.mkdirs()) {
            sop("creat document file \"" + path.substring(0, path.length() - 1) + "\" success...\n");
        }
        String downloadName = url.substring(url.lastIndexOf("/") + 1);
        File file = new File(path + downloadName);
        try (InputStream in = new URL(url).openConnection().getInputStream();
             FileOutputStream fos = new FileOutputStream(file)) {
            byte[] buffer = new byte[1024];
            int num;
            while ((num = in.read(buffer)) != -1) {
                // Write the whole chunk at once instead of byte-by-byte.
                fos.write(buffer, 0, num);
            }
        } catch (FileNotFoundException notFoundE) {
            sop("找不到该网络图片....");
        } catch (IOException ioE) {
            sop("产生IO异常.....");
        }
    }
}
4、源码下载地址