// The code is as follows:
package cn.xyz.commons.utils;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.google.common.io.Files;
import net.coobird.thumbnailator.Thumbnails;
import net.coobird.thumbnailator.builders.BufferedImageBuilder;
import net.coobird.thumbnailator.resizers.Resizers;
/**
 * Crawls Baidu image search results for a list of keywords, downloads the referenced JPEG
 * images, removes downloads that are too small to be real images and optionally writes
 * resized copies of them.
 *
 * <p>All members are static. Failures of individual pages or images are logged and skipped
 * so that one bad resource does not abort the whole run.
 *
 * @author llad
 */
public class JsoupBaiduImg {
    private static final Log LOG = LogFactory.getLog(JsoupBaiduImg.class);

    /** Number of results requested per search page ({@code rn} query parameter). */
    private static final int PAGE_SIZE = 30;

    /** Timeout, in milliseconds, used for the search request and for image downloads. */
    private static final int TIMEOUT_MILLIS = 5000;

    /** Files of at most this many bytes are treated as failed/empty downloads and deleted. */
    private static final long MIN_VALID_FILE_BYTES = 1024;

    /** Matches an {@code objURL} entry of the search response; group 1 is the image URL. */
    private static final Pattern OBJ_URL_PATTERN = Pattern.compile("objURL\":\"(http://.+?\\.jpg)");

    private static final String USER_AGENT = "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)";

    /** Source of the random file-name prefix used by downloads. */
    private static final Random RANDOM = new Random();

    /**
     * Sample entry point: downloads one result page per keyword and writes 100x100 copies.
     *
     * @param args ignored
     * @throws Exception propagated from {@link #getPictures}
     */
    public static void main(String[] args) throws Exception {
        String downloadPath = "D:/www/resources/robit/o";
        String downloadMinPath = "D:/www/resources/robit/t";
        // Several keywords may be given, separated by spaces.
        List<String> list = nameList("卡通头像 唯美头像");
        // max = 1 downloads a single page; each page requests 30 images.
        getPictures(list, 1, downloadPath, 640, 640, downloadMinPath, 1, 100, 100);
    }

    /**
     * Downloads the images found for every keyword, then deletes files of at most 1024 bytes
     * below {@code downloadPath} and, when {@code type == 1}, writes resized copies.
     *
     * @param keywordList     search keywords; one search is run per keyword and page
     * @param max             number of result pages to fetch per keyword (30 results per page)
     * @param downloadPath    directory the images are stored in; created if missing
     * @param bw              value sent as the {@code width} query parameter of the search
     * @param bh              value sent as the {@code height} query parameter of the search
     * @param downloadMinPath directory the resized copies are written to
     * @param type            0 = do not resize, 1 = resize into {@code downloadMinPath}
     * @param w               width of the resized copies
     * @param h               height of the resized copies
     * @throws Exception declared for backward compatibility; a {@link RuntimeException} is thrown
     *                   when {@code downloadPath} does not exist after the download phase
     */
    public static void getPictures(List<String> keywordList, int max, String downloadPath, int bw, int bh,
            String downloadMinPath, int type, int w, int h) throws Exception {
        String targetDir = downloadPath.endsWith("/") ? downloadPath : downloadPath + "/";
        File dir = new File(targetDir);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        for (String keyword : keywordList) {
            int picCount = 1;
            // Pages are numbered from 0, so "max pages" means page indexes 0 .. max - 1.
            for (int page = 0; page < max; page++) {
                LOG.debug("正在下载第" + page + "页面");
                picCount = downloadPage(keyword, page, bw, bh, targetDir, picCount);
            }
        }
        LOG.debug("下载完毕");
        delMultyFile(downloadPath);
        LOG.debug("已经删除所有空图");
        if (type == 1) {
            compressPicForWH(w, h, downloadPath, downloadMinPath);
            LOG.debug("压缩所有图片");
        }
    }

    /**
     * Fetches one result page and downloads every image URL found in it.
     *
     * @return the running image counter after this page (used only for log output)
     */
    private static int downloadPage(String keyword, int page, int bw, int bh, String targetDir, int picCount) {
        try {
            String url = buildSearchUrl(keyword, page, bw, bh);
            LOG.debug(url);
            Document document = Jsoup.connect(url).data("query", "Java")
                    .userAgent(USER_AGENT)
                    .timeout(TIMEOUT_MILLIS).get();
            String xmlSource = StringEscapeUtils.unescapeHtml3(document.toString());
            LOG.debug(xmlSource);
            Matcher m = OBJ_URL_PATTERN.matcher(xmlSource);
            while (m.find()) {
                String imageUrl = m.group(1);
                LOG.debug(keyword + picCount + ":" + imageUrl);
                picCount++;
                // Only report success when the image was actually written.
                if (fetchImage(imageUrl, targetDir)) {
                    LOG.debug(" 下载成功");
                }
            }
        } catch (IOException e) {
            LOG.warn("Failed to fetch result page " + page + " for keyword \"" + keyword + "\"", e);
        }
        return picCount;
    }

    /** Builds the search URL for the given keyword and zero-based page index. */
    private static String buildSearchUrl(String keyword, int page, int bw, int bh) {
        int offset = page * PAGE_SIZE;
        return "http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word="
                + keyword + "&cg=star&pn=" + offset + "&rn=" + PAGE_SIZE + "&itg=0&z=0&fr=&width=" + bw
                + "&height=" + bh + "&lm=-1&ic=0&s=0&st=-1&gsm=" + Integer.toHexString(offset);
    }

    /**
     * Recursively deletes every regular file of at most 1024 bytes below {@code path}.
     *
     * @param path directory to clean; a regular file is accepted and left untouched
     * @return {@code new File(path).length()}; note that {@link File#length()} is unspecified
     *         for directories, so the value is only meaningful when {@code path} is a file
     * @throws RuntimeException if {@code path} does not exist
     */
    public static long delMultyFile(String path) {
        File file = new File(path);
        if (!file.exists()) {
            throw new RuntimeException("File \"" + path + "\" NotFound when excute the method of delMultyFile()....");
        }
        // listFiles() returns null when path is not a directory or cannot be read.
        File[] fileList = file.listFiles();
        if (fileList != null) {
            for (File f : fileList) {
                if (f.isDirectory()) {
                    delMultyFile(f.getAbsolutePath());
                } else if (f.length() <= MIN_VALID_FILE_BYTES) {
                    boolean deleted = f.delete();
                    LOG.debug(deleted + "---" + f.getName());
                }
            }
        }
        return file.length();
    }

    /**
     * Splits a keyword string into individual keywords.
     *
     * <p>Exactly one separator is used, chosen in this order of precedence: {@code ","},
     * {@code "、"}, {@code " "}. Tokens are trimmed and blank tokens are dropped.
     *
     * @param nameList the raw keyword string; may be {@code null}
     * @return a new mutable list of keywords, empty when there are none
     */
    public static List<String> nameList(String nameList) {
        List<String> arr = new ArrayList<>();
        if (nameList == null) {
            return arr;
        }
        String[] tokens;
        if (nameList.contains(",")) {
            tokens = nameList.split(",");
        } else if (nameList.contains("、")) {
            tokens = nameList.split("、");
        } else if (nameList.contains(" ")) {
            tokens = nameList.split(" ");
        } else {
            tokens = new String[] {nameList};
        }
        for (String token : tokens) {
            String keyword = token.trim();
            // A blank keyword would trigger a pointless search with an empty query.
            if (!keyword.isEmpty()) {
                arr.add(keyword);
            }
        }
        return arr;
    }

    /**
     * Downloads the image at {@code url} into the directory {@code path}.
     *
     * <p>The file is named {@code "rt" + random(0..999) + "_" + <last URL path segment>}.
     * Failures are logged and never thrown.
     *
     * @param url  image address; only addresses starting with {@code "http"} are fetched
     * @param path target directory (created if missing); an empty string means the
     *             current working directory
     */
    public static void download(String url, String path) {
        fetchImage(url, path);
    }

    /**
     * Performs the download described by {@link #download(String, String)}.
     *
     * @return {@code true} if the image was written completely, {@code false} otherwise
     */
    private static boolean fetchImage(String url, String path) {
        if (url == null || path == null || !url.startsWith("http")) {
            LOG.debug("找不到该网络图片....");
            return false;
        }
        File dirFile = new File(path);
        if (path.length() > 0 && !dirFile.exists()) {
            if (!dirFile.mkdirs()) {
                LOG.warn("Could not create download directory \"" + path + "\"");
                return false;
            }
            LOG.debug("creat document file \"" + path + "\" success...\n");
        }
        // The random prefix reduces (but does not rule out) clashes between equal file names.
        String downloadName = "rt" + RANDOM.nextInt(1000) + "_" + url.substring(url.lastIndexOf("/") + 1);
        File target = path.length() > 0 ? new File(dirFile, downloadName) : new File(downloadName);

        HttpURLConnection httpCon = null;
        try {
            httpCon = (HttpURLConnection) new URL(url).openConnection();
            httpCon.setConnectTimeout(TIMEOUT_MILLIS);
            httpCon.setReadTimeout(TIMEOUT_MILLIS);
            // Open the remote stream first so that a failed request leaves no empty file behind.
            try (InputStream in = httpCon.getInputStream();
                    FileOutputStream fos = new FileOutputStream(target)) {
                byte[] buffer = new byte[8192];
                int num;
                while ((num = in.read(buffer)) != -1) {
                    fos.write(buffer, 0, num);
                }
            }
            return true;
        } catch (FileNotFoundException notFoundE) {
            LOG.debug("找不到该网络图片....", notFoundE);
        } catch (IOException ioE) {
            LOG.debug("产生IO异常.....", ioE);
            // Remove a partially written file; it is not a usable image.
            if (target.exists() && !target.delete()) {
                LOG.warn("Could not delete incomplete download \"" + target + "\"");
            }
        } finally {
            if (httpCon != null) {
                httpCon.disconnect();
            }
        }
        return false;
    }

    /**
     * Writes a resized JPEG copy of every image below {@code src} into {@code tsrc}.
     *
     * <p>Sub-directories of {@code src} are processed recursively, but all copies go directly
     * into {@code tsrc} under the source file name, so equal names overwrite each other.
     * Each copy is drawn into a {@code w} x {@code h} target image. Files that cannot be
     * decoded are skipped; errors are logged and never thrown.
     *
     * @param w    width of the resized copies
     * @param h    height of the resized copies
     * @param src  directory containing the original images
     * @param tsrc directory the resized copies are written to; created if missing
     */
    public static void compressPicForWH(int w, int h, String src, String tsrc) {
        if (src == null || tsrc == null) {
            LOG.warn("【图片压缩】msg=图片压缩失败!src=" + src + ", tsrc=" + tsrc);
            return;
        }
        File file = new File(src);
        if (!file.exists()) {
            LOG.warn("【图片压缩】msg=图片压缩失败!File \"" + src
                    + "\" NotFound when excute the method of compressPicForWH()....");
            return;
        }
        File ft = new File(tsrc);
        if (!ft.exists()) {
            ft.mkdirs();
        }
        // listFiles() returns null when src is not a directory or cannot be read.
        File[] fileList = file.listFiles();
        if (fileList == null) {
            LOG.warn("【图片压缩】msg=图片压缩失败!Cannot list \"" + src + "\"");
            return;
        }
        for (File f : fileList) {
            if (f.isDirectory()) {
                compressPicForWH(w, h, f.getAbsolutePath(), tsrc);
                continue;
            }
            // Handle each file on its own so one broken image does not stop the others.
            try {
                BufferedImage srcImg = ImageIO.read(f);
                if (srcImg == null) {
                    // ImageIO.read returns null when no registered reader can decode the file.
                    LOG.debug("【图片压缩】skip non-image file " + f.getName());
                    continue;
                }
                BufferedImage tarImg = new BufferedImageBuilder(w, h, BufferedImage.TYPE_3BYTE_BGR).build();
                Resizers.BILINEAR.resize(srcImg, tarImg);
                if (!ImageIO.write(tarImg, "jpg", new File(tsrc + "/" + f.getName()))) {
                    LOG.warn("【图片压缩】msg=图片压缩失败!No JPEG writer available for " + f.getName());
                }
            } catch (IOException | RuntimeException e) {
                LOG.warn("【图片压缩】msg=图片压缩失败!" + f.getName(), e);
            }
        }
        LOG.debug(fileList.length);
    }

    /**
     * Repeatedly scales an encoded image down until it is no larger than the requested size.
     *
     * <p>The same factor (see {@link #getAccuracy(long)}) is used for both the scale and the
     * output quality of every pass. The loop stops early when a pass no longer reduces the
     * size, so the result may still exceed {@code desFileSize}.
     *
     * @param imageBytes  encoded source image
     * @param desFileSize desired maximum size in KB
     * @param imageId     identifier used only in log output
     * @return the compressed bytes; {@code imageBytes} itself when it is {@code null}, empty or
     *         already smaller than the target; the last successful result when a pass fails
     */
    public static byte[] compressPicForScale(byte[] imageBytes, long desFileSize, String imageId) {
        if (imageBytes == null || imageBytes.length <= 0 || imageBytes.length < desFileSize * 1024) {
            return imageBytes;
        }
        long srcSize = imageBytes.length;
        double accuracy = getAccuracy(srcSize / 1024);
        try {
            while (imageBytes.length > desFileSize * 1024) {
                ByteArrayInputStream inputStream = new ByteArrayInputStream(imageBytes);
                ByteArrayOutputStream outputStream = new ByteArrayOutputStream(imageBytes.length);
                Thumbnails.of(inputStream).scale(accuracy).outputQuality(accuracy).toOutputStream(outputStream);
                byte[] compressed = outputStream.toByteArray();
                if (compressed.length == 0 || compressed.length >= imageBytes.length) {
                    // No further progress is possible; stop instead of looping forever.
                    break;
                }
                imageBytes = compressed;
            }
            LOG.debug("【图片压缩】imageId=" + imageId + " | 图片原大小=" + srcSize / 1024
                    + "kb | 压缩后大小=" + imageBytes.length / 1024 + "kb");
        } catch (IOException | RuntimeException e) {
            LOG.warn("【图片压缩】msg=图片压缩失败!", e);
        }
        return imageBytes;
    }

    /**
     * Picks the scale/quality factor for an image of the given size (empirical values).
     *
     * @param size source image size in KB
     * @return the factor used for both scaling and output quality
     */
    private static double getAccuracy(long size) {
        if (size < 900) {
            return 0.85;
        }
        if (size < 2047) {
            return 0.6;
        }
        if (size < 3275) {
            return 0.44;
        }
        return 0.4;
    }
}