一个简单的爬取百度图片的程序
小白刚开始写代码,参考了很多网上的资料和教程,尤其是解决百度URL加密问题的方法,放在这里和大家分享一下。本帖只用于学习交流。
主要用了简单的正则来获取百度图片的URL路径。有一些URL无法连接,所以直接跳过了;下载后的文件也都强行以.jpg命名。为了让运行过程更直观,加入了一些控制台输出。
代码如下:
package com.jpg.get;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 一个用来下载图片的类
* @author YINGRUO
*
*/
public class Loader {
private static byte[] bs = new byte[1024]; // 用来缓冲数据的字节数组
private static int index = 0; // 缓冲流的下标
private static float num = 0; // 数据统计标记
public static void main(String[] args) {
Loader loader = new Loader("高清壁纸", "C:\\Users\\D N\\Desktop\\1016",10);
loader.startdl();
}
private String search = "";
private String savePath = "";
private int pagenum = 0;
/**
*
* @param search 搜索名
* @param savePath 保存路径
* @param pagenum 下载张数
*/
public Loader(String search, String savePath, int pagenum) {
this.savePath = savePath;
this.search = search;
this.pagenum = pagenum;
}
public String getSearch() {
return search;
}
public void setSearch(String search) {
this.search = search;
}
public String getSavePath() {
return savePath;
}
public void setSavePath(String savePath) {
this.savePath = savePath;
}
public int getPagenum() {
return pagenum;
}
public void setPagenum(int pagenum) {
this.pagenum = pagenum;
}
@SuppressWarnings("deprecation")
private String changeserach() {
String searchname = search.replaceAll("\\s", "");
return URLEncoder.encode(searchname);
}
/**
* 根据保存路径和图片下载地址获取输出流
*
* @param pagepath
* @return
*/
private FileOutputStream getOutPut(String pagepath) {
String pagename = pagepath.split("/")[pagepath.split("/").length - 1];
pagename = pagename.split("\\?")[0];
try {
File file = new File(savePath);
if (!file.exists()) {
file.mkdirs();
}
return new FileOutputStream(new File(savePath + "\\" + pagename
+ ".jpg"));
} catch (FileNotFoundException e) {
e.printStackTrace();
}
return null;
}
/**
* 根据搜索名获取URL地址名
*
* @param searchName
* @param pagenum
* 需为0或30的整数倍
* @return
*/
private String getsearchURLName(String searchName, int pagenum) {
return "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord="
+ searchName
+ "&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=©right=&word="
+ searchName
+ "&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=1&fr=&expermode=&force=&"
+ "pn=" + pagenum + "&rn=30&gsm=1e&1550048512094=";
}
/**
* 根据URL获取xml信息
*
* @param url
* @return
*/
private String getXMLOfStringType(URL url) {
InputStream inputStream = null;
try {
inputStream = url.openStream();
StringBuffer buffer = new StringBuffer();
while ((index = inputStream.read(bs)) != -1) {
buffer.append(new String(bs, 0, index));
}
return new String(buffer.toString().getBytes(), "utf-8");
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (IOException e1) {
e1.printStackTrace();
} finally {
try {
if (inputStream != null)
inputStream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return null;
}
/**
* 解析String文件获得page的URL名
*
* @param xmlstring
* @return URL名的List集合(图片下载地址的集合)
*/
private List<String> getPageURL(String xmlstring) {
List<String> list = new ArrayList<String>();
String[] elements = xmlstring.split("\\s");
for (String string : elements) {
if (string.startsWith("\"objURL\"")) {
list.add(decode(string.split("\"")[3]));
}
}
return list;
}
/**
* 程序运行方法,调用后开始下载jpg图片
*/
public void startdl() {
OutputStream fileOutputStream = null;
InputStream inputStream = null;
List<String> pageList = null;
Iterator<String> iterator = null;
int pagenum_tmp = 0;// URL Request请求的参数 为0或30的整数倍
int pagenum_lod = 0;// 下载成功的数量标记
String string = "";
String pagename = "";
while (pagenum_lod < pagenum) {
try {
pageList = getPageURL(new String(getXMLOfStringType(new URL(
getsearchURLName(changeserach(), pagenum_tmp)))));
} catch (MalformedURLException e1) {
e1.printStackTrace();
}
iterator = pageList.iterator();
// 根据列表进行下载
while (iterator.hasNext()) {
string = iterator.next();
pagename = string.split("/")[string.split("/").length - 1]
.split("\\?")[0];
try {
System.out.println("开始下载:" + pagename + "\n正在连接:\n"
+ string);
inputStream = new URL(string).openStream();
fileOutputStream = getOutPut(string);
System.out.println("连接成功\n正在下载:" + pagename);
int i = 0;
float f = num;
while ((index = inputStream.read(bs)) != -1) {
i++;
if (i == 0)
System.out.print("█");
if (i % 50 == 0)
System.out.print("█");
num += index;
fileOutputStream.write(bs, 0, index);
}
// 下载成功后标记加一
pagenum_lod++;
System.out.println("\n大小:" + ((num - f) / 1024) + "KB\t"
+ "\n下载完成");
System.out
.println("*************************************************************");
} catch (IOException e) {
System.out.println("连接失败");
System.out
.println("*************************************************************");
continue;
} finally {
// 如果下载量达到阀值,跳出下载
if (pagenum_lod >= pagenum) {
break;
}
}
}
pagenum_tmp += 30;
}
// 输入输出流关闭操作
if (fileOutputStream != null) {
try {
fileOutputStream.close();
} catch (IOException e) {
e.printStackTrace();
}
if (inputStream != null) {
try {
inputStream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
System.out.println("共下载" + pagenum + "张图片\t大小:" + num / (1024) + "KB\t"
+ "保存路径:" + savePath);
}
private String decode(String url) {
String myUrl = "";
myUrl = url.replace("ippr", "http");
myUrl = myUrl.replace("_z2C$q", ":");
myUrl = myUrl.replace("AzdH3F", "/");
myUrl = myUrl.replace("_z&e3B", ".");
myUrl = myUrl.toLowerCase();
myUrl = myUrl.substring(4);
char[] arr = myUrl.toCharArray();
myUrl = "";
for (char c : arr) {
switch (c) {
case 'w':
myUrl += "a";
break;
case 'k':
myUrl += "b";
break;
case 'v':
myUrl += "c";
break;
case '1':
myUrl += "d";
break;
case 'j':
myUrl += "e";
break;
case 'u':
myUrl += "f";
break;
case '2':
myUrl += "g";
break;
case 'i':
myUrl += "h";
break;
case 't':
myUrl += "i";
break;
case '3':
myUrl += "j";
break;
case 'h':
myUrl += "k";
break;
case 's':
myUrl += "l";
break;
case '4':
myUrl += "m";
break;
case 'g':
myUrl += "n";
break;
case '5':
myUrl += "o";
break;
case 'r':
myUrl += "p";
break;
case 'q':
myUrl += "q";
break;
case '6':
myUrl += "r";
break;
case 'f':
myUrl += "s";
break;
case 'p':
myUrl += "t";
break;
case '7':
myUrl += "u";
break;
case 'e':
myUrl += "v";
break;
case 'o':
myUrl += "w";
break;
case '8':
myUrl += "1";
break;
case 'd':
myUrl += "2";
break;
case 'n':
myUrl += "3";
break;
case '9':
myUrl += "4";
break;
case 'c':
myUrl += "5";
break;
case 'm':
myUrl += "6";
break;
case '0':
myUrl += "7";
break;
case 'b':
myUrl += "8";
break;
case 'l':
myUrl += "9";
break;
case 'a':
myUrl += "0";
break;
default:
myUrl += c;
break;
}
}
return "http" + myUrl;
}
}