昨天突然想尝试抓取网上的图片,于是写了下面这段代码。
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Image {

    /**
     * Entry point: fetches the page at the hard-coded URL, extracts every
     * src="..." attribute value, and saves each URL that ends in a known
     * image extension under E:\Image\gaoyuanyuan.
     *
     * @param args unused
     * @throws SocketException kept in the signature for compatibility with the
     *         original declaration (nothing here throws it directly)
     */
    public static void main(String args[]) throws SocketException {
        String html = GetUrl("http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=%E9%AB%98%E5%9C%86%E5%9C%86");
        List<String> ouput = GetMatcher(html, "src=\"([\\w\\s./:]+?)\"");
        for (String temp : ouput) {
            System.out.println(temp);
        }
        System.out.println("....................");
        for (String aurl : ouput) {
            try {
                downloadImage(aurl, "E:\\Image\\gaoyuanyuan");
            } catch (MalformedURLException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
            System.out.println("————————————————————————单张抓取完毕——————————————————————————");
        }
        System.out.println("————————————————————————全部抓取完毕——————————————————————————");
    }

    /**
     * Downloads a single image URL into {@code dir}, using the last path
     * segment as the file name. URLs whose name does not end in a recognized
     * image extension are silently skipped.
     *
     * @param aurl absolute image URL
     * @param dir  destination directory (created if missing)
     * @throws IOException if the connection or the file write fails
     */
    private static void downloadImage(String aurl, String dir) throws IOException {
        String[] segments = aurl.split("\\/");
        String name = segments[segments.length - 1];
        String lower = name.toLowerCase();
        // FIX: endsWith instead of contains, so e.g. "x.png.html" is rejected.
        if (!(lower.endsWith(".png") || lower.endsWith(".jpeg") || lower.endsWith(".jpg")
                || lower.endsWith(".gif") || lower.endsWith(".bmp"))) {
            return;
        }
        // FIX: the original assumed the target directory existed; create it so
        // FileOutputStream does not throw FileNotFoundException on first run.
        File folder = new File(dir);
        if (!folder.exists()) {
            folder.mkdirs();
        }
        URL url = new URL(aurl);
        URLConnection con = url.openConnection(); // redundant cast removed
        // FIX: the original gated the download on input.available(), but on a
        // network stream available() may legitimately be 0 before any bytes
        // arrive, so valid images were skipped. Just read until EOF.
        // try-with-resources closes both streams even on exceptions
        // (the original leaked them on every error path).
        try (InputStream input = con.getInputStream();
                OutputStream os = new FileOutputStream(new File(folder, name))) {
            byte[] bs = new byte[1024 * 2];
            int len;
            while ((len = input.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        }
    }

    /**
     * Fetches the document at {@code inUrl} and returns its contents as a
     * single string. Line terminators are dropped (readLine strips them and
     * nothing re-adds them), matching the original behavior.
     *
     * @param inUrl URL of the page to fetch
     * @return page contents, or "" if any error occurred (errors are printed)
     */
    public static String GetUrl(String inUrl) {
        StringBuilder sb = new StringBuilder();
        try {
            URL url = new URL(inUrl);
            // FIX: close the reader (try-with-resources) and decode as UTF-8
            // explicitly — the page is requested with ie=utf-8, while the
            // original used the platform default charset.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
                String temp;
                while ((temp = reader.readLine()) != null) {
                    sb.append(temp);
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Collects the first capture group of every match of a regex in a string.
     *
     * @param str text to scan
     * @param url regular expression with at least one capture group
     *            (original parameter name kept for interface compatibility)
     * @return list of group(1) values in match order; empty if no matches
     */
    public static List<String> GetMatcher(String str, String url) {
        List<String> result = new ArrayList<String>();
        Pattern p = Pattern.compile(url);
        Matcher m = p.matcher(str);
        while (m.find()) {
            result.add(m.group(1));
        }
        return result;
    }
}
URL 和本地存储路径都可以自行定义。上面的 URL 是百度图片的页面,但抓不到几张,原因还在研究中(可能是页面上的图片大多由 JavaScript 动态加载,静态 HTML 里的 src 很少,待确认)。这段代码会一次性抓取页面中所有 png、jpeg、jpg、gif、bmp 类型的图片。