java简单web爬虫(网页图片)
效果,执行main()方法后图片就下载道C盘的res文件夹中。没有的话创建一个文件夹
代码里的常量根据自己的需求修改,代码附到下面。
package com.sinitek.sirm.common.utils;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* java简单web爬虫(网页图片)
*/
public class Main {
// 地址
private static final String URL = "http://www.xxx";
// 获取img标签正则
private static final String IMGURL_REG = "]*?>";
// 获取src路径的正则
private static final String IMGSRC_REG = "src\\s*=\\s*\"?(.*?)(\"|>|\\s+)";
//图片原始路径(如果src里的路径正确则不用)
private static final String IMG_LUJING = "http://xxx/";
//下载路径
private static final String LUJING = "C:/res/";
public static void main(String[] args) {
try {
Main cm=new Main();
//获得html文本内容
String HTML = cm.getHtml(URL);
//获取图片标签
List imgUrl = cm.getImageUrl(HTML);
//获取图片src地址
List imgSrc = cm.getImageSrc(imgUrl);
//下载图片
cm.Download(imgSrc);
}catch (Exception e){
System.out.println("发生错误");
}
}
//获取HTML内容
private String getHtml(String url)throws Exception{
URL url1=new URL(url);
URLConnection connection=url1.openConnection();
InputStream in=connection.getInputStream();
InputStreamReader isr=new InputStreamReader(in);
BufferedReader br=new BufferedReader(isr);
String line;
StringBuffer sb=new StringBuffer();
while((line=br.readLine())!=null){
sb.append(line,0,line.length());
sb.append('\n');
}
br.close();
isr.close();
in.close();
return sb.toString();
}
//获取ImageUrl地址
private List getImageUrl(String html){
Matcher matcher=Pattern.compile(IMGURL_REG).matcher(html);
Listlistimgurl=new ArrayList();
while (matcher.find()){
listimgurl.add(matcher.group());
}
return listimgurl;
}
//获取ImageSrc地址
private List getImageSrc(List listimageurl){
List listImageSrc=new ArrayList();
for (String image:listimageurl){
// 匹配中的src数据
Matcher m = Pattern.compile(IMGSRC_REG).matcher(image);
while (m.find()) {
String a = m.group(1);//获取图片路径
a = IMG_LUJING+a;//数据拼接
listImageSrc.add(a);
}
}
return listImageSrc;
}
//下载图片
private void Download(List listImgSrc) {
try {
//开始时间
Date begindate = new Date();
for (String url : listImgSrc) {
//开始时间
Date begindate2 = new Date();
String imageName = url.substring(url.lastIndexOf("/") + 1, url.length());
URL uri = new URL(url);
InputStream in = uri.openStream();
FileOutputStream fo = new FileOutputStream(new File(LUJING+imageName));//路径
byte[] buf = new byte[1024];
int length = 0;
System.out.println("开始下载:" + url);
while ((length = in.read(buf, 0, buf.length)) != -1) {
fo.write(buf, 0, length);
}
in.close();
fo.close();
System.out.println(imageName + "下载完成");
//结束时间
Date overdate2 = new Date();
double time = overdate2.getTime() - begindate2.getTime();
System.out.println("耗时:" + time / 1000 + "s");
}
Date overdate = new Date();
double time = overdate.getTime() - begindate.getTime();
System.out.println("总耗时:" + time / 1000 + "s");
} catch (Exception e) {
System.out.println("下载失败");
}
}
}