利用java抓取网页上的所有图片:
用两个正则表达式:
1、匹配html中img标签的正则:<img.*src=(.*?)[^>]*?>
2、匹配img标签中的src中http路径的正则:http:\"?(.*?)(\"|>|\\s+)
实现:
package org.swinglife.main;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Command-line tool that downloads the images referenced by a web page.
 *
 * <p>The work is done in four steps: fetch the page as text, extract every
 * {@code <img>} tag, extract the absolute http/https URLs found inside those
 * tags, and save each URL's content to a file in the current working directory.
 *
 * <p>Limitations: only absolute {@code http://} and {@code https://} URLs are
 * recognised (relative and protocol-relative paths are ignored), and the page
 * is always decoded as UTF-8 regardless of the charset the server declares.
 */
public class CatchImage {

    /** Address of the page whose images are downloaded. */
    private static final String PAGE_URL = "http://www.csdn.net";

    /** Character encoding used to decode the downloaded page. */
    private static final String ENCODING = "UTF-8";

    /** Regex matching one complete {@code <img ...>} tag (never spans past the tag's closing '>'). */
    private static final String IMGURL_REG = "<img\\b[^>]*>";

    /**
     * Regex matching an absolute http/https URL inside an img tag. The URL ends at
     * the first quote, whitespace or '>' so no terminator has to be trimmed afterwards.
     */
    private static final String IMGSRC_REG = "https?://[^\"'\\s>]+";

    // Compiled once instead of on every call / every tag.
    private static final Pattern IMG_TAG_PATTERN =
            Pattern.compile(IMGURL_REG, Pattern.CASE_INSENSITIVE);
    private static final Pattern IMG_SRC_PATTERN =
            Pattern.compile(IMGSRC_REG, Pattern.CASE_INSENSITIVE);

    /** Size in bytes of the buffer used when copying streams. */
    private static final int BUFFER_SIZE = 1024;

    /**
     * Fetches {@link #PAGE_URL} and downloads every image it references.
     *
     * @param args ignored
     * @throws Exception if the page itself cannot be fetched; failures of
     *                   individual images are reported and skipped instead
     */
    public static void main(String[] args) throws Exception {
        CatchImage cm = new CatchImage();
        // Fetch the HTML text of the page.
        String html = cm.getHTML(PAGE_URL);
        // Collect the <img> tags.
        List<String> imgUrl = cm.getImageUrl(html);
        // Collect the image addresses found in those tags.
        List<String> imgSrc = cm.getImageSrc(imgUrl);
        // Download the images.
        cm.Download(imgSrc);
    }

    /**
     * Downloads the resource at {@code url} and decodes it as UTF-8 text.
     *
     * @param url address of the page to fetch
     * @return the full response body as a string
     * @throws Exception if the URL is malformed or the content cannot be read
     */
    private String getHTML(String url) throws Exception {
        URLConnection connection = new URL(url).openConnection();
        try (InputStream in = connection.getInputStream();
             ByteArrayOutputStream bytes = new ByteArrayOutputStream()) {
            byte[] buf = new byte[BUFFER_SIZE];
            int length;
            while ((length = in.read(buf)) != -1) {
                // Copy only the bytes actually read; the rest of buf is stale data.
                bytes.write(buf, 0, length);
            }
            // Decode once at the end so a multi-byte character that straddles
            // two reads is not corrupted.
            return bytes.toString(ENCODING);
        }
    }

    /**
     * Extracts every {@code <img>} tag from an HTML document.
     *
     * @param HTML the HTML text to scan
     * @return the matched tags in document order; empty if there are none
     */
    private List<String> getImageUrl(String HTML) {
        Matcher matcher = IMG_TAG_PATTERN.matcher(HTML);
        List<String> listImgUrl = new ArrayList<>();
        while (matcher.find()) {
            listImgUrl.add(matcher.group());
        }
        return listImgUrl;
    }

    /**
     * Extracts the absolute http/https URLs contained in the given img tags.
     * Every such URL in a tag is returned, whichever attribute it appears in.
     *
     * @param listImageUrl img tags as returned by {@link #getImageUrl(String)}
     * @return the URLs found, in order of appearance; empty if there are none
     */
    private List<String> getImageSrc(List<String> listImageUrl) {
        List<String> listImgSrc = new ArrayList<>();
        for (String image : listImageUrl) {
            Matcher matcher = IMG_SRC_PATTERN.matcher(image);
            while (matcher.find()) {
                listImgSrc.add(matcher.group());
            }
        }
        return listImgSrc;
    }

    /**
     * Saves each URL to a file in the current working directory, named after the
     * last segment of the URL path. A failure on one URL is reported on standard
     * output and does not prevent the remaining URLs from being downloaded.
     *
     * @param listImgSrc absolute URLs of the images to download
     */
    private void Download(List<String> listImgSrc) {
        for (String url : listImgSrc) {
            try {
                URL uri = new URL(url);
                // Use the path only, so a query string never ends up in the file name.
                String path = uri.getPath();
                String imageName = path.substring(path.lastIndexOf('/') + 1);
                if (imageName.isEmpty()) {
                    // e.g. "http://host/dir/": there is nothing to name the file after.
                    System.out.println("下载失败:" + url + " (no file name in URL)");
                    continue;
                }
                System.out.println("开始下载:" + url);
                try (InputStream in = uri.openStream();
                     FileOutputStream fo = new FileOutputStream(new File(imageName))) {
                    byte[] buf = new byte[BUFFER_SIZE];
                    int length;
                    while ((length = in.read(buf)) != -1) {
                        fo.write(buf, 0, length);
                    }
                }
                System.out.println(imageName + "下载完成");
            } catch (IOException e) {
                // Report which image failed and why, then carry on with the next one.
                System.out.println("下载失败:" + url + " (" + e + ")");
            }
        }
    }
}