Java 初级web爬虫(下载网页图片)
Java初级web爬虫(网页图片)
功能:下载指定网页中指定后缀的图片文件,还可以筛选排除指定大小以下的图片。
使用:执行main()方法后,会将指定网页中后缀为“.jpg”或“.png”的图片下载到D盘名为“下载”的文件夹中。
没有的话创建一个文件夹(我就懒得建了哈)
代码里的常量请根据自己的需求修改:一是网页地址(TARGET_URL),二是筛选排除的文件大小阈值(FILTER,单位是字节哈)。
爬虫很容易实现,但是要考虑的细节有很多。还会继续改进,以实现更多的功能。
ps:另外,本次代码没有使用其他组件,粘贴即可运行。
话不多说,上代码:
package com.jl.down;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.util.regex.Pattern.compile;
/**
 * Minimal single-page image crawler.
 *
 * <p>Fetches the HTML of {@link #TARGET_URL}, extracts the absolute URLs found in the
 * {@code src} attribute of {@code <img>} tags, and saves every JPEG/PNG response whose
 * declared size is at least {@link #FILTER} bytes into the {@code D:\下载} directory.
 * Progress and a final summary are printed to standard output.
 */
public class DownPic {
    /** Address of the page whose images are downloaded. */
    private static final String TARGET_URL = "https://www.haorenka.org/20191226.html";

    /** Minimum declared size in bytes; images reporting a smaller Content-Length are skipped. */
    public static final int FILTER = 1000000;

    /** Connect and read timeout applied to every image request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 20 * 1000;

    /** Matches an {@code <img ...>} tag; group 2 holds the attribute text. */
    private static final Pattern IMG_TAG = Pattern.compile("<(img|IMG)(.*?)(/>|></img>|>)");

    /** Matches a quoted {@code src} attribute; group 3 holds the attribute value. */
    private static final Pattern SRC_ATTR = Pattern.compile("(src|SRC)=(\"|\')(.*?)(\"|\')");

    /** Matches an absolute http/https/ftp/file URL inside an attribute value. */
    private static final Pattern ABSOLUTE_URL =
            Pattern.compile("(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]");

    /** Content types that are accepted for download. */
    private static final Pattern ACCEPTED_TYPE = Pattern.compile("(image/jpeg)|(image/png)");

    /** Characters that cannot appear in a Windows file name. */
    private static final Pattern ILLEGAL_FILENAME_CHARS = Pattern.compile("[\\\\/:*?\"<>|]");

    /**
     * Downloads the images of {@link #TARGET_URL} and prints the elapsed time in seconds.
     *
     * @param args unused
     * @throws Exception if the page itself cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        long startTime = System.currentTimeMillis();
        DownPic cm = new DownPic();
        String html = cm.getHTML(TARGET_URL, "UTF-8");
        List<String> imgSrc = getImgSrc(html);
        cm.downLoad(imgSrc);
        long endTime = System.currentTimeMillis();
        System.out.println("本次下载共耗时" + (endTime - startTime) / 1000.00 + "秒");
    }

    /**
     * Reads the whole response body of {@code url} as text.
     *
     * <p>Lines are concatenated without any separator. The tag pattern used by
     * {@link #getImgSrc(String)} relies on {@code .}, which does not match line terminators,
     * so joining the lines lets tags that span several lines still be matched.
     *
     * @param url     address of the page to fetch
     * @param charset name of the charset used to decode the response
     * @return the page content with line terminators removed
     * @throws Exception if the URL is malformed, the charset is unsupported, or reading fails
     */
    private String getHTML(String url, String charset) throws Exception {
        URLConnection connection = new URL(url).openConnection();
        // try-with-resources closes the reader even when readLine() throws.
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(connection.getInputStream(), charset))) {
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line);
            }
            return page.toString();
        }
    }

    /**
     * Extracts the absolute image URLs referenced by {@code <img>} tags.
     *
     * <p>For each tag the first quoted {@code src}/{@code SRC} attribute is inspected and the
     * first absolute http/https/ftp/file URL inside its value is kept. Tags without such an
     * attribute, and relative addresses, are ignored.
     *
     * @param content HTML text to scan; {@code null} is treated as empty
     * @return the URLs in document order, possibly empty, never {@code null}
     */
    public static List<String> getImgSrc(String content) {
        List<String> urls = new ArrayList<>();
        if (content == null) {
            return urls;
        }
        Matcher tag = IMG_TAG.matcher(content);
        while (tag.find()) {
            Matcher src = SRC_ATTR.matcher(tag.group(2));
            if (!src.find()) {
                continue;
            }
            Matcher absolute = ABSOLUTE_URL.matcher(src.group(3));
            if (absolute.find()) {
                urls.add(absolute.group());
            }
        }
        return urls;
    }

    /**
     * Derives a local file name from the path component of a URL.
     *
     * <p>Only the text after the last {@code '/'} is used, and characters that are illegal in
     * Windows file names are replaced by {@code '_'}.
     *
     * @param path URL path such as {@code /a/b.jpg}; may be {@code null}
     * @return the sanitized last path segment, or an empty string when the path is
     *         {@code null}, ends with {@code '/'}, or its last segment is {@code .} or {@code ..}
     */
    static String fileNameFromPath(String path) {
        if (path == null) {
            return "";
        }
        String name = path.substring(path.lastIndexOf('/') + 1);
        if (name.equals(".") || name.equals("..")) {
            return "";
        }
        return ILLEGAL_FILENAME_CHARS.matcher(name).replaceAll("_");
    }

    /**
     * Downloads every accepted image in {@code listImgSrc} into {@code D:\下载} and prints a
     * summary of how many links were found, rejected and downloaded.
     *
     * <p>The first {@link IOException} stops the remaining downloads; the summary is still
     * printed afterwards.
     *
     * @param listImgSrc absolute image URLs to try
     */
    private void downLoad(List<String> listImgSrc) {
        File src = new File("D:\\下载");
        int count = 0;
        try {
            // Create the target directory up front so the first write does not fail on it.
            if (!src.isDirectory() && !src.mkdirs()) {
                throw new IOException("无法创建目录: " + src);
            }
            for (int i = 0; i < listImgSrc.size(); i++) {
                if (downloadOne(listImgSrc.get(i), src, i)) {
                    count++;
                }
            }
        } catch (IOException e) {
            System.out.println("异常,停止运行");
            e.printStackTrace();
        }
        System.out.println("此网页共获得图片连接" + listImgSrc.size() + "个");
        System.out.println("不符合条件者" + (listImgSrc.size() - count) + "个");
        System.out.println("本次共下载" + count + "张图片");
    }

    /**
     * Downloads a single image if the server response passes all filters.
     *
     * @param address   absolute URL of the image
     * @param targetDir directory the file is written to
     * @param index     zero-based position of the link, used to build a fallback file name
     * @return {@code true} if the image was saved, {@code false} if it was skipped
     * @throws IOException if the URL is malformed or the transfer fails
     */
    private boolean downloadOne(String address, File targetDir, int index) throws IOException {
        URLConnection raw = new URL(address).openConnection();
        // The URL filter also lets ftp:// and file:// through; those are not HTTP connections
        // and a blind cast would throw ClassCastException.
        if (!(raw instanceof HttpURLConnection)) {
            System.out.println("######不是指定资源...跳过本次下载######");
            return false;
        }
        HttpURLConnection connection = (HttpURLConnection) raw;
        // Timeouts only take effect when they are configured before the connection is opened.
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);
        try {
            if (connection.getResponseCode() >= 400) {
                // One broken link is skipped instead of terminating the whole JVM, so the
                // remaining images and the final summary are still processed.
                System.out.println("服务器响应错误");
                return false;
            }
            int fileLength = connection.getContentLength();
            if (fileLength <= 0) {
                System.out.println("无法获知文件大小");
                System.out.println("无法下载...跳过...");
                return false;
            }
            if (fileLength < FILTER) {
                System.out.println("图片小于指定大小...");
                System.out.println("不予下载...");
                return false;
            }
            System.out.println("本次下载大小------>" + fileLength / 1024 + "K");

            // Use the URL path only, so a query string never ends up in the file name.
            String filename = fileNameFromPath(connection.getURL().getPath());
            if (filename.isEmpty()) {
                filename = "image_" + (index + 1);
            }
            System.out.println("------------------" + filename + "------------------");

            String contentType = connection.getContentType();
            if (contentType == null || !ACCEPTED_TYPE.matcher(contentType).find()) {
                System.out.println("######不是指定资源...跳过本次下载######");
                return false;
            }

            System.out.println("开始下载:" + filename);
            // Both streams are closed (and the output flushed) for every image, not only
            // for the last one processed.
            try (InputStream in = new BufferedInputStream(connection.getInputStream());
                 OutputStream out = new BufferedOutputStream(
                         new FileOutputStream(new File(targetDir, filename)))) {
                byte[] buf = new byte[8192];
                int length;
                while ((length = in.read(buf, 0, buf.length)) != -1) {
                    out.write(buf, 0, length);
                }
            }
            System.out.println(filename + "\t\t\t\t下载完成\r\n");
            return true;
        } finally {
            connection.disconnect();
        }
    }
}