使用 Java 抓取图片网站上的 jpg 头像(图片),并下载到本地目录
修改正则表达式即可获取你想要的网络资源,如图片、网址等;也可以在抓取到网址之后,继续抓取与之相关联的网络资源。
/**
 * Minimal single-page crawler.
 *
 * <p>{@link #main(String[])} fetches one web page, extracts every
 * protocol-relative {@code .jpg} link found in it, writes the links to a text
 * file and then downloads each image into a local directory under a random
 * UUID-based file name. Changing the regular expression lets the same code
 * collect other kinds of links.
 */
public class Robot {

    /** Page whose content is scanned for image links. */
    private static final String PAGE_URL = "https://www.baihe.com/";

    /** Text file that receives one matched link per line. */
    private static final String LINK_LOG_FILE = "D:/SiteURL.txt";

    /** Directory the downloaded images are stored in. */
    private static final String IMAGE_DIR = "D:\\images";

    /** Connect timeout, in milliseconds, for every connection opened here. */
    private static final int CONNECT_TIMEOUT_MS = 5 * 1000;

    /** Read timeout, in milliseconds, so a stalled server cannot hang the program. */
    private static final int READ_TIMEOUT_MS = 30 * 1000;

    // Link matching rule: protocol-relative links ending in ".jpg".
    // Alternative rule for whole site addresses: "http://[\\w+\\.?/?]+\\.[A-Za-z]+"
    // NOTE(review): the trailing "g+" also accepts ".jpgg..."; kept unchanged so
    // that exactly the same links are matched as before.
    private static final Pattern IMAGE_LINK = Pattern.compile("//[\\w+\\.?/?]+\\.jpg+");

    /**
     * Crawls {@code PAGE_URL}, logs the image links and downloads every image.
     *
     * <p>Failures are reported on standard error. A failure while crawling
     * stops the run; a failure while downloading one image only skips that
     * image so the remaining ones are still fetched.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        List<String> urls;
        try {
            urls = collectImageUrls(PAGE_URL, LINK_LOG_FILE);
        } catch (IOException e) {
            // Covers MalformedURLException as well (it is an IOException).
            e.printStackTrace();
            return;
        }
        System.out.println("爬取成功^_^");

        // Download the images to the local directory.
        for (String imageUrl : urls) {
            String extensionName = imageUrl.substring(imageUrl.lastIndexOf(".") + 1);
            // A random UUID keeps file names unique even when links repeat.
            String newFileName = UUID.randomUUID() + "." + extensionName;
            try {
                download(imageUrl, newFileName, IMAGE_DIR);
            } catch (Exception e) {
                // One broken link must not abort the remaining downloads.
                e.printStackTrace();
            }
        }
    }

    /**
     * Reads the page line by line and collects every link matching
     * {@code IMAGE_LINK}. Each raw match is appended to {@code logFile}; the
     * returned entries are the matches prefixed with {@code "https:"}.
     *
     * @param pageUrl address of the page to scan
     * @param logFile path of the text file receiving one raw match per line
     * @return the absolute image addresses, in order of appearance
     * @throws IOException if the page cannot be read or the log file cannot be written
     */
    private static List<String> collectImageUrls(String pageUrl, String logFile) throws IOException {
        List<String> urls = new ArrayList<>();
        URLConnection conn = new URL(pageUrl).openConnection();
        conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
        conn.setReadTimeout(READ_TIMEOUT_MS);
        // try-with-resources closes both streams even when opening the second
        // one fails, which the former finally block could not do safely.
        try (PrintWriter linkLog = new PrintWriter(new FileWriter(logFile), true);
             BufferedReader reader = new BufferedReader(
                     new InputStreamReader(conn.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher matcher = IMAGE_LINK.matcher(line);
                while (matcher.find()) {
                    String link = matcher.group();
                    linkLog.println(link);
                    urls.add("https:" + link);
                }
            }
        }
        return urls;
    }

    /**
     * Downloads the resource at the given address into a file.
     *
     * @param urlString address of the resource to download
     * @param filename  name of the file to create inside {@code savePath}
     * @param savePath  target directory; created (with parents) when missing
     * @throws Exception if the address is malformed, the directory cannot be
     *                   created, or reading/writing fails
     */
    public static void download(String urlString, String filename, String savePath) throws Exception {
        URL url = new URL(urlString);
        URLConnection con = url.openConnection();
        con.setConnectTimeout(CONNECT_TIMEOUT_MS);
        con.setReadTimeout(READ_TIMEOUT_MS);

        try (InputStream is = con.getInputStream()) {
            File sf = new File(savePath);
            // Re-check isDirectory() after mkdirs() in case something else
            // created the directory in the meantime.
            if (!sf.isDirectory() && !sf.mkdirs() && !sf.isDirectory()) {
                throw new IOException("Cannot create directory: " + sf);
            }
            // File(parent, child) uses the platform separator instead of a
            // hard-coded backslash.
            File target = new File(sf, filename);
            try (OutputStream os = new FileOutputStream(target)) {
                byte[] bs = new byte[1024];
                int len;
                while ((len = is.read(bs)) != -1) {
                    os.write(bs, 0, len);
                }
            }
        }
    }
}