[前言]
前几天因为需要一些图片做一个小测试,正好也看了一下正则表达式,这个小程序也就相当于是正则表达式的一个"小应用"吧,于是写了一个很基础的爬虫,从静态网页源代码里面通过正则表达式分析出图片链接,然后将图片下载到本地.
[正则表达式]
我在知乎上看到一篇关于正则表达式比较好的文章,在这里把链接贴出来,供大家参考学习.
https://www.zhihu.com/question/48219401/answer/742444326
[输入输出流]
输入输出流不太熟悉的朋友可以参考博客上的一篇关于IO流的文章.
https://blog.csdn.net/hguisu/article/details/7418161
[代码部分]
1.从一个网络连接里面获取网页的源代码,其中charset表示该网页的编码方式,一般是UTF-8或者是GBK;
//从一个网络链接里面获取源代码
/**
 * Fetches the raw source of a web page.
 *
 * @param urlStr  address of the page to read
 * @param charset encoding of the page, e.g. "UTF-8" or "GBK"
 * @return the page source with line breaks removed, or "" on error
 */
public static String getURLContent(String urlStr, String charset) {
    StringBuilder sb = new StringBuilder();
    try {
        URL url = new URL(urlStr);
        // try-with-resources: the original never closed the reader (resource leak).
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(url.openStream(), Charset.forName(charset)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
        }
    } catch (MalformedURLException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return sb.toString();
}
2.对获取的源代码进行分析筛选
/**
 * Applies {@code regurx} to the page source of {@code WebURL} and returns the
 * de-duplicated capture-group-1 matches with the trailing quote stripped.
 *
 * @param WebURL  page to scan
 * @param regurx  first-pass regex; group 1 must capture the candidate link
 * @param charset encoding of the page
 * @return unique candidate links (order unspecified)
 */
public static Set<String> getURLContent2(String WebURL, String regurx, String charset) throws IOException {
    String destStr = getURLContent(WebURL, charset);
    Pattern p = Pattern.compile(regurx);
    // Second-pass pattern hoisted out of the loop — the original recompiled it per element.
    Pattern pp = Pattern.compile("(.+?)\"");
    Set<String> result = new HashSet<>();
    Matcher m = p.matcher(destStr);
    while (m.find()) {
        // Strip the trailing quote captured by the caller's first-pass regex.
        Matcher mm = pp.matcher(m.group(1));
        if (mm.find()) {
            result.add(mm.group(1));
        }
    }
    return result;
}
3.从链接里面下载一张图片
/**
 * Downloads a single image from {@code pngAdress} into {@code storagePath}.
 * Skips the download unless the server answers HTTP 200 — the original also
 * saved error pages (e.g. 500 responses) as image files.
 */
public static void downloadJPG(String pngAdress, String storagePath) {
    HttpURLConnection con = null;
    try {
        URL url = new URL(pngAdress);
        con = (HttpURLConnection) url.openConnection();
        // Check the status first; the original called getInputStream() before
        // getResponseCode(), so a 404 left state == 0 only by accident.
        if (con.getResponseCode() != HttpURLConnection.HTTP_OK) {
            return;
        }
        // try-with-resources closes both streams even on failure; the original
        // could NPE in its finally block when inputStream was still null.
        try (InputStream in = con.getInputStream();
             OutputStream out = new FileOutputStream(new File(storagePath))) {
            byte[] buf = new byte[8192];
            int n;
            while ((n = in.read(buf)) != -1) {
                out.write(buf, 0, n);
            }
            out.flush();
        }
    } catch (Exception e) {
        System.out.println("URL不可用");
    } finally {
        // Release the connection; the original never disconnected.
        if (con != null) {
            con.disconnect();
        }
    }
}
4.过滤下载,即只下载图片,并在此处填写本地保存文件夹的路径
/**
 * Filters the harvested links down to real image URLs and downloads each one
 * into the local folder, numbering the saved files from {@code i}.
 *
 * @param website page to crawl
 * @param regurx  first-pass regex handed to {@link #getURLContent2}
 * @param ch      page encoding
 * @param i       starting index for the output file names
 */
public static void downloadJPG02(String website, String regurx, String ch, int i) {
    Set<String> result;
    try {
        result = getURLContent2(website, regurx, ch);
    } catch (IOException e) {
        e.printStackTrace();
        return; // nothing fetched -> nothing to download (original NPE'd on null result)
    }
    // BUG FIX: the original pattern "[(.jpg)(.png)(.gif)]$" was a character
    // class matching ANY single trailing char of "(.jpgno if)" — it did not
    // test for extensions at all. Use real alternation instead.
    Pattern p = Pattern.compile("\\.(jpg|png|gif)$");
    String storagrPath = "D:\\crawlerJPG02\\"; // local destination folder
    for (String temp : result) {
        if (p.matcher(temp).find()) {
            downloadJPG(temp, storagrPath + "Imag" + i + ".jpg");
            ++i;
        }
        // (dropped the original's dead tag/website branch: it reassigned the
        // local parameter `website`, which was never read again)
    }
}
5.主函数
/** Entry point: crawl Baidu image search for the keyword and save the hits locally. */
public static void main(String[] args) {
    final String searchUrl = "https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=";
    final String linkPattern = "\"(http[s]?://(.+?jpg)\")";
    downloadJPG02(searchUrl + "熊猫", linkPattern, "UTF-8", 1);
}
6.全部代码
import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
//1.从一个网络连接里面获取网页的源代码,其中charset表示该网页的编码方式,一般是UTF-8或者是GBK;
/**
 * Minimal static-page image crawler: fetch a page, regex out the image links,
 * download each image to a local folder.
 */
public class Test {

    /**
     * Fetches the raw source of a web page.
     *
     * @param urlStr  address of the page to read
     * @param charset encoding of the page, e.g. "UTF-8" or "GBK"
     * @return the page source with line breaks removed, or "" on error
     */
    public static String getURLContent(String urlStr, String charset) {
        StringBuilder sb = new StringBuilder();
        try {
            URL url = new URL(urlStr);
            // try-with-resources: the original never closed the reader (resource leak).
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(url.openStream(), Charset.forName(charset)))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line);
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Applies {@code regurx} to the page source and returns the de-duplicated
     * capture-group-1 matches with the trailing quote stripped.
     */
    public static Set<String> getURLContent2(String WebURL, String regurx, String charset) throws IOException {
        String destStr = getURLContent(WebURL, charset);
        Pattern p = Pattern.compile(regurx);
        // Second-pass pattern hoisted out of the loop — the original recompiled it per element.
        Pattern pp = Pattern.compile("(.+?)\"");
        Set<String> result = new HashSet<>();
        Matcher m = p.matcher(destStr);
        while (m.find()) {
            // Strip the trailing quote captured by the caller's first-pass regex.
            Matcher mm = pp.matcher(m.group(1));
            if (mm.find()) {
                result.add(mm.group(1));
            }
        }
        return result;
    }

    /**
     * Downloads a single image from {@code pngAdress} into {@code storagePath}.
     * Skips the download unless the server answers HTTP 200 — the original also
     * saved error pages (e.g. 500 responses) as image files.
     */
    public static void downloadJPG(String pngAdress, String storagePath) {
        HttpURLConnection con = null;
        try {
            URL url = new URL(pngAdress);
            con = (HttpURLConnection) url.openConnection();
            // Check the status first; the original read the stream before the code.
            if (con.getResponseCode() != HttpURLConnection.HTTP_OK) {
                return;
            }
            // try-with-resources closes both streams even on failure; the
            // original could NPE in its finally when inputStream was null.
            try (InputStream in = con.getInputStream();
                 OutputStream out = new FileOutputStream(new File(storagePath))) {
                byte[] buf = new byte[8192];
                int n;
                while ((n = in.read(buf)) != -1) {
                    out.write(buf, 0, n);
                }
                out.flush();
            }
        } catch (Exception e) {
            System.out.println("URL不可用");
        } finally {
            // Release the connection; the original never disconnected.
            if (con != null) {
                con.disconnect();
            }
        }
    }

    /**
     * Filters the harvested links down to real image URLs and downloads each
     * one into the local folder, numbering the saved files from {@code i}.
     */
    public static void downloadJPG02(String website, String regurx, String ch, int i) {
        Set<String> result;
        try {
            result = getURLContent2(website, regurx, ch);
        } catch (IOException e) {
            e.printStackTrace();
            return; // nothing fetched -> nothing to download (original NPE'd here)
        }
        // BUG FIX: "[(.jpg)(.png)(.gif)]$" was a character class matching ANY
        // single trailing char of "(.jpgno if)"; use extension alternation.
        Pattern p = Pattern.compile("\\.(jpg|png|gif)$");
        String storagrPath = "D:\\crawlerJPG02\\"; // local destination folder
        for (String temp : result) {
            if (p.matcher(temp).find()) {
                downloadJPG(temp, storagrPath + "Imag" + i + ".jpg");
                ++i;
            }
        }
    }

    /** Entry point: crawl Baidu image search for the keyword and save the hits. */
    public static void main(String[] args) {
        String website = "https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=";
        String regurx = "\"(http[s]?://(.+?jpg)\")";
        String ch = "UTF-8";
        downloadJPG02(website + "熊猫", regurx, ch, 1);
    }
}
7.运行结果
[写在最后]
以上就是全部内容了,仅供大家学习参考,若有不对的地方,请留言或私信指出,谢谢大家.