实现了一个简单的爬虫
一、功能
爬取壁纸图片(要求大于1M)
二、待完善:
1. 关键字爬取 (按壁纸类型关键字爬取)
2. 网址筛选(剔除收集到的未爬但无用网址)
3. List 的处理不够完善:已收集网址会重复累加,容量无限增长的问题未处理
三、结语
新手上路,请多关照!(手动滑稽)!
maven相关依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
下面是源代码:
import org.jsoup.Jsoup;

import java.io.*;
import java.net.*;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @author guohao
* @Description
* @Date 2021/10/12
*/
public class Robot {
    /** Starting URL for the crawl. */
    private static final String START_URL = "http://www.netbian.com/desk/18321.htm";
    /** Local directory downloaded wallpapers are written to (must already exist). */
    private static final String SAVE_DIR = "D:\\pachong\\";
    /**
     * Matches {@code <img src="...jpg">} tags, with an optional "http:" scheme
     * prefix. Compiled once ({@code static final}) instead of on every call.
     * The original character class {@code [.][ji][pm][g]} also matched the
     * bogus extensions ".ipg", ".jmg" and ".img" — fixed to plain ".jpg".
     */
    private static final Pattern IMG_PATTERN =
            Pattern.compile("<img src=\"(http:)?[A-Za-z0-9/.]+\\.jpg\"");
    /** Matches {@code <a href="...">} links, absolute ("http(s)://") or relative. */
    private static final Pattern URL_PATTERN =
            Pattern.compile("<a href=\"(https?:/+)?[A-Za-z0-9/.?=-]+\"");
    /**
     * URLs already crawled (or de-duplicated out of the queue). A HashSet
     * replaces the original List: O(1) membership test and no unbounded
     * duplicate growth.
     */
    private static final Set<String> visitedUrls = new HashSet<>();

    public static void main(String[] args) {
        System.out.println("开始爬虫!");
        robot(START_URL);
    }

    /**
     * Crawls pages starting at {@code url}: downloads every matched image
     * larger than 1 MB, then follows the links discovered on each page.
     * <p>
     * Implemented as an iterative breadth-first walk over a work queue.
     * The original recursive version mutated its pending list while iterating
     * it (guaranteed {@link java.util.ConcurrentModificationException}) and
     * risked {@link StackOverflowError} on long link chains.
     *
     * @param url the page to start crawling from
     */
    public static void robot(String url) {
        Deque<String> workQueue = new ArrayDeque<>();
        workQueue.add(url);
        while (!workQueue.isEmpty()) {
            String current = workQueue.poll();
            // Set.add returns false when the URL was already seen — skip it.
            if (!visitedUrls.add(current)) {
                continue;
            }
            System.out.println("爬取地址:" + current);
            for (String result : getResultList(current)) {
                // Only download files of at least 1 MB.
                if (fileLengthOut1M(result)) {
                    downloadImg(result, SAVE_DIR);
                }
            }
            // Newly discovered links go onto the queue; duplicates are
            // filtered by the visitedUrls check above when dequeued.
            workQueue.addAll(getNextUrlList(current));
        }
    }

    /**
     * Extracts the image URLs referenced by the page at {@code url}.
     *
     * @param url page to scrape
     * @return image URLs found on the page (possibly empty, never null)
     */
    public static List<String> getResultList(String url) {
        return extractUrls(url, IMG_PATTERN, "<img src=");
    }

    /**
     * Extracts the outgoing link URLs of the page at {@code url}.
     *
     * @param url page to scrape
     * @return link URLs found on the page (possibly empty, never null)
     */
    public static List<String> getNextUrlList(String url) {
        return extractUrls(url, URL_PATTERN, "<a href=");
    }

    /**
     * Shared scraper behind getResultList/getNextUrlList (the originals were
     * copy-paste duplicates): fetches the page, collects every regex match,
     * absolutizes relative references and strips the tag prefix and quotes.
     */
    private static List<String> extractUrls(String pageUrl, Pattern pattern, String tagPrefix) {
        List<String> results = new ArrayList<>();
        Matcher matcher = pattern.matcher(getHtml(pageUrl));
        while (matcher.find()) {
            String match = matcher.group();
            if (!match.contains("http")) {
                // NOTE(review): joining with "/" onto a page URL such as
                // ".../desk/18321.htm" does not build a real absolute URL;
                // behavior kept from the original — java.net.URI#resolve
                // would be the proper fix.
                match = pageUrl + "/" + match;
            }
            results.add(match.replace(tagPrefix, "").replace("\"", ""));
        }
        return results;
    }

    /**
     * Downloads the file at {@code resultUrl} into directory {@code localPath},
     * naming it after the last path segment of the URL. Prints success or
     * failure; never throws.
     * <p>
     * Streams are managed with try-with-resources: the original closed them in
     * a finally block that threw NPE whenever the connection itself failed
     * (both references still null), masking the real error. The connection is
     * now also disconnected.
     *
     * @param resultUrl remote file URL
     * @param localPath destination directory, with trailing separator
     */
    public static void downloadImg(String resultUrl, String localPath) {
        try {
            System.out.println("开始准备下载!");
            HttpURLConnection connection = (HttpURLConnection) new URL(resultUrl).openConnection();
            // File name = everything after the last '/' of the URL.
            String outputPath = localPath + resultUrl.substring(resultUrl.lastIndexOf("/") + 1);
            System.out.println("图片路径:" + outputPath);
            try (InputStream inputStream = connection.getInputStream();
                 FileOutputStream fileOutputStream = new FileOutputStream(outputPath)) {
                fileOutputStream.write(getBytesFromInputStream(inputStream));
            } finally {
                connection.disconnect();
            }
            System.out.println("下载成功!");
        } catch (Exception e) {
            System.out.println("下载失败!");
        } finally {
            System.out.println("===============================================================");
        }
    }

    /**
     * Reads {@code inputStream} to exhaustion and returns its bytes.
     * The caller remains responsible for closing the stream.
     *
     * @param inputStream stream to drain
     * @return the bytes read, or {@code null} on I/O error
     *         (null-on-error contract preserved from the original)
     */
    public static byte[] getBytesFromInputStream(InputStream inputStream) {
        try {
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int read;
            while ((read = inputStream.read(chunk)) != -1) {
                buffer.write(chunk, 0, read);
            }
            return buffer.toByteArray();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Fetches the raw HTML body of the page at {@code url} via jsoup.
     *
     * @param url page URL
     * @return the response body, or "" when the request fails
     */
    public static String getHtml(String url) {
        try {
            return Jsoup.connect(url).execute().body();
        } catch (IOException e) {
            // Best-effort: a failed fetch yields an empty page, but log it
            // instead of silently swallowing (the original catch was empty).
            System.out.println("获取页面失败:" + url);
            return "";
        }
    }

    /**
     * Returns whether the remote file at {@code downloadUrl} is at least 1 MB,
     * determined with a HEAD request so no body is transferred.
     * <p>
     * Uses {@code getContentLengthLong()} so files over 2 GB no longer
     * overflow the old int arithmetic, and null-checks the connection before
     * disconnecting (the original finally block threw NPE when
     * {@code openConnection} itself failed).
     *
     * @param downloadUrl remote file URL
     * @return true if Content-Length >= 1 MB; false on error or smaller files
     */
    private static boolean fileLengthOut1M(String downloadUrl) {
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(downloadUrl).openConnection();
            connection.setRequestMethod("HEAD");
            connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows 7; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36 YNoteCef/5.8.0.1 (Windows)");
            return connection.getContentLengthLong() >= 1024L * 1024L;
        } catch (IOException e) {
            System.out.println("获取文件大小失败!");
            return false;
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }
}