Java实现简单爬虫

实现了一个简单的爬虫

一、功能

爬取壁纸图片(要求大于1M)

二、待完善:

1. 关键字爬取 (按壁纸类型关键字爬取)

2. 网址筛选(剔除已收集但尚未爬取的无用网址)

3. 保存网址的 list 处理不够完善:待爬取列表会不断累加扩容,尚未做去重和容量控制

三、结语

新手上路,请多关照!(手动滑稽)!

maven相关依赖

  <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.13.1</version>
  </dependency>

下面是源代码:

import java.io.*;
import java.net.*;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;

/**
 * A minimal single-threaded crawler that downloads images of at least 1 MiB.
 *
 * <p>Starting from one page, it extracts {@code <img src="...">} and {@code <a href="...">}
 * targets with regular expressions, downloads every image whose reported size is at least
 * 1 MiB, and then continues with the linked pages. Pages are processed from a FIFO work
 * queue, and every page and image URL is handled at most once per JVM.
 *
 * <p>The class keeps its crawl state in static fields and is not thread-safe.
 *
 * @author guohao (2021/10/12)
 */
public class Robot {
    /** Page the crawl started by {@link #main(String[])} begins at. */
    private static final String START_URL = "http://www.netbian.com/desk/18321.htm";
    /** Path prefix that the image file name is appended to when saving. */
    private static final String SAVE_PATH = "D:\\pachong\\";

    /** Literal text every image match starts with; stripped to obtain the target. */
    private static final String IMG_TAG_PREFIX = "<img src=\"";
    /** Literal text every link match starts with; stripped to obtain the target. */
    private static final String LINK_TAG_PREFIX = "<a href=\"";

    /** Matches image tags; compiled once instead of on every call. */
    private static final Pattern IMG_PATTERN =
            Pattern.compile("<img src=\"([h][t][t][p][:]){0,}[A-Za-z0-9/.]+[.][ji][pm][g]\"");
    /** Matches anchor tags; compiled once instead of on every call. */
    private static final Pattern LINK_PATTERN =
            Pattern.compile("<a href=\"([h][t][t][p][s]?[:][/]+)?[A-Za-z0-9/.?=-]+\"");

    /** Smallest reported content length (1 MiB) for an image to be downloaded. */
    private static final long MIN_IMAGE_BYTES = 1024L * 1024L;

    /** Page URLs that have been crawled or are already queued for crawling. */
    private static final Set<String> SEEN_PAGES = new HashSet<>();
    /** Image URLs that have already been size-checked, so they are not processed again. */
    private static final Set<String> SEEN_IMAGES = new HashSet<>();

    /**
     * Starts the crawl at the built-in start page.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("开始爬虫!");
        robot(START_URL);
    }

    /**
     * Crawls {@code url} and every page reachable from it.
     *
     * <p>The crawl is iterative (a FIFO queue) rather than recursive, so the call stack does
     * not grow with the number of pages and no collection is modified while it is iterated.
     * Each page is fetched once and both images and links are extracted from that one
     * response.
     *
     * @param url page to start from; a {@code null} or already seen URL is ignored
     */
    public static void robot(String url) {
        if (url == null || !SEEN_PAGES.add(url)) {
            return;
        }
        Deque<String> pending = new ArrayDeque<>();
        pending.add(url);
        while (!pending.isEmpty()) {
            String pageUrl = pending.poll();
            System.out.println("爬取地址:" + pageUrl);
            String html = getHtml(pageUrl);

            for (String imageUrl : extractTargets(html, pageUrl, IMG_PATTERN, IMG_TAG_PREFIX)) {
                // Only images reported as >= 1 MiB are downloaded; each URL is checked once.
                if (SEEN_IMAGES.add(imageUrl) && fileLengthOut1M(imageUrl)) {
                    downloadImg(imageUrl, SAVE_PATH);
                }
            }
            for (String nextUrl : extractTargets(html, pageUrl, LINK_PATTERN, LINK_TAG_PREFIX)) {
                // Marking at enqueue time keeps the queue free of duplicates.
                if (SEEN_PAGES.add(nextUrl)) {
                    pending.add(nextUrl);
                }
            }
        }
    }

    /**
     * Fetches a page and returns the absolute URLs of the images found in it.
     *
     * @param url page to fetch
     * @return image URLs in document order; empty when the page cannot be fetched
     */
    public static List<String> getResultList(String url) {
        return extractTargets(getHtml(url), url, IMG_PATTERN, IMG_TAG_PREFIX);
    }

    /**
     * Fetches a page and returns the absolute URLs of the links found in it.
     *
     * @param url page to fetch
     * @return link URLs in document order; empty when the page cannot be fetched
     */
    public static List<String> getNextUrlList(String url) {
        return extractTargets(getHtml(url), url, LINK_PATTERN, LINK_TAG_PREFIX);
    }

    /**
     * Downloads one resource and saves it as {@code localPath + <last path segment>}.
     *
     * <p>Progress and the outcome are printed to standard output; failures never propagate
     * to the caller. The response is read completely before the target file is created, so a
     * failed transfer does not leave an empty file behind. Missing parent directories of the
     * target file are created.
     *
     * @param resultUrl absolute HTTP(S) URL of the resource
     * @param localPath path prefix the file name is appended to
     */
    public static void downloadImg(String resultUrl, String localPath) {
        HttpURLConnection connection = null;
        try {
            System.out.println("开始准备下载!");
            connection = (HttpURLConnection) new URL(resultUrl).openConnection();

            byte[] content;
            try (InputStream in = connection.getInputStream()) {
                content = getBytesFromInputStream(in);
            }
            if (content == null) {
                throw new IOException("could not read response body");
            }

            // File name is everything after the last '/' of the URL.
            String outPutPath = localPath + resultUrl.substring(resultUrl.lastIndexOf("/") + 1);
            System.out.println("图片路径:" + outPutPath);
            File target = new File(outPutPath);
            File parent = target.getParentFile();
            if (parent != null) {
                parent.mkdirs();
            }
            try (FileOutputStream out = new FileOutputStream(target)) {
                out.write(content);
            }
            System.out.println("下载成功!");
        } catch (IOException | RuntimeException e) {
            // RuntimeException is included so that a bad URL (e.g. non-HTTP scheme, null)
            // is reported as a failed download instead of aborting the whole crawl.
            System.out.println("下载失败!");
            System.err.println(resultUrl + " -> " + e);
        } finally {
            System.out.println("===============================================================");
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Reads a stream to its end and returns the bytes read.
     *
     * <p>The stream is not closed by this method.
     *
     * @param inputStream stream to drain
     * @return all bytes of the stream, or {@code null} if reading failed
     */
    public static byte[] getBytesFromInputStream(InputStream inputStream) {
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] buffer = new byte[8192];
        try {
            int read;
            while ((read = inputStream.read(buffer)) != -1) {
                collected.write(buffer, 0, read);
            }
            return collected.toByteArray();
        } catch (IOException e) {
            System.err.println("read failed: " + e);
            return null;
        }
    }

    /**
     * Fetches the body of a page.
     *
     * @param url page to fetch
     * @return the response body, or an empty string when the request fails or the URL is
     *     rejected
     */
    public static String getHtml(String url) {
        try {
            return Jsoup.connect(url).execute().body();
        } catch (IOException | IllegalArgumentException e) {
            // Best effort: an unreachable or invalid page is reported and treated as empty.
            System.err.println("fetch failed: " + url + " -> " + e);
            return "";
        }
    }

    /**
     * Tells whether the server reports a content length of at least 1 MiB for a resource.
     *
     * <p>Uses a HEAD request, so the body is not transferred. An unknown length counts as
     * too small.
     *
     * @param downloadUrl absolute HTTP(S) URL of the resource
     * @return {@code true} if the reported length is at least 1 MiB, {@code false}
     *     otherwise or when the request cannot be made
     */
    private static boolean fileLengthOut1M(String downloadUrl) {
        HttpURLConnection conn = null;
        try {
            conn = (HttpURLConnection) new URL(downloadUrl).openConnection();
            conn.setRequestMethod("HEAD");
            conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows 7; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36 YNoteCef/5.8.0.1 (Windows)");
            return conn.getContentLengthLong() >= MIN_IMAGE_BYTES;
        } catch (IOException e) {
            System.out.println("获取文件大小失败!");
            return false;
        } finally {
            // conn is still null when the URL itself was malformed.
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Collects the targets of all tags matched by {@code tagPattern} as absolute URLs.
     *
     * <p>Relative targets are resolved against {@code pageUrl}; targets that cannot be
     * turned into a URL are reported and skipped.
     *
     * @param html page content to scan
     * @param pageUrl URL the content was fetched from, used as the resolution base
     * @param tagPattern pattern whose matches start with {@code tagPrefix} and end with
     *     a closing double quote
     * @param tagPrefix literal text preceding the target inside each match
     * @return absolute URLs in document order
     */
    private static List<String> extractTargets(
            String html, String pageUrl, Pattern tagPattern, String tagPrefix) {
        List<String> targets = new ArrayList<>();
        if (html == null || html.isEmpty()) {
            return targets;
        }
        URL base = null;
        try {
            base = new URL(pageUrl);
        } catch (MalformedURLException ignored) {
            // Without a usable base only absolute targets can be resolved below.
        }
        Matcher matcher = tagPattern.matcher(html);
        while (matcher.find()) {
            String tag = matcher.group();
            // Drop the leading tag text and the trailing quote.
            String target = tag.substring(tagPrefix.length(), tag.length() - 1);
            try {
                targets.add(new URL(base, target).toString());
            } catch (MalformedURLException e) {
                System.err.println("skipping unresolvable link: " + target);
            }
        }
        return targets;
    }

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值