第一个爬虫小程序——java爬虫从静态网页爬取图片(一)

[前言]

前几天因为需要图片做一个小测试,正好也看了一下正则表达式,这个小程序也就相当于是正则表达式的一个"小应用"吧,于是写了一个很基础的爬虫:先通过正则表达式从静态网页源代码里分析出图片链接,然后将图片下载到本地。

[正则表达式]

我在知乎上看到一篇关于正则表达式比较好的文章,在这里把链接贴出来,供大家参考学习.

https://www.zhihu.com/question/48219401/answer/742444326

[输入输出流]

输入输出流不太熟悉的朋友可以参考博客上的一篇关于IO流的文章.

https://blog.csdn.net/hguisu/article/details/7418161

[代码部分]

1.从一个网络连接里面获取网页的源代码,其中charset表示该网页的编码方式,一般是UTF-8或者是GBK;

 //从一个网络链接里面获取源代码
    /**
     * Fetches the source code of a web page as a single string.
     *
     * @param urlStr  the URL of the page to read
     * @param charset the page's character encoding, typically "UTF-8" or "GBK"
     * @return the page source with line separators removed; an empty string on I/O failure
     */
    public static String getURLContent(String urlStr, String charset) {
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader (and the underlying URL stream)
        // on every path — the original leaked it
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(urlStr).openStream(), Charset.forName(charset)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
        } catch (IOException e) {
            // MalformedURLException is a subclass of IOException, so one catch suffices
            e.printStackTrace();
        }
        return sb.toString();
    }

2.对获取的源代码进行分析筛选

/**
 * Downloads a page and extracts the set of candidate image links from its source.
 *
 * @param WebURL  the page URL to scan
 * @param regurx  caller-supplied regex whose group(1) captures a candidate link
 * @param charset the page's character encoding
 * @return distinct link strings, each truncated at its first closing quote
 * @throws IOException kept for interface compatibility with existing callers
 */
public static Set<String> getURLContent2(String WebURL, String regurx, String charset) throws IOException {
    String destStr = getURLContent(WebURL, charset);
    // first pass: collect every group(1) match of the caller's pattern
    List<String> res = new ArrayList<>();
    Matcher m = Pattern.compile(regurx).matcher(destStr);
    while (m.find()) {
        res.add(m.group(1));
    }
    // second pass: keep only the text before the first double quote.
    // Compile the pattern ONCE, not once per candidate as the original did.
    Pattern pp = Pattern.compile("(.+?)\"");
    Set<String> result = new HashSet<>();
    for (String temp : res) {
        Matcher mm = pp.matcher(temp);
        if (mm.find()) {
            result.add(mm.group(1));
        }
    }
    return result;
}

3.从链接里面下载一张图片

/**
 * Downloads a single image from a URL into a local file.
 * URLs that cannot be opened or answer 404 are skipped silently.
 *
 * @param pngAdress   the image URL
 * @param storagePath destination file path on the local disk
 */
public static void downloadJPG(String pngAdress, String storagePath) {
    HttpURLConnection con;
    try {
        URL url = new URL(pngAdress);
        con = (HttpURLConnection) url.openConnection();
        // skip missing resources instead of writing a broken file
        if (con.getResponseCode() == 404) {
            return;
        }
    } catch (Exception e) {
        System.out.println("URL不可用");
        return;
    }
    // try-with-resources closes BOTH streams on every path. The original
    // leaked the input stream on early return and could throw an uncaught
    // NullPointerException in its finally block when the stream was null.
    try (InputStream inputStream = con.getInputStream();
         OutputStream outputStream = new FileOutputStream(new File(storagePath))) {
        byte[] buffer = new byte[1024];
        int n;
        while ((n = inputStream.read(buffer)) != -1) {
            outputStream.write(buffer, 0, n);
        }
        outputStream.flush();
    } catch (IOException e) {
        e.printStackTrace();
    }
}

4.过滤下载,即只下载以图片扩展名(jpg/png/gif)结尾的链接,并在此处填写本地保存文件夹的路径

    /**
     * Scans a page for links and downloads only those ending in an image extension.
     *
     * @param website the page URL to crawl
     * @param regurx  regex whose group(1) captures candidate links
     * @param ch      the page's character encoding
     * @param i       starting index used to number the saved files
     */
    public static void downloadJPG02(String website, String regurx, String ch, int i) {
        Set<String> result;
        try {
            result = getURLContent2(website, regurx, ch);
        } catch (IOException e) {
            e.printStackTrace();
            return; // original fell through with result == null and threw an NPE here
        }
        // BUG FIX: the original "[(.jpg)(.png)(.gif)]$" is a character class that
        // matches ANY single trailing char of "().jpgnif"; this matches extensions.
        Pattern p = Pattern.compile("\\.(jpg|png|gif)$");
        String storagePath = "D:\\crawlerJPG02\\"; // local destination folder
        for (String temp : result) {
            if (p.matcher(temp).find()) {
                downloadJPG(temp, storagePath + "Imag" + i + ".jpg");
                ++i;
            }
        }
    }

5.主函数

 /**
  * Entry point: crawls a Baidu image-search result page for the keyword
  * "熊猫" and saves every matching image link to the local folder.
  */
 public static void main(String[] args) {
     // search-result page; the keyword is appended to the query string
     String base = "https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=";
     // captures quoted http(s) links ending in "jpg"
     String linkPattern = "\"(http[s]?://(.+?jpg)\")";
     downloadJPG02(base + "熊猫", linkPattern, "UTF-8", 1);
 }

6.全部代码

import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


//1.从一个网络连接里面获取网页的源代码,其中charset表示该网页的编码方式,一般是UTF-8或者是GBK;
public class Test {
    /**
     * Fetches the source code of a web page as a single string.
     *
     * @param urlStr  the URL of the page to read
     * @param charset the page's character encoding, typically "UTF-8" or "GBK"
     * @return the page source with line separators removed; an empty string on I/O failure
     */
    public static String getURLContent(String urlStr, String charset) {
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader (and the underlying URL stream)
        // on every path — the original leaked it
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(urlStr).openStream(), Charset.forName(charset)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
        } catch (IOException e) {
            // MalformedURLException is a subclass of IOException, so one catch suffices
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Downloads a page and extracts the set of candidate image links from its source.
     *
     * @param WebURL  the page URL to scan
     * @param regurx  caller-supplied regex whose group(1) captures a candidate link
     * @param charset the page's character encoding
     * @return distinct link strings, each truncated at its first closing quote
     * @throws IOException kept for interface compatibility with existing callers
     */
    public static Set<String> getURLContent2(String WebURL, String regurx, String charset) throws IOException {
        String destStr = getURLContent(WebURL, charset);
        // first pass: collect every group(1) match of the caller's pattern
        List<String> res = new ArrayList<>();
        Matcher m = Pattern.compile(regurx).matcher(destStr);
        while (m.find()) {
            res.add(m.group(1));
        }
        // second pass: keep only the text before the first double quote.
        // Compile the pattern ONCE, not once per candidate as the original did.
        Pattern pp = Pattern.compile("(.+?)\"");
        Set<String> result = new HashSet<>();
        for (String temp : res) {
            Matcher mm = pp.matcher(temp);
            if (mm.find()) {
                result.add(mm.group(1));
            }
        }
        return result;
    }

    /**
     * Downloads a single image from a URL into a local file.
     * URLs that cannot be opened or answer 404 are skipped silently.
     *
     * @param pngAdress   the image URL
     * @param storagePath destination file path on the local disk
     */
    public static void downloadJPG(String pngAdress, String storagePath) {
        HttpURLConnection con;
        try {
            URL url = new URL(pngAdress);
            con = (HttpURLConnection) url.openConnection();
            // skip missing resources instead of writing a broken file
            if (con.getResponseCode() == 404) {
                return;
            }
        } catch (Exception e) {
            System.out.println("URL不可用");
            return;
        }
        // try-with-resources closes BOTH streams on every path. The original
        // leaked the input stream on early return and could throw an uncaught
        // NullPointerException in its finally block when the stream was null.
        try (InputStream inputStream = con.getInputStream();
             OutputStream outputStream = new FileOutputStream(new File(storagePath))) {
            byte[] buffer = new byte[1024];
            int n;
            while ((n = inputStream.read(buffer)) != -1) {
                outputStream.write(buffer, 0, n);
            }
            outputStream.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Scans a page for links and downloads only those ending in an image extension.
     *
     * @param website the page URL to crawl
     * @param regurx  regex whose group(1) captures candidate links
     * @param ch      the page's character encoding
     * @param i       starting index used to number the saved files
     */
    public static void downloadJPG02(String website, String regurx, String ch, int i) {
        Set<String> result;
        try {
            result = getURLContent2(website, regurx, ch);
        } catch (IOException e) {
            e.printStackTrace();
            return; // original fell through with result == null and threw an NPE here
        }
        // BUG FIX: the original "[(.jpg)(.png)(.gif)]$" is a character class that
        // matches ANY single trailing char of "().jpgnif"; this matches extensions.
        Pattern p = Pattern.compile("\\.(jpg|png|gif)$");
        String storagePath = "D:\\crawlerJPG02\\"; // local destination folder
        for (String temp : result) {
            if (p.matcher(temp).find()) {
                downloadJPG(temp, storagePath + "Imag" + i + ".jpg");
                ++i;
            }
        }
    }

    /**
     * Entry point: crawls a Baidu image-search result page for the keyword
     * "熊猫" and saves every matching image link to the local folder.
     */
    public static void main(String[] args) {
        String website = "https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=";
        String regurx = "\"(http[s]?://(.+?jpg)\")";
        String ch = "UTF-8";
        downloadJPG02(website + "熊猫", regurx, ch, 1);
    }
}

7.运行结果

[写在最后]

以上就是全部内容了,仅供大家学习参考,若有不对的地方,请留言或私信指出,谢谢大家.

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值