Java爬虫历险记 – (1)爬取百度首页的logo
在这篇文章里,介绍两种方式来获取百度网页的logo: (1)Httpclient (2) jsoup + Httpclient ,详细的运行结果可以参看文章末的参考资料。代码使用的.jar包,如下图:
第一种:只使用Httpclient
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
public class TestOne {

    /**
     * Fetches the page at {@code url} via a plain {@link URLConnection} and
     * returns its body as one string (line breaks stripped by readLine).
     *
     * @param url absolute URL to fetch, e.g. "https://www.baidu.com/"
     * @return the page content, or "" if the request failed
     */
    static String sendGet(String url) {
        // StringBuilder avoids the O(n^2) cost of String += in a loop.
        StringBuilder result = new StringBuilder();
        try {
            URL realUrl = new URL(url);
            URLConnection connection = realUrl.openConnection();
            connection.connect();
            // Decode explicitly as UTF-8 (Baidu's encoding) instead of the
            // platform default; try-with-resources closes the reader on every path.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = in.readLine()) != null) {
                    result.append(line);
                }
            }
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Returns the first capture group of {@code patternStr} found in
     * {@code targetStr}, or the literal string "Nothing" when no match exists.
     * (Name kept as-is for compatibility with existing callers.)
     */
    static String RegexString(String targetStr, String patternStr) {
        Matcher matcher = Pattern.compile(patternStr).matcher(targetStr);
        return matcher.find() ? matcher.group(1) : "Nothing";
    }

    /**
     * Downloads the image at {@code url} (scheme-less host/path, "http://" is
     * prepended) and stores it on disk via {@link #download(HttpEntity)}.
     *
     * @param url host/path portion of the image URL, without scheme
     * @return the saved file path, or "" on failure
     */
    public static String get(String url) {
        String filename = "";
        // The regex in main() strips the scheme, so restore it here.
        String targetUrl = "http://" + url;
        // try-with-resources closes response before client (reverse declaration
        // order) — the original closed the client first and leaked on exceptions.
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet(targetUrl);
            try (CloseableHttpResponse response = httpclient.execute(httpGet)) {
                if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                    System.out.println(response.getStatusLine());
                    filename = download(response.getEntity());
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return filename;
    }

    /**
     * Streams the response entity to d:\pic\b_logo.png, creating the directory
     * if needed.
     *
     * @param resEntity HTTP response body; may be null
     * @return the path the file was written to, or null when resEntity is null
     */
    private static String download(HttpEntity resEntity) {
        if (resEntity == null) {
            return null;
        }
        // Target directory and file name; mkdirs also creates missing parents
        // (the original's mkdir silently fails when the parent is absent).
        String dirPath = "d:\\pic\\";
        String fileName = "b_logo.png";
        File dir = new File(dirPath);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        File filePath = new File(dirPath.concat(fileName));
        // FileOutputStream creates the file itself, so no createNewFile needed.
        // try-with-resources closes both streams even when copying throws —
        // the original leaked the output stream on exceptions.
        try (InputStream in = resEntity.getContent();
             BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(filePath))) {
            byte[] bytes = new byte[1024];
            int len;
            while ((len = in.read(bytes)) != -1) {
                out.write(bytes, 0, len);
            }
            out.flush();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return filePath.toString();
    }

    public static void main(String[] args) {
        // The page to scrape.
        String url = "https://www.baidu.com/";
        // Fetch the raw HTML.
        String result = sendGet(url);
        // Extract the logo's protocol-relative src; note the trailing space in
        // the pattern is required to terminate the non-greedy match.
        String imgSrc = RegexString(result, "src=//(.+?) ");
        System.out.println(imgSrc);
        // Download the logo to the local disk.
        get(imgSrc);
    }
}
注意点:
(1)正则表达式匹配:"src=//(.+?) " 在 ) 后面有一个空格,用来终止非贪婪匹配,否则匹配不成功。
(2)要导入 Httpclient 的 .jar 包
(3)在get图片时候,要将其路径补充完整: String tergetUrl = “http://” + url;
第二种:jsoup + Httpclient
import java.io.*;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class TestTwo {

    /** Entry point: locate the Baidu logo URL with jsoup, then download it. */
    public static void main(String args[]) {
        String url = "http://www.baidu.com";
        String result = getBaiduPic(url);
        // The page returns a protocol-relative src ("//www.baidu.com/..."),
        // so prepend the scheme before requesting it.
        String picUrl = "https:" + result;
        downPicture(picUrl);
    }

    /**
     * Loads the page with jsoup and returns the src attribute of the img
     * element inside the element whose id is "lg" (Baidu's logo container).
     *
     * @param url page to parse
     * @return the (protocol-relative) image URL, or "" when not found
     */
    public static String getBaiduPic(String url) {
        String result = "";
        try {
            // Fetch and parse the document in one step.
            Document doc = Jsoup.connect(url).get();
            // Select every element with id="lg" (the logo wrapper).
            Elements logoDivs = doc.getElementsByAttributeValue("id", "lg");
            for (Element element : logoDivs) {
                // Within it, take each <img> and read its src attribute.
                for (Element link : element.getElementsByTag("img")) {
                    result = link.attr("src");
                    System.out.println(result);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Downloads the resource at {@code url} to D://pic//LOGO.jpg,
     * creating the directory if necessary.
     *
     * @param url absolute URL of the image
     */
    public static void downPicture(String url) {
        File dir = new File("D://pic");
        // try-with-resources closes the client on all paths (the original
        // never closed it and relied on httpGet.abort(), which is meant for
        // cancelling requests, not for normal cleanup).
        try (CloseableHttpClient httpClient = HttpClientBuilder.create().build()) {
            HttpGet httpGet = new HttpGet(url);
            HttpResponse response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                // mkdirs also creates missing parents.
                dir.mkdirs();
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    // Closing the content stream also releases the connection.
                    // Copy in 4 KiB chunks — the original read one byte per
                    // call, which is needlessly slow.
                    try (InputStream in = entity.getContent();
                         OutputStream out = new FileOutputStream(new File("D://pic//LOGO.jpg"))) {
                        byte[] buf = new byte[4096];
                        int len;
                        while ((len = in.read(buf)) != -1) {
                            out.write(buf, 0, len);
                        }
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
注意点:
(1)在downPicture()中,java.io.InputStream inputStream = null; 要在 try 块之外声明并初始化为 null,这样 finally 块才能统一关闭由 response.getEntity().getContent() 得到的流。
参考资料:
(1)行走江湖的少侠哥 – 第2节—小任务,爬取百度LOGO链接并下载图片 : http://blog.csdn.net/sinat_32588261/article/details/72287108
(2)Mr_river – Java简单爬虫系列 : https://my.oschina.net/u/2519530/blog/597359
(3)汪海的实验室 – [Java]知乎下巴第1集:爬虫世界百度不仅仅可以拿来测网速 : http://blog.csdn.net/pleasecallmewhy/article/details/17594303
(这个博客可以看下讨论区)