java爬取美女图片

1. 爬取概述

通过 Java 爬取图片资源：请求搜索接口，解析返回结果得到图片 URL，再批量下载到本地。

2. 实现验证

HttpClientUtils

package com.zrj.unit.reptile;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.security.GeneralSecurityException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * 客户端工具
 *
 * @author zrj
 * @since 2021/9/9
 **/
public class HttpClientUtils {

    public static Map<String, List<String>> convertHeaders(Header[] headers) {
        Map<String, List<String>> results = new HashMap<String, List<String>>();
        for (Header header : headers) {
            List<String> list = results.get( header.getName() );
            if (list == null) {
                list = new ArrayList<String>();
                results.put( header.getName(), list );
            }
            list.add( header.getValue() );
        }
        return results;
    }

    /**
     * http的get请求
     *
     * @param url
     */
    public static String get(String url) {
        return get( url, "UTF-8" );
    }

    /**
     * http的get请求
     *
     * @param url
     */
    public static String get(String url, String charset) {
        HttpGet httpGet = new HttpGet( url );
        return executeRequest( httpGet, charset );
    }

    /**
     * http的get请求,增加异步请求头参数
     *
     * @param url
     */
    public static String ajaxGet(String url) {
        return ajaxGet( url, "UTF-8" );
    }

    /**
     * http的get请求,增加异步请求头参数
     *
     * @param url
     */
    public static String ajaxGet(String url, String charset) {
        HttpGet httpGet = new HttpGet( url );
        httpGet.setHeader( "X-Requested-With", "XMLHttpRequest" );
        return executeRequest( httpGet, charset );
    }

    /**
     * @param url
     * @return
     */
    public static String ajaxGet(CloseableHttpClient httpclient, String url) {
        HttpGet httpGet = new HttpGet( url );
        httpGet.setHeader( "X-Requested-With", "XMLHttpRequest" );
        return executeRequest( httpclient, httpGet, "UTF-8" );
    }

    /**
     * http的post请求,传递map格式参数
     */
    public static String post(String url, Map<String, String> dataMap) {
        return post( url, dataMap, "UTF-8" );
    }

    /**
     * http的post请求,传递map格式参数
     */
    public static String post(String url, Map<String, String> dataMap, String charset) {
        HttpPost httpPost = new HttpPost( url );
        try {
            if (dataMap != null) {
                List<NameValuePair> nvps = new ArrayList<NameValuePair>();
                for (Map.Entry<String, String> entry : dataMap.entrySet()) {
                    nvps.add( new BasicNameValuePair( entry.getKey(), entry.getValue() ) );
                }
                UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity( nvps, charset );
                formEntity.setContentEncoding( charset );
                httpPost.setEntity( formEntity );
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return executeRequest( httpPost, charset );
    }

    /**
     * http的post请求,增加异步请求头参数,传递map格式参数
     */
    public static String ajaxPost(String url, Map<String, String> dataMap) {
        return ajaxPost( url, dataMap, "UTF-8" );
    }

    /**
     * http的post请求,增加异步请求头参数,传递map格式参数
     */
    public static String ajaxPost(String url, Map<String, String> dataMap, String charset) {
        HttpPost httpPost = new HttpPost( url );
        httpPost.setHeader( "X-Requested-With", "XMLHttpRequest" );
        try {
            if (dataMap != null) {
                List<NameValuePair> nvps = new ArrayList<NameValuePair>();
                for (Map.Entry<String, String> entry : dataMap.entrySet()) {
                    nvps.add( new BasicNameValuePair( entry.getKey(), entry.getValue() ) );
                }
                UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity( nvps, charset );
                formEntity.setContentEncoding( charset );
                httpPost.setEntity( formEntity );
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return executeRequest( httpPost, charset );
    }

    /**
     * http的post请求,增加异步请求头参数,传递json格式参数
     */
    public static String ajaxPostJson(String url, String jsonString) {
        return ajaxPostJson( url, jsonString, "UTF-8" );
    }

    /**
     * http的post请求,增加异步请求头参数,传递json格式参数
     */
    public static String ajaxPostJson(String url, String jsonString, String charset) {
        HttpPost httpPost = new HttpPost( url );
        httpPost.setHeader( "X-Requested-With", "XMLHttpRequest" );

        StringEntity stringEntity = new StringEntity( jsonString, charset );// 解决中文乱码问题
        stringEntity.setContentEncoding( charset );
        stringEntity.setContentType( "application/json" );
        httpPost.setEntity( stringEntity );
        return executeRequest( httpPost, charset );
    }

    /**
     * 执行一个http请求,传递HttpGet或HttpPost参数
     */
    public static String executeRequest(HttpUriRequest httpRequest) {
        return executeRequest( httpRequest, "UTF-8" );
    }

    /**
     * 执行一个http请求,传递HttpGet或HttpPost参数
     */
    public static String executeRequest(HttpUriRequest httpRequest, String charset) {
        CloseableHttpClient httpclient;
        if ("https".equals( httpRequest.getURI().getScheme() )) {
            httpclient = createSSLInsecureClient();
        } else {
            httpclient = HttpClients.createDefault();
        }
        String result = "";
        try {
            try {
                CloseableHttpResponse response = httpclient.execute( httpRequest );
                HttpEntity entity = null;
                try {
                    entity = response.getEntity();
                    result = EntityUtils.toString( entity, charset );
                } finally {
                    EntityUtils.consume( entity );
                    response.close();
                }
            } finally {
                httpclient.close();
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        return result;
    }

    public static String executeRequest(CloseableHttpClient httpclient, HttpUriRequest httpRequest, String charset) {
        String result = "";
        try {
            try {
                CloseableHttpResponse response = httpclient.execute( httpRequest );
                HttpEntity entity = null;
                try {
                    entity = response.getEntity();
                    result = EntityUtils.toString( entity, charset );
                } finally {
                    EntityUtils.consume( entity );
                    response.close();
                }
            } finally {
                httpclient.close();
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        return result;
    }

    /**
     * 创建 SSL连接
     */
    public static CloseableHttpClient createSSLInsecureClient() {
        try {
            SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial( new TrustStrategy() {
                @Override
                public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                    return true;
                }
            } ).build();
            SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory( sslContext, new HostnameVerifier() {
                @Override
                public boolean verify(String hostname, SSLSession session) {
                    return true;
                }
            } );
            return HttpClients.custom().setSSLSocketFactory( sslsf ).build();
        } catch (GeneralSecurityException ex) {
            throw new RuntimeException( ex );
        }
    }
}

PipelineImage

package com.zrj.unit.reptile;

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * 图片下载
 *
 * @author zrj
 * @since 2021/9/9
 **/
public class PipelineImage {
    private String extension = ".jpg";
    private String path = "";

    private volatile AtomicInteger suc;
    private volatile AtomicInteger fails;

    public PipelineImage() {
        setPath( "D:/pipeline/sougou" );
        suc = new AtomicInteger();
        fails = new AtomicInteger();
    }

    public PipelineImage(String path) {
        setPath( path );
        suc = new AtomicInteger();
        fails = new AtomicInteger();
    }

    public PipelineImage(String path, String extension) {
        setPath( path );
        this.extension = extension;
        suc = new AtomicInteger();
        fails = new AtomicInteger();
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * 下载
     */
    private void downloadImg(String url, String cate, String name) throws Exception {
        String path = this.path + "/" + cate + "/";
        File dir = new File( path );
        if (!dir.exists()) {    // 目录不存在则创建目录
            dir.mkdirs();
        }
        String realExt = url.substring( url.lastIndexOf( "." ) );   // 获取扩展名
        String fileName = name + realExt;
        fileName = fileName.replace( "-", "" );
        String filePath = path + fileName;
        File img = new File( filePath );
        if (img.exists()) {   // 若文件之前已经下载过,则跳过
            System.out.println( String.format( "文件%s已存在本地目录", fileName ) );
            return;
        }

        URLConnection con = new URL( url ).openConnection();
        con.setConnectTimeout( 5000 );
        con.setReadTimeout( 5000 );
        InputStream inputStream = con.getInputStream();
        byte[] bs = new byte[1024];

        File file = new File( filePath );
        FileOutputStream os = new FileOutputStream( file, true );
        // 开始读取 写入
        int len;
        while ((len = inputStream.read( bs )) != -1) {
            os.write( bs, 0, len );
        }
        System.out.println( "filePath: " + filePath );
        System.out.println( "picUrl: " + url );
        System.out.println( String.format( "正在下载第%s张图片", suc.getAndIncrement() ) );
    }

    /**
     * 单线程处理
     */
    public void process(List<String> data, String word) {
        long start = System.currentTimeMillis();
        for (String picUrl : data) {
            if (picUrl == null) {
                continue;
            }
            try {
                downloadImg( picUrl, word, picUrl );
            } catch (Exception e) {
                fails.incrementAndGet();
            }
        }
        System.out.println( "下载成功: " + suc.get() );
        System.out.println( "下载失败: " + fails.get() );
        long end = System.currentTimeMillis();
        System.out.println( "耗时:" + (end - start) / 1000 + "秒" );
    }

    /**
     * 多线程处理
     */
    public void processSync(List<String> data, String word) {
        long start = System.currentTimeMillis();
        int count = 0;
        ExecutorService executorService = Executors.newCachedThreadPool(); // 创建缓存线程池
        for (int i = 0; i < data.size(); i++) {
            String picUrl = data.get( i );
            if (picUrl == null) {
                continue;
            }
            String name = "";
            if (i < 10) {
                name = "000" + i;
            } else if (i < 100) {
                name = "00" + i;
            } else if (i < 1000) {
                name = "0" + i;
            }
            String finalName = name;
            executorService.execute( () -> {
                try {
                    downloadImg( picUrl, word, finalName );
                } catch (Exception e) {
                    fails.incrementAndGet();
                }
            } );
            count++;
        }
        executorService.shutdown();
        try {
            if (!executorService.awaitTermination( 60, TimeUnit.SECONDS )) {
                // 超时的时候向线程池中所有的线程发出中断(interrupted)。
                // executorService.shutdownNow();
            }
            System.out.println( "AwaitTermination Finished" );
            System.out.println( "下载成功: " + suc );
            System.out.println( "下载失败: " + fails );

            File dir = new File( this.path + "/" + word + "/" );
            int len = Objects.requireNonNull( dir.list() ).length;
            System.out.println( "当前共有文件: " + len );

            long end = System.currentTimeMillis();
            System.out.println( "耗时:" + (end - start) / 1000.0 + "秒" );
        } catch (InterruptedException e) {
            e.printStackTrace();
        }

    }

    /**
     * 多线程分段处理
     */
    public void processSync2(List<String> data, final String word, int threadNum) {
        if (data.size() < threadNum) {
            process( data, word );
        } else {
            ExecutorService executorService = Executors.newCachedThreadPool();
            int num = data.size() / threadNum;    //每段要处理的数量
            for (int i = 0; i < threadNum; i++) {
                int start = i * num;
                int end = (i + 1) * num;
                if (i == threadNum - 1) {
                    end = data.size();
                }
                final List<String> cutList = data.subList( start, end );
                executorService.execute( () -> process( cutList, word ) );
            }
            executorService.shutdown();
        }
    }
}

ReptileProcessor

package com.zrj.unit.reptile;

import com.alibaba.fastjson.JSONObject;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;

/**
 * Queries an image search endpoint page by page, collects the {@code picUrl} of every
 * returned item and hands the collected URLs to a {@link PipelineImage} for download.
 *
 * @author zrj
 * @since 2021/9/9
 **/
public class ReptileProcessor {
    /** Request URL template with three {@code %s} slots: start index, page size, query word. */
    private String url;
    private PipelineImage pipeline;
    /** Every item object returned by the endpoint so far. */
    private List<JSONObject> dataList;
    /** The {@code picUrl} of every item in {@link #dataList}, in the same order. */
    private List<String> urlList;
    /** Search word; also used as the download category. */
    private String word;

    /**
     * @param url  request URL template with three {@code %s} slots (start, size, query)
     * @param word search word
     */
    public ReptileProcessor(String url, String word) {
        this.url = url;
        this.word = word;
        this.pipeline = new PipelineImage();
        this.dataList = new ArrayList<>();
        this.urlList = new ArrayList<>();
    }

    /**
     * Demo entry point: fetches a few result pages and downloads the images they list.
     */
    public static void main(String[] args) {
        String url = "https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s";
        ReptileProcessor processor = new ReptileProcessor( url, "美女" );

        // start index, items per request, total number of items to request
        int start = 0, size = 5, limit = 10;

        for (int i = start; i < start + limit; i += size) {
            processor.process( i, size );
        }

        processor.pipelineData();

    }

    /**
     * Fetches one result page and appends its items and their {@code picUrl} values to the
     * collected lists. A page that is empty or lacks {@code data.items} is skipped with a
     * message on stderr.
     *
     * @param idx  index of the first item to request
     * @param size number of items to request
     */
    public void process(int idx, int size) {
        String requestUrl = String.format( this.url, idx, size, encodeQueryValue( String.valueOf( this.word ) ) );
        String res = HttpClientUtils.get( requestUrl );
        // HttpClientUtils.get yields an empty string when the request failed.
        if (res == null || res.isEmpty()) {
            System.err.println( "Empty response, skipping page: " + requestUrl );
            return;
        }

        JSONObject object = JSONObject.parseObject( res );
        JSONObject data = object == null ? null : object.getJSONObject( "data" );
        List<Object> items = data == null ? null : data.getJSONArray( "items" );
        if (items == null) {
            System.err.println( "Response has no data.items, skipping page: " + requestUrl );
            return;
        }

        for (Object element : items) {
            if (!(element instanceof JSONObject)) {
                continue;
            }
            JSONObject item = (JSONObject) element;
            this.urlList.add( item.getString( "picUrl" ) );
            this.dataList.add( item );
        }
    }

    /**
     * Downloads every URL collected so far, using the search word as the category.
     */
    public void pipelineData() {
        pipeline.processSync( this.urlList, this.word );
    }

    /** Percent-encodes a query-string value as UTF-8 so non-ASCII words and spaces are valid in a URL. */
    private static String encodeQueryValue(String value) {
        try {
            return URLEncoder.encode( value, "UTF-8" );
        } catch (UnsupportedEncodingException e) {
            // UTF-8 is a charset every Java platform is required to support.
            throw new IllegalStateException( "UTF-8 is not supported", e );
        }
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值